{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8208820882088209,
  "eval_steps": 23,
  "global_step": 76,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010801080108010801,
      "grad_norm": 63.828086432644454,
      "learning_rate": 5.333333333333333e-07,
      "loss": 6.498,
      "step": 1
    },
    {
      "epoch": 0.010801080108010801,
      "eval_loss": null,
      "eval_runtime": 122.6901,
      "eval_samples_per_second": 8.77,
      "eval_steps_per_second": 2.193,
      "step": 1
    },
    {
      "epoch": 0.021602160216021602,
      "grad_norm": 66.4128492022695,
      "learning_rate": 1.0666666666666667e-06,
      "loss": 6.5865,
      "step": 2
    },
    {
      "epoch": 0.032403240324032405,
      "grad_norm": 71.37254447564402,
      "learning_rate": 1.6e-06,
      "loss": 6.789,
      "step": 3
    },
    {
      "epoch": 0.043204320432043204,
      "grad_norm": 60.356993414372305,
      "learning_rate": 2.1333333333333334e-06,
      "loss": 6.4573,
      "step": 4
    },
    {
      "epoch": 0.054005400540054004,
      "grad_norm": 55.853845541360975,
      "learning_rate": 2.6666666666666664e-06,
      "loss": 6.6238,
      "step": 5
    },
    {
      "epoch": 0.06480648064806481,
      "grad_norm": 33.76487342816726,
      "learning_rate": 3.2e-06,
      "loss": 5.9981,
      "step": 6
    },
    {
      "epoch": 0.07560756075607561,
      "grad_norm": 23.47380616191989,
      "learning_rate": 3.7333333333333333e-06,
      "loss": 5.721,
      "step": 7
    },
    {
      "epoch": 0.08640864086408641,
      "grad_norm": 19.539382371968657,
      "learning_rate": 4.266666666666667e-06,
      "loss": 5.6131,
      "step": 8
    },
    {
      "epoch": 0.09720972097209721,
      "grad_norm": 14.327743602474902,
      "learning_rate": 4.8e-06,
      "loss": 5.2541,
      "step": 9
    },
    {
      "epoch": 0.10801080108010801,
      "grad_norm": 13.592775716109132,
      "learning_rate": 5.333333333333333e-06,
      "loss": 5.1978,
      "step": 10
    },
    {
      "epoch": 0.1188118811881188,
      "grad_norm": 10.979858971620041,
      "learning_rate": 5.866666666666666e-06,
      "loss": 5.0864,
      "step": 11
    },
    {
      "epoch": 0.12961296129612962,
      "grad_norm": 9.794947757674,
      "learning_rate": 6.4e-06,
      "loss": 4.8428,
      "step": 12
    },
    {
      "epoch": 0.1404140414041404,
      "grad_norm": 9.874078411160985,
      "learning_rate": 6.933333333333334e-06,
      "loss": 4.5415,
      "step": 13
    },
    {
      "epoch": 0.15121512151215122,
      "grad_norm": 9.139382870149774,
      "learning_rate": 7.466666666666667e-06,
      "loss": 4.5053,
      "step": 14
    },
    {
      "epoch": 0.162016201620162,
      "grad_norm": 8.650696439508351,
      "learning_rate": 8e-06,
      "loss": 4.2912,
      "step": 15
    },
    {
      "epoch": 0.17281728172817282,
      "grad_norm": 8.866106038295735,
      "learning_rate": 7.996671197378331e-06,
      "loss": 4.0837,
      "step": 16
    },
    {
      "epoch": 0.18361836183618363,
      "grad_norm": 8.09340750575385,
      "learning_rate": 7.986690329976772e-06,
      "loss": 3.794,
      "step": 17
    },
    {
      "epoch": 0.19441944194419442,
      "grad_norm": 7.969568017047467,
      "learning_rate": 7.97007400996411e-06,
      "loss": 3.7631,
      "step": 18
    },
    {
      "epoch": 0.20522052205220523,
      "grad_norm": 8.622778590137475,
      "learning_rate": 7.946849893565155e-06,
      "loss": 3.6164,
      "step": 19
    },
    {
      "epoch": 0.21602160216021601,
      "grad_norm": 8.935855589318281,
      "learning_rate": 7.917056635029685e-06,
      "loss": 3.4103,
      "step": 20
    },
    {
      "epoch": 0.22682268226822683,
      "grad_norm": 8.294919154259096,
      "learning_rate": 7.880743822296258e-06,
      "loss": 3.1529,
      "step": 21
    },
    {
      "epoch": 0.2376237623762376,
      "grad_norm": 10.069947949166714,
      "learning_rate": 7.837971894457989e-06,
      "loss": 3.1933,
      "step": 22
    },
    {
      "epoch": 0.24842484248424843,
      "grad_norm": 7.613383614891953,
      "learning_rate": 7.78881204116764e-06,
      "loss": 2.8644,
      "step": 23
    },
    {
      "epoch": 0.24842484248424843,
      "eval_loss": null,
      "eval_runtime": 122.7992,
      "eval_samples_per_second": 8.762,
      "eval_steps_per_second": 2.191,
      "step": 23
    },
    {
      "epoch": 0.25922592259225924,
      "grad_norm": 7.683615665676093,
      "learning_rate": 7.733346084149467e-06,
      "loss": 2.7816,
      "step": 24
    },
    {
      "epoch": 0.27002700270027,
      "grad_norm": 6.616788299926916,
      "learning_rate": 7.671666341015038e-06,
      "loss": 2.6587,
      "step": 25
    },
    {
      "epoch": 0.2808280828082808,
      "grad_norm": 8.248800446233975,
      "learning_rate": 7.6038754716096755e-06,
      "loss": 2.5965,
      "step": 26
    },
    {
      "epoch": 0.29162916291629165,
      "grad_norm": 5.768076660552814,
      "learning_rate": 7.5300863071452845e-06,
      "loss": 2.3966,
      "step": 27
    },
    {
      "epoch": 0.30243024302430244,
      "grad_norm": 6.248576515957054,
      "learning_rate": 7.450421662403922e-06,
      "loss": 2.3689,
      "step": 28
    },
    {
      "epoch": 0.3132313231323132,
      "grad_norm": 6.340341378272499,
      "learning_rate": 7.365014131324725e-06,
      "loss": 2.2287,
      "step": 29
    },
    {
      "epoch": 0.324032403240324,
      "grad_norm": 4.698539858663757,
      "learning_rate": 7.274005866314374e-06,
      "loss": 2.1459,
      "step": 30
    },
    {
      "epoch": 0.33483348334833485,
      "grad_norm": 5.352213910449207,
      "learning_rate": 7.17754834164845e-06,
      "loss": 2.1892,
      "step": 31
    },
    {
      "epoch": 0.34563456345634563,
      "grad_norm": 3.8948062761704185,
      "learning_rate": 7.075802101357448e-06,
      "loss": 2.0545,
      "step": 32
    },
    {
      "epoch": 0.3564356435643564,
      "grad_norm": 4.275507225855982,
      "learning_rate": 6.96893649201708e-06,
      "loss": 1.9693,
      "step": 33
    },
    {
      "epoch": 0.36723672367236726,
      "grad_norm": 3.827840441222993,
      "learning_rate": 6.857129380887614e-06,
      "loss": 2.2615,
      "step": 34
    },
    {
      "epoch": 0.37803780378037805,
      "grad_norm": 4.061222798809487,
      "learning_rate": 6.740566859871377e-06,
      "loss": 1.907,
      "step": 35
    },
    {
      "epoch": 0.38883888388838883,
      "grad_norm": 3.764768942445119,
      "learning_rate": 6.619442935781141e-06,
      "loss": 1.7926,
      "step": 36
    },
    {
      "epoch": 0.3996399639963996,
      "grad_norm": 3.6889692023791203,
      "learning_rate": 6.493959207434934e-06,
      "loss": 1.8239,
      "step": 37
    },
    {
      "epoch": 0.41044104410441046,
      "grad_norm": 3.877637135877313,
      "learning_rate": 6.364324530114706e-06,
      "loss": 1.7841,
      "step": 38
    },
    {
      "epoch": 0.42124212421242124,
      "grad_norm": 3.419151921848927,
      "learning_rate": 6.230754667947318e-06,
      "loss": 1.7599,
      "step": 39
    },
    {
      "epoch": 0.43204320432043203,
      "grad_norm": 3.400475978424493,
      "learning_rate": 6.093471934786448e-06,
      "loss": 1.7395,
      "step": 40
    },
    {
      "epoch": 0.44284428442844287,
      "grad_norm": 3.0959631594585924,
      "learning_rate": 5.952704824193125e-06,
      "loss": 1.6571,
      "step": 41
    },
    {
      "epoch": 0.45364536453645365,
      "grad_norm": 2.5886990345494696,
      "learning_rate": 5.808687629130743e-06,
      "loss": 1.6854,
      "step": 42
    },
    {
      "epoch": 0.46444644464446444,
      "grad_norm": 3.764322932438931,
      "learning_rate": 5.661660052007546e-06,
      "loss": 1.627,
      "step": 43
    },
    {
      "epoch": 0.4752475247524752,
      "grad_norm": 2.99531105375008,
      "learning_rate": 5.511866805715626e-06,
      "loss": 1.9103,
      "step": 44
    },
    {
      "epoch": 0.48604860486048607,
      "grad_norm": 3.5132876271811373,
      "learning_rate": 5.359557206330466e-06,
      "loss": 1.572,
      "step": 45
    },
    {
      "epoch": 0.49684968496849685,
      "grad_norm": 2.5665673168109153,
      "learning_rate": 5.2049847581489365e-06,
      "loss": 1.5603,
      "step": 46
    },
    {
      "epoch": 0.49684968496849685,
      "eval_loss": null,
      "eval_runtime": 123.3224,
      "eval_samples_per_second": 8.725,
      "eval_steps_per_second": 2.181,
      "step": 46
    },
    {
      "epoch": 0.5076507650765076,
      "grad_norm": 3.4576040645448205,
      "learning_rate": 5.048406731756408e-06,
      "loss": 1.5601,
      "step": 47
    },
    {
      "epoch": 0.5184518451845185,
      "grad_norm": 2.687807440391295,
      "learning_rate": 4.890083735825257e-06,
      "loss": 1.6058,
      "step": 48
    },
    {
      "epoch": 0.5292529252925292,
      "grad_norm": 2.8487624057382654,
      "learning_rate": 4.730279283357447e-06,
      "loss": 1.5174,
      "step": 49
    },
    {
      "epoch": 0.54005400540054,
      "grad_norm": 2.9197691857946797,
      "learning_rate": 4.569259353093141e-06,
      "loss": 1.5429,
      "step": 50
    },
    {
      "epoch": 0.5508550855085509,
      "grad_norm": 2.7015326813188962,
      "learning_rate": 4.407291946815342e-06,
      "loss": 1.6792,
      "step": 51
    },
    {
      "epoch": 0.5616561656165616,
      "grad_norm": 3.22348351998704,
      "learning_rate": 4.244646643287371e-06,
      "loss": 1.5513,
      "step": 52
    },
    {
      "epoch": 0.5724572457245725,
      "grad_norm": 2.1285589649043466,
      "learning_rate": 4.081594149565622e-06,
      "loss": 1.5162,
      "step": 53
    },
    {
      "epoch": 0.5832583258325833,
      "grad_norm": 2.780214646079723,
      "learning_rate": 3.918405850434379e-06,
      "loss": 1.4872,
      "step": 54
    },
    {
      "epoch": 0.594059405940594,
      "grad_norm": 2.2883370030394503,
      "learning_rate": 3.75535335671263e-06,
      "loss": 1.5223,
      "step": 55
    },
    {
      "epoch": 0.6048604860486049,
      "grad_norm": 2.0052138675618894,
      "learning_rate": 3.5927080531846593e-06,
      "loss": 1.5324,
      "step": 56
    },
    {
      "epoch": 0.6156615661566157,
      "grad_norm": 2.1807110551254403,
      "learning_rate": 3.4307406469068596e-06,
      "loss": 1.5016,
      "step": 57
    },
    {
      "epoch": 0.6264626462646264,
      "grad_norm": 2.105213591481246,
      "learning_rate": 3.2697207166425537e-06,
      "loss": 1.4809,
      "step": 58
    },
    {
      "epoch": 0.6372637263726373,
      "grad_norm": 2.2669612123587544,
      "learning_rate": 3.1099162641747427e-06,
      "loss": 1.4593,
      "step": 59
    },
    {
      "epoch": 0.648064806480648,
      "grad_norm": 2.064860643781488,
      "learning_rate": 2.9515932682435922e-06,
      "loss": 1.4086,
      "step": 60
    },
    {
      "epoch": 0.6588658865886589,
      "grad_norm": 1.8917741415627494,
      "learning_rate": 2.7950152418510636e-06,
      "loss": 1.45,
      "step": 61
    },
    {
      "epoch": 0.6696669666966697,
      "grad_norm": 2.245572260361595,
      "learning_rate": 2.6404427936695337e-06,
      "loss": 1.4905,
      "step": 62
    },
    {
      "epoch": 0.6804680468046804,
      "grad_norm": 2.045925075000629,
      "learning_rate": 2.4881331942843742e-06,
      "loss": 1.4649,
      "step": 63
    },
    {
      "epoch": 0.6912691269126913,
      "grad_norm": 1.877871724293037,
      "learning_rate": 2.3383399479924544e-06,
      "loss": 1.4154,
      "step": 64
    },
    {
      "epoch": 0.7020702070207021,
      "grad_norm": 2.0602410525870822,
      "learning_rate": 2.1913123708692577e-06,
      "loss": 1.4173,
      "step": 65
    },
    {
      "epoch": 0.7128712871287128,
      "grad_norm": 1.8021432076698494,
      "learning_rate": 2.047295175806876e-06,
      "loss": 1.4453,
      "step": 66
    },
    {
      "epoch": 0.7236723672367237,
      "grad_norm": 2.010814723300771,
      "learning_rate": 1.9065280652135524e-06,
      "loss": 1.4403,
      "step": 67
    },
    {
      "epoch": 0.7344734473447345,
      "grad_norm": 1.829550265461561,
      "learning_rate": 1.7692453320526827e-06,
      "loss": 1.4541,
      "step": 68
    },
    {
      "epoch": 0.7452745274527453,
      "grad_norm": 1.8322738616732164,
      "learning_rate": 1.6356754698852942e-06,
      "loss": 1.4255,
      "step": 69
    },
    {
      "epoch": 0.7452745274527453,
      "eval_loss": null,
      "eval_runtime": 122.9964,
      "eval_samples_per_second": 8.748,
      "eval_steps_per_second": 2.187,
      "step": 69
    },
    {
      "epoch": 0.7560756075607561,
      "grad_norm": 1.768581496163203,
      "learning_rate": 1.506040792565066e-06,
      "loss": 1.4073,
      "step": 70
    },
    {
      "epoch": 0.7668766876687669,
      "grad_norm": 1.6601983391555746,
      "learning_rate": 1.38055706421886e-06,
      "loss": 1.3416,
      "step": 71
    },
    {
      "epoch": 0.7776777677767777,
      "grad_norm": 1.5416403691873033,
      "learning_rate": 1.2594331401286233e-06,
      "loss": 1.3795,
      "step": 72
    },
    {
      "epoch": 0.7884788478847885,
      "grad_norm": 1.708242131742048,
      "learning_rate": 1.1428706191123855e-06,
      "loss": 1.4519,
      "step": 73
    },
    {
      "epoch": 0.7992799279927992,
      "grad_norm": 1.58302694826516,
      "learning_rate": 1.0310635079829202e-06,
      "loss": 1.4105,
      "step": 74
    },
    {
      "epoch": 0.8100810081008101,
      "grad_norm": 1.8230575660793402,
      "learning_rate": 9.241978986425513e-07,
      "loss": 1.4175,
      "step": 75
    },
    {
      "epoch": 0.8208820882088209,
      "grad_norm": 1.738630228058889,
      "learning_rate": 8.224516583515493e-07,
      "loss": 1.4058,
      "step": 76
    }
  ],
  "logging_steps": 1,
  "max_steps": 92,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 19,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 43330264104960.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}