{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8208820882088209,
"eval_steps": 23,
"global_step": 76,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010801080108010801,
"grad_norm": 63.828086432644454,
"learning_rate": 5.333333333333333e-07,
"loss": 6.498,
"step": 1
},
{
"epoch": 0.010801080108010801,
"eval_loss": null,
"eval_runtime": 122.6901,
"eval_samples_per_second": 8.77,
"eval_steps_per_second": 2.193,
"step": 1
},
{
"epoch": 0.021602160216021602,
"grad_norm": 66.4128492022695,
"learning_rate": 1.0666666666666667e-06,
"loss": 6.5865,
"step": 2
},
{
"epoch": 0.032403240324032405,
"grad_norm": 71.37254447564402,
"learning_rate": 1.6e-06,
"loss": 6.789,
"step": 3
},
{
"epoch": 0.043204320432043204,
"grad_norm": 60.356993414372305,
"learning_rate": 2.1333333333333334e-06,
"loss": 6.4573,
"step": 4
},
{
"epoch": 0.054005400540054004,
"grad_norm": 55.853845541360975,
"learning_rate": 2.6666666666666664e-06,
"loss": 6.6238,
"step": 5
},
{
"epoch": 0.06480648064806481,
"grad_norm": 33.76487342816726,
"learning_rate": 3.2e-06,
"loss": 5.9981,
"step": 6
},
{
"epoch": 0.07560756075607561,
"grad_norm": 23.47380616191989,
"learning_rate": 3.7333333333333333e-06,
"loss": 5.721,
"step": 7
},
{
"epoch": 0.08640864086408641,
"grad_norm": 19.539382371968657,
"learning_rate": 4.266666666666667e-06,
"loss": 5.6131,
"step": 8
},
{
"epoch": 0.09720972097209721,
"grad_norm": 14.327743602474902,
"learning_rate": 4.8e-06,
"loss": 5.2541,
"step": 9
},
{
"epoch": 0.10801080108010801,
"grad_norm": 13.592775716109132,
"learning_rate": 5.333333333333333e-06,
"loss": 5.1978,
"step": 10
},
{
"epoch": 0.1188118811881188,
"grad_norm": 10.979858971620041,
"learning_rate": 5.866666666666666e-06,
"loss": 5.0864,
"step": 11
},
{
"epoch": 0.12961296129612962,
"grad_norm": 9.794947757674,
"learning_rate": 6.4e-06,
"loss": 4.8428,
"step": 12
},
{
"epoch": 0.1404140414041404,
"grad_norm": 9.874078411160985,
"learning_rate": 6.933333333333334e-06,
"loss": 4.5415,
"step": 13
},
{
"epoch": 0.15121512151215122,
"grad_norm": 9.139382870149774,
"learning_rate": 7.466666666666667e-06,
"loss": 4.5053,
"step": 14
},
{
"epoch": 0.162016201620162,
"grad_norm": 8.650696439508351,
"learning_rate": 8e-06,
"loss": 4.2912,
"step": 15
},
{
"epoch": 0.17281728172817282,
"grad_norm": 8.866106038295735,
"learning_rate": 7.996671197378331e-06,
"loss": 4.0837,
"step": 16
},
{
"epoch": 0.18361836183618363,
"grad_norm": 8.09340750575385,
"learning_rate": 7.986690329976772e-06,
"loss": 3.794,
"step": 17
},
{
"epoch": 0.19441944194419442,
"grad_norm": 7.969568017047467,
"learning_rate": 7.97007400996411e-06,
"loss": 3.7631,
"step": 18
},
{
"epoch": 0.20522052205220523,
"grad_norm": 8.622778590137475,
"learning_rate": 7.946849893565155e-06,
"loss": 3.6164,
"step": 19
},
{
"epoch": 0.21602160216021601,
"grad_norm": 8.935855589318281,
"learning_rate": 7.917056635029685e-06,
"loss": 3.4103,
"step": 20
},
{
"epoch": 0.22682268226822683,
"grad_norm": 8.294919154259096,
"learning_rate": 7.880743822296258e-06,
"loss": 3.1529,
"step": 21
},
{
"epoch": 0.2376237623762376,
"grad_norm": 10.069947949166714,
"learning_rate": 7.837971894457989e-06,
"loss": 3.1933,
"step": 22
},
{
"epoch": 0.24842484248424843,
"grad_norm": 7.613383614891953,
"learning_rate": 7.78881204116764e-06,
"loss": 2.8644,
"step": 23
},
{
"epoch": 0.24842484248424843,
"eval_loss": null,
"eval_runtime": 122.7992,
"eval_samples_per_second": 8.762,
"eval_steps_per_second": 2.191,
"step": 23
},
{
"epoch": 0.25922592259225924,
"grad_norm": 7.683615665676093,
"learning_rate": 7.733346084149467e-06,
"loss": 2.7816,
"step": 24
},
{
"epoch": 0.27002700270027,
"grad_norm": 6.616788299926916,
"learning_rate": 7.671666341015038e-06,
"loss": 2.6587,
"step": 25
},
{
"epoch": 0.2808280828082808,
"grad_norm": 8.248800446233975,
"learning_rate": 7.6038754716096755e-06,
"loss": 2.5965,
"step": 26
},
{
"epoch": 0.29162916291629165,
"grad_norm": 5.768076660552814,
"learning_rate": 7.5300863071452845e-06,
"loss": 2.3966,
"step": 27
},
{
"epoch": 0.30243024302430244,
"grad_norm": 6.248576515957054,
"learning_rate": 7.450421662403922e-06,
"loss": 2.3689,
"step": 28
},
{
"epoch": 0.3132313231323132,
"grad_norm": 6.340341378272499,
"learning_rate": 7.365014131324725e-06,
"loss": 2.2287,
"step": 29
},
{
"epoch": 0.324032403240324,
"grad_norm": 4.698539858663757,
"learning_rate": 7.274005866314374e-06,
"loss": 2.1459,
"step": 30
},
{
"epoch": 0.33483348334833485,
"grad_norm": 5.352213910449207,
"learning_rate": 7.17754834164845e-06,
"loss": 2.1892,
"step": 31
},
{
"epoch": 0.34563456345634563,
"grad_norm": 3.8948062761704185,
"learning_rate": 7.075802101357448e-06,
"loss": 2.0545,
"step": 32
},
{
"epoch": 0.3564356435643564,
"grad_norm": 4.275507225855982,
"learning_rate": 6.96893649201708e-06,
"loss": 1.9693,
"step": 33
},
{
"epoch": 0.36723672367236726,
"grad_norm": 3.827840441222993,
"learning_rate": 6.857129380887614e-06,
"loss": 2.2615,
"step": 34
},
{
"epoch": 0.37803780378037805,
"grad_norm": 4.061222798809487,
"learning_rate": 6.740566859871377e-06,
"loss": 1.907,
"step": 35
},
{
"epoch": 0.38883888388838883,
"grad_norm": 3.764768942445119,
"learning_rate": 6.619442935781141e-06,
"loss": 1.7926,
"step": 36
},
{
"epoch": 0.3996399639963996,
"grad_norm": 3.6889692023791203,
"learning_rate": 6.493959207434934e-06,
"loss": 1.8239,
"step": 37
},
{
"epoch": 0.41044104410441046,
"grad_norm": 3.877637135877313,
"learning_rate": 6.364324530114706e-06,
"loss": 1.7841,
"step": 38
},
{
"epoch": 0.42124212421242124,
"grad_norm": 3.419151921848927,
"learning_rate": 6.230754667947318e-06,
"loss": 1.7599,
"step": 39
},
{
"epoch": 0.43204320432043203,
"grad_norm": 3.400475978424493,
"learning_rate": 6.093471934786448e-06,
"loss": 1.7395,
"step": 40
},
{
"epoch": 0.44284428442844287,
"grad_norm": 3.0959631594585924,
"learning_rate": 5.952704824193125e-06,
"loss": 1.6571,
"step": 41
},
{
"epoch": 0.45364536453645365,
"grad_norm": 2.5886990345494696,
"learning_rate": 5.808687629130743e-06,
"loss": 1.6854,
"step": 42
},
{
"epoch": 0.46444644464446444,
"grad_norm": 3.764322932438931,
"learning_rate": 5.661660052007546e-06,
"loss": 1.627,
"step": 43
},
{
"epoch": 0.4752475247524752,
"grad_norm": 2.99531105375008,
"learning_rate": 5.511866805715626e-06,
"loss": 1.9103,
"step": 44
},
{
"epoch": 0.48604860486048607,
"grad_norm": 3.5132876271811373,
"learning_rate": 5.359557206330466e-06,
"loss": 1.572,
"step": 45
},
{
"epoch": 0.49684968496849685,
"grad_norm": 2.5665673168109153,
"learning_rate": 5.2049847581489365e-06,
"loss": 1.5603,
"step": 46
},
{
"epoch": 0.49684968496849685,
"eval_loss": null,
"eval_runtime": 123.3224,
"eval_samples_per_second": 8.725,
"eval_steps_per_second": 2.181,
"step": 46
},
{
"epoch": 0.5076507650765076,
"grad_norm": 3.4576040645448205,
"learning_rate": 5.048406731756408e-06,
"loss": 1.5601,
"step": 47
},
{
"epoch": 0.5184518451845185,
"grad_norm": 2.687807440391295,
"learning_rate": 4.890083735825257e-06,
"loss": 1.6058,
"step": 48
},
{
"epoch": 0.5292529252925292,
"grad_norm": 2.8487624057382654,
"learning_rate": 4.730279283357447e-06,
"loss": 1.5174,
"step": 49
},
{
"epoch": 0.54005400540054,
"grad_norm": 2.9197691857946797,
"learning_rate": 4.569259353093141e-06,
"loss": 1.5429,
"step": 50
},
{
"epoch": 0.5508550855085509,
"grad_norm": 2.7015326813188962,
"learning_rate": 4.407291946815342e-06,
"loss": 1.6792,
"step": 51
},
{
"epoch": 0.5616561656165616,
"grad_norm": 3.22348351998704,
"learning_rate": 4.244646643287371e-06,
"loss": 1.5513,
"step": 52
},
{
"epoch": 0.5724572457245725,
"grad_norm": 2.1285589649043466,
"learning_rate": 4.081594149565622e-06,
"loss": 1.5162,
"step": 53
},
{
"epoch": 0.5832583258325833,
"grad_norm": 2.780214646079723,
"learning_rate": 3.918405850434379e-06,
"loss": 1.4872,
"step": 54
},
{
"epoch": 0.594059405940594,
"grad_norm": 2.2883370030394503,
"learning_rate": 3.75535335671263e-06,
"loss": 1.5223,
"step": 55
},
{
"epoch": 0.6048604860486049,
"grad_norm": 2.0052138675618894,
"learning_rate": 3.5927080531846593e-06,
"loss": 1.5324,
"step": 56
},
{
"epoch": 0.6156615661566157,
"grad_norm": 2.1807110551254403,
"learning_rate": 3.4307406469068596e-06,
"loss": 1.5016,
"step": 57
},
{
"epoch": 0.6264626462646264,
"grad_norm": 2.105213591481246,
"learning_rate": 3.2697207166425537e-06,
"loss": 1.4809,
"step": 58
},
{
"epoch": 0.6372637263726373,
"grad_norm": 2.2669612123587544,
"learning_rate": 3.1099162641747427e-06,
"loss": 1.4593,
"step": 59
},
{
"epoch": 0.648064806480648,
"grad_norm": 2.064860643781488,
"learning_rate": 2.9515932682435922e-06,
"loss": 1.4086,
"step": 60
},
{
"epoch": 0.6588658865886589,
"grad_norm": 1.8917741415627494,
"learning_rate": 2.7950152418510636e-06,
"loss": 1.45,
"step": 61
},
{
"epoch": 0.6696669666966697,
"grad_norm": 2.245572260361595,
"learning_rate": 2.6404427936695337e-06,
"loss": 1.4905,
"step": 62
},
{
"epoch": 0.6804680468046804,
"grad_norm": 2.045925075000629,
"learning_rate": 2.4881331942843742e-06,
"loss": 1.4649,
"step": 63
},
{
"epoch": 0.6912691269126913,
"grad_norm": 1.877871724293037,
"learning_rate": 2.3383399479924544e-06,
"loss": 1.4154,
"step": 64
},
{
"epoch": 0.7020702070207021,
"grad_norm": 2.0602410525870822,
"learning_rate": 2.1913123708692577e-06,
"loss": 1.4173,
"step": 65
},
{
"epoch": 0.7128712871287128,
"grad_norm": 1.8021432076698494,
"learning_rate": 2.047295175806876e-06,
"loss": 1.4453,
"step": 66
},
{
"epoch": 0.7236723672367237,
"grad_norm": 2.010814723300771,
"learning_rate": 1.9065280652135524e-06,
"loss": 1.4403,
"step": 67
},
{
"epoch": 0.7344734473447345,
"grad_norm": 1.829550265461561,
"learning_rate": 1.7692453320526827e-06,
"loss": 1.4541,
"step": 68
},
{
"epoch": 0.7452745274527453,
"grad_norm": 1.8322738616732164,
"learning_rate": 1.6356754698852942e-06,
"loss": 1.4255,
"step": 69
},
{
"epoch": 0.7452745274527453,
"eval_loss": null,
"eval_runtime": 122.9964,
"eval_samples_per_second": 8.748,
"eval_steps_per_second": 2.187,
"step": 69
},
{
"epoch": 0.7560756075607561,
"grad_norm": 1.768581496163203,
"learning_rate": 1.506040792565066e-06,
"loss": 1.4073,
"step": 70
},
{
"epoch": 0.7668766876687669,
"grad_norm": 1.6601983391555746,
"learning_rate": 1.38055706421886e-06,
"loss": 1.3416,
"step": 71
},
{
"epoch": 0.7776777677767777,
"grad_norm": 1.5416403691873033,
"learning_rate": 1.2594331401286233e-06,
"loss": 1.3795,
"step": 72
},
{
"epoch": 0.7884788478847885,
"grad_norm": 1.708242131742048,
"learning_rate": 1.1428706191123855e-06,
"loss": 1.4519,
"step": 73
},
{
"epoch": 0.7992799279927992,
"grad_norm": 1.58302694826516,
"learning_rate": 1.0310635079829202e-06,
"loss": 1.4105,
"step": 74
},
{
"epoch": 0.8100810081008101,
"grad_norm": 1.8230575660793402,
"learning_rate": 9.241978986425513e-07,
"loss": 1.4175,
"step": 75
},
{
"epoch": 0.8208820882088209,
"grad_norm": 1.738630228058889,
"learning_rate": 8.224516583515493e-07,
"loss": 1.4058,
"step": 76
}
],
"logging_steps": 1,
"max_steps": 92,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 19,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 43330264104960.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}