{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.0,
"eval_steps": 500,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.038461538461538464,
"grad_norm": 3.6227714421853245,
"learning_rate": 4.9999188892913205e-06,
"loss": 0.9247,
"step": 1
},
{
"epoch": 0.07692307692307693,
"grad_norm": 2.33743873241245,
"learning_rate": 4.999675562428437e-06,
"loss": 0.8839,
"step": 2
},
{
"epoch": 0.11538461538461539,
"grad_norm": 1.6117080535089503,
"learning_rate": 4.999270035200483e-06,
"loss": 0.8349,
"step": 3
},
{
"epoch": 0.15384615384615385,
"grad_norm": 1.5518021106969933,
"learning_rate": 4.998702333921538e-06,
"loss": 0.8214,
"step": 4
},
{
"epoch": 0.19230769230769232,
"grad_norm": 1.3811790146555043,
"learning_rate": 4.997972495428924e-06,
"loss": 0.844,
"step": 5
},
{
"epoch": 0.23076923076923078,
"grad_norm": 1.310050810479314,
"learning_rate": 4.9970805670808174e-06,
"loss": 0.8319,
"step": 6
},
{
"epoch": 0.2692307692307692,
"grad_norm": 1.2786534191408507,
"learning_rate": 4.996026606753167e-06,
"loss": 0.8674,
"step": 7
},
{
"epoch": 0.3076923076923077,
"grad_norm": 1.3148686759764385,
"learning_rate": 4.994810682835951e-06,
"loss": 0.7042,
"step": 8
},
{
"epoch": 0.34615384615384615,
"grad_norm": 1.090285225955643,
"learning_rate": 4.9934328742287285e-06,
"loss": 0.7071,
"step": 9
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.907111664124928,
"learning_rate": 4.991893270335526e-06,
"loss": 0.8748,
"step": 10
},
{
"epoch": 0.4230769230769231,
"grad_norm": 0.6642504831184634,
"learning_rate": 4.990191971059033e-06,
"loss": 0.7435,
"step": 11
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.6678557889351041,
"learning_rate": 4.988329086794122e-06,
"loss": 0.6785,
"step": 12
},
{
"epoch": 0.5,
"grad_norm": 0.701381067554106,
"learning_rate": 4.986304738420684e-06,
"loss": 0.6449,
"step": 13
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.6790086078752827,
"learning_rate": 4.984119057295783e-06,
"loss": 0.7425,
"step": 14
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.6905224183064627,
"learning_rate": 4.981772185245135e-06,
"loss": 0.7057,
"step": 15
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.7984130598075898,
"learning_rate": 4.979264274553906e-06,
"loss": 0.8067,
"step": 16
},
{
"epoch": 0.6538461538461539,
"grad_norm": 0.6497176581551809,
"learning_rate": 4.976595487956824e-06,
"loss": 0.7255,
"step": 17
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.6629822563364289,
"learning_rate": 4.973765998627628e-06,
"loss": 0.798,
"step": 18
},
{
"epoch": 0.7307692307692307,
"grad_norm": 0.6132162810922874,
"learning_rate": 4.970775990167826e-06,
"loss": 0.632,
"step": 19
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.5554781405012508,
"learning_rate": 4.967625656594782e-06,
"loss": 0.6248,
"step": 20
},
{
"epoch": 0.8076923076923077,
"grad_norm": 0.6049571126552993,
"learning_rate": 4.964315202329127e-06,
"loss": 0.8391,
"step": 21
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.5430982822611681,
"learning_rate": 4.9608448421814944e-06,
"loss": 0.7089,
"step": 22
},
{
"epoch": 0.8846153846153846,
"grad_norm": 0.48739605520739077,
"learning_rate": 4.9572148013385815e-06,
"loss": 0.8009,
"step": 23
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.45997941430542894,
"learning_rate": 4.953425315348534e-06,
"loss": 0.583,
"step": 24
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.47230353827776894,
"learning_rate": 4.94947663010567e-06,
"loss": 0.5849,
"step": 25
},
{
"epoch": 1.0,
"grad_norm": 0.47082502311551994,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.7537,
"step": 26
},
{
"epoch": 1.0384615384615385,
"grad_norm": 0.46211533472097527,
"learning_rate": 4.941102697073181e-06,
"loss": 0.5991,
"step": 27
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.46162874786523983,
"learning_rate": 4.9366779926560705e-06,
"loss": 0.7157,
"step": 28
},
{
"epoch": 1.1153846153846154,
"grad_norm": 0.43803956195742527,
"learning_rate": 4.932095175695911e-06,
"loss": 0.7947,
"step": 29
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.43236553483657136,
"learning_rate": 4.927354543565131e-06,
"loss": 0.5883,
"step": 30
},
{
"epoch": 1.1923076923076923,
"grad_norm": 0.40276363608769367,
"learning_rate": 4.922456403876552e-06,
"loss": 0.6482,
"step": 31
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.4722131511905768,
"learning_rate": 4.917401074463441e-06,
"loss": 0.72,
"step": 32
},
{
"epoch": 1.2692307692307692,
"grad_norm": 0.4248174640747636,
"learning_rate": 4.912188883358879e-06,
"loss": 0.7177,
"step": 33
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.42868915417283154,
"learning_rate": 4.9068201687744774e-06,
"loss": 0.602,
"step": 34
},
{
"epoch": 1.3461538461538463,
"grad_norm": 0.42437464257989843,
"learning_rate": 4.901295279078431e-06,
"loss": 0.684,
"step": 35
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.4148057850825471,
"learning_rate": 4.895614572772916e-06,
"loss": 0.5163,
"step": 36
},
{
"epoch": 1.4230769230769231,
"grad_norm": 0.441534358834555,
"learning_rate": 4.889778418470823e-06,
"loss": 0.5569,
"step": 37
},
{
"epoch": 1.4615384615384617,
"grad_norm": 0.41185704733550094,
"learning_rate": 4.883787194871841e-06,
"loss": 0.5279,
"step": 38
},
{
"epoch": 1.5,
"grad_norm": 0.4257014048854468,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.6732,
"step": 39
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.4609674066796317,
"learning_rate": 4.8713411048678635e-06,
"loss": 0.68,
"step": 40
},
{
"epoch": 1.5769230769230769,
"grad_norm": 0.4027959547024447,
"learning_rate": 4.864887046071814e-06,
"loss": 0.7983,
"step": 41
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.4569844209382049,
"learning_rate": 4.858279533144358e-06,
"loss": 0.4545,
"step": 42
},
{
"epoch": 1.6538461538461537,
"grad_norm": 0.4257095872017722,
"learning_rate": 4.851518994837544e-06,
"loss": 0.6318,
"step": 43
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.40879419445400555,
"learning_rate": 4.844605869833011e-06,
"loss": 0.5548,
"step": 44
},
{
"epoch": 1.7307692307692308,
"grad_norm": 0.43599839467908014,
"learning_rate": 4.837540606713538e-06,
"loss": 0.519,
"step": 45
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.445760123696135,
"learning_rate": 4.83032366393392e-06,
"loss": 0.6377,
"step": 46
},
{
"epoch": 1.8076923076923077,
"grad_norm": 0.4456381082157439,
"learning_rate": 4.8229555097912335e-06,
"loss": 0.7013,
"step": 47
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.40541134413006163,
"learning_rate": 4.815436622394442e-06,
"loss": 0.5183,
"step": 48
},
{
"epoch": 1.8846153846153846,
"grad_norm": 0.44406611064318774,
"learning_rate": 4.807767489633372e-06,
"loss": 0.8679,
"step": 49
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.48978477026333694,
"learning_rate": 4.799948609147061e-06,
"loss": 0.5619,
"step": 50
},
{
"epoch": 1.9615384615384617,
"grad_norm": 0.42927653335100224,
"learning_rate": 4.791980488291457e-06,
"loss": 0.5482,
"step": 51
},
{
"epoch": 2.0,
"grad_norm": 0.3993118925497605,
"learning_rate": 4.783863644106502e-06,
"loss": 0.5905,
"step": 52
},
{
"epoch": 2.0384615384615383,
"grad_norm": 0.45533421207861624,
"learning_rate": 4.775598603282587e-06,
"loss": 0.7045,
"step": 53
},
{
"epoch": 2.076923076923077,
"grad_norm": 0.4309081724975481,
"learning_rate": 4.7671859021263635e-06,
"loss": 0.5227,
"step": 54
},
{
"epoch": 2.1153846153846154,
"grad_norm": 0.4300292912476653,
"learning_rate": 4.758626086525956e-06,
"loss": 0.5115,
"step": 55
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.38059576443400334,
"learning_rate": 4.749919711915531e-06,
"loss": 0.4959,
"step": 56
},
{
"epoch": 2.1923076923076925,
"grad_norm": 0.4126674320822358,
"learning_rate": 4.74106734323926e-06,
"loss": 0.6265,
"step": 57
},
{
"epoch": 2.230769230769231,
"grad_norm": 0.4671745147500316,
"learning_rate": 4.73206955491466e-06,
"loss": 0.5165,
"step": 58
},
{
"epoch": 2.269230769230769,
"grad_norm": 0.41618486395615545,
"learning_rate": 4.7229269307953235e-06,
"loss": 0.5657,
"step": 59
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.43433496329970006,
"learning_rate": 4.7136400641330245e-06,
"loss": 0.666,
"step": 60
},
{
"epoch": 2.3461538461538463,
"grad_norm": 0.39780973220100413,
"learning_rate": 4.704209557539235e-06,
"loss": 0.5704,
"step": 61
},
{
"epoch": 2.3846153846153846,
"grad_norm": 0.4048550114338101,
"learning_rate": 4.694636022946012e-06,
"loss": 0.5673,
"step": 62
},
{
"epoch": 2.423076923076923,
"grad_norm": 0.385995975226885,
"learning_rate": 4.684920081566295e-06,
"loss": 0.5959,
"step": 63
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.3949623073886239,
"learning_rate": 4.675062363853599e-06,
"loss": 0.6107,
"step": 64
},
{
"epoch": 2.5,
"grad_norm": 0.4330059721657707,
"learning_rate": 4.665063509461098e-06,
"loss": 0.5281,
"step": 65
},
{
"epoch": 2.5384615384615383,
"grad_norm": 0.44446090274862377,
"learning_rate": 4.654924167200124e-06,
"loss": 0.5839,
"step": 66
},
{
"epoch": 2.5769230769230766,
"grad_norm": 0.4450780587127078,
"learning_rate": 4.6446449949980665e-06,
"loss": 0.5034,
"step": 67
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.43555773421037586,
"learning_rate": 4.634226659855681e-06,
"loss": 0.62,
"step": 68
},
{
"epoch": 2.6538461538461537,
"grad_norm": 0.44535610498714195,
"learning_rate": 4.623669837803803e-06,
"loss": 0.5418,
"step": 69
},
{
"epoch": 2.6923076923076925,
"grad_norm": 0.4266516883259732,
"learning_rate": 4.612975213859487e-06,
"loss": 0.6767,
"step": 70
},
{
"epoch": 2.730769230769231,
"grad_norm": 0.4170049551133152,
"learning_rate": 4.602143481981556e-06,
"loss": 0.6701,
"step": 71
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.43191999973483397,
"learning_rate": 4.591175345025567e-06,
"loss": 0.5555,
"step": 72
},
{
"epoch": 2.8076923076923075,
"grad_norm": 0.4361492711034067,
"learning_rate": 4.580071514698211e-06,
"loss": 0.6593,
"step": 73
},
{
"epoch": 2.8461538461538463,
"grad_norm": 0.39471819539317293,
"learning_rate": 4.568832711511125e-06,
"loss": 0.5535,
"step": 74
},
{
"epoch": 2.8846153846153846,
"grad_norm": 0.38086387704457475,
"learning_rate": 4.5574596647341414e-06,
"loss": 0.4545,
"step": 75
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.45356707180004513,
"learning_rate": 4.545953112347967e-06,
"loss": 0.6251,
"step": 76
},
{
"epoch": 2.9615384615384617,
"grad_norm": 0.4254117893636373,
"learning_rate": 4.5343138009963e-06,
"loss": 0.5367,
"step": 77
},
{
"epoch": 3.0,
"grad_norm": 0.3871428196474084,
"learning_rate": 4.522542485937369e-06,
"loss": 0.4539,
"step": 78
},
{
"epoch": 3.0384615384615383,
"grad_norm": 0.42671046348757163,
"learning_rate": 4.510639930994942e-06,
"loss": 0.513,
"step": 79
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.46427278945358347,
"learning_rate": 4.498606908508754e-06,
"loss": 0.3476,
"step": 80
},
{
"epoch": 3.1153846153846154,
"grad_norm": 0.3755914435180136,
"learning_rate": 4.486444199284386e-06,
"loss": 0.3924,
"step": 81
},
{
"epoch": 3.1538461538461537,
"grad_norm": 0.46176939111709947,
"learning_rate": 4.474152592542613e-06,
"loss": 0.7687,
"step": 82
},
{
"epoch": 3.1923076923076925,
"grad_norm": 0.5272804561176313,
"learning_rate": 4.4617328858681806e-06,
"loss": 0.5281,
"step": 83
},
{
"epoch": 3.230769230769231,
"grad_norm": 0.3948130729702814,
"learning_rate": 4.449185885158056e-06,
"loss": 0.5106,
"step": 84
},
{
"epoch": 3.269230769230769,
"grad_norm": 0.4087463332644831,
"learning_rate": 4.436512404569136e-06,
"loss": 0.4434,
"step": 85
},
{
"epoch": 3.3076923076923075,
"grad_norm": 0.5269799431074561,
"learning_rate": 4.423713266465415e-06,
"loss": 0.3944,
"step": 86
},
{
"epoch": 3.3461538461538463,
"grad_norm": 0.461938868726041,
"learning_rate": 4.410789301364621e-06,
"loss": 0.5757,
"step": 87
},
{
"epoch": 3.3846153846153846,
"grad_norm": 0.46622397789610276,
"learning_rate": 4.397741347884329e-06,
"loss": 0.6648,
"step": 88
},
{
"epoch": 3.423076923076923,
"grad_norm": 0.43959089723440037,
"learning_rate": 4.384570252687542e-06,
"loss": 0.6123,
"step": 89
},
{
"epoch": 3.4615384615384617,
"grad_norm": 0.4651240638848072,
"learning_rate": 4.3712768704277535e-06,
"loss": 0.511,
"step": 90
},
{
"epoch": 3.5,
"grad_norm": 0.44262354534992776,
"learning_rate": 4.357862063693486e-06,
"loss": 0.4815,
"step": 91
},
{
"epoch": 3.5384615384615383,
"grad_norm": 0.39275121653018796,
"learning_rate": 4.3443267029523265e-06,
"loss": 0.6761,
"step": 92
},
{
"epoch": 3.5769230769230766,
"grad_norm": 0.39829066035096183,
"learning_rate": 4.3306716664944345e-06,
"loss": 0.66,
"step": 93
},
{
"epoch": 3.6153846153846154,
"grad_norm": 0.3972752441251099,
"learning_rate": 4.316897840375558e-06,
"loss": 0.3552,
"step": 94
},
{
"epoch": 3.6538461538461537,
"grad_norm": 0.43087508212959164,
"learning_rate": 4.303006118359536e-06,
"loss": 0.3577,
"step": 95
},
{
"epoch": 3.6923076923076925,
"grad_norm": 0.37367031545629315,
"learning_rate": 4.288997401860303e-06,
"loss": 0.4744,
"step": 96
},
{
"epoch": 3.730769230769231,
"grad_norm": 0.4908959204559879,
"learning_rate": 4.274872599883396e-06,
"loss": 0.8344,
"step": 97
},
{
"epoch": 3.769230769230769,
"grad_norm": 0.479031181871949,
"learning_rate": 4.260632628966974e-06,
"loss": 0.444,
"step": 98
},
{
"epoch": 3.8076923076923075,
"grad_norm": 0.42816486401058296,
"learning_rate": 4.246278413122344e-06,
"loss": 0.4751,
"step": 99
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.4441229834123768,
"learning_rate": 4.231810883773999e-06,
"loss": 0.6354,
"step": 100
},
{
"epoch": 3.8846153846153846,
"grad_norm": 0.5505544877136986,
"learning_rate": 4.217230979699188e-06,
"loss": 0.404,
"step": 101
},
{
"epoch": 3.9230769230769234,
"grad_norm": 0.428310468582681,
"learning_rate": 4.202539646966993e-06,
"loss": 0.4905,
"step": 102
},
{
"epoch": 3.9615384615384617,
"grad_norm": 0.36741931493755814,
"learning_rate": 4.187737838876941e-06,
"loss": 0.4986,
"step": 103
},
{
"epoch": 4.0,
"grad_norm": 0.3765611133789381,
"learning_rate": 4.172826515897146e-06,
"loss": 0.4131,
"step": 104
},
{
"epoch": 4.038461538461538,
"grad_norm": 0.43538159963075335,
"learning_rate": 4.1578066456019885e-06,
"loss": 0.5491,
"step": 105
},
{
"epoch": 4.076923076923077,
"grad_norm": 0.42184253496128765,
"learning_rate": 4.1426792026093274e-06,
"loss": 0.5411,
"step": 106
},
{
"epoch": 4.115384615384615,
"grad_norm": 0.40484010942448745,
"learning_rate": 4.12744516851726e-06,
"loss": 0.3895,
"step": 107
},
{
"epoch": 4.153846153846154,
"grad_norm": 0.35000742524077344,
"learning_rate": 4.112105531840427e-06,
"loss": 0.548,
"step": 108
},
{
"epoch": 4.1923076923076925,
"grad_norm": 0.45752349995263586,
"learning_rate": 4.09666128794587e-06,
"loss": 0.4775,
"step": 109
},
{
"epoch": 4.230769230769231,
"grad_norm": 0.40329722163324117,
"learning_rate": 4.081113438988443e-06,
"loss": 0.2606,
"step": 110
},
{
"epoch": 4.269230769230769,
"grad_norm": 0.3842172424364366,
"learning_rate": 4.065462993845785e-06,
"loss": 0.3961,
"step": 111
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.41497732883921246,
"learning_rate": 4.049710968052851e-06,
"loss": 0.4113,
"step": 112
},
{
"epoch": 4.346153846153846,
"grad_norm": 0.39255048207938287,
"learning_rate": 4.0338583837360225e-06,
"loss": 0.4558,
"step": 113
},
{
"epoch": 4.384615384615385,
"grad_norm": 0.42620728115181244,
"learning_rate": 4.017906269546778e-06,
"loss": 0.3251,
"step": 114
},
{
"epoch": 4.423076923076923,
"grad_norm": 0.5169226176212537,
"learning_rate": 4.001855660594948e-06,
"loss": 0.4581,
"step": 115
},
{
"epoch": 4.461538461538462,
"grad_norm": 0.40368787971042785,
"learning_rate": 3.985707598381544e-06,
"loss": 0.4953,
"step": 116
},
{
"epoch": 4.5,
"grad_norm": 0.5423983145992816,
"learning_rate": 3.969463130731183e-06,
"loss": 0.6845,
"step": 117
},
{
"epoch": 4.538461538461538,
"grad_norm": 0.46715408940216974,
"learning_rate": 3.953123311724092e-06,
"loss": 0.5063,
"step": 118
},
{
"epoch": 4.576923076923077,
"grad_norm": 0.4361497261141325,
"learning_rate": 3.93668920162771e-06,
"loss": 0.4339,
"step": 119
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.402405096617437,
"learning_rate": 3.92016186682789e-06,
"loss": 0.4568,
"step": 120
},
{
"epoch": 4.653846153846154,
"grad_norm": 0.47183080337248506,
"learning_rate": 3.903542379759703e-06,
"loss": 0.5403,
"step": 121
},
{
"epoch": 4.6923076923076925,
"grad_norm": 0.43474922930243004,
"learning_rate": 3.8868318188378475e-06,
"loss": 0.4105,
"step": 122
},
{
"epoch": 4.730769230769231,
"grad_norm": 0.43857419939588704,
"learning_rate": 3.870031268386676e-06,
"loss": 0.438,
"step": 123
},
{
"epoch": 4.769230769230769,
"grad_norm": 0.537974523699989,
"learning_rate": 3.853141818569829e-06,
"loss": 0.4117,
"step": 124
},
{
"epoch": 4.8076923076923075,
"grad_norm": 0.3969885457122397,
"learning_rate": 3.836164565319503e-06,
"loss": 0.543,
"step": 125
},
{
"epoch": 4.846153846153846,
"grad_norm": 0.36690332072305304,
"learning_rate": 3.819100610265332e-06,
"loss": 0.5541,
"step": 126
},
{
"epoch": 4.884615384615385,
"grad_norm": 0.5119390431012444,
"learning_rate": 3.8019510606629063e-06,
"loss": 0.5643,
"step": 127
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.4592095946696792,
"learning_rate": 3.7847170293219223e-06,
"loss": 0.3905,
"step": 128
},
{
"epoch": 4.961538461538462,
"grad_norm": 0.35650580989737807,
"learning_rate": 3.767399634533976e-06,
"loss": 0.5302,
"step": 129
},
{
"epoch": 5.0,
"grad_norm": 0.3754488505326029,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.3909,
"step": 130
},
{
"epoch": 5.038461538461538,
"grad_norm": 0.41646937067352735,
"learning_rate": 3.732519254757344e-06,
"loss": 0.5944,
"step": 131
},
{
"epoch": 5.076923076923077,
"grad_norm": 0.40757576595922934,
"learning_rate": 3.714958533106515e-06,
"loss": 0.4101,
"step": 132
},
{
"epoch": 5.115384615384615,
"grad_norm": 0.44167101044285684,
"learning_rate": 3.6973189745375772e-06,
"loss": 0.605,
"step": 133
},
{
"epoch": 5.153846153846154,
"grad_norm": 0.8949596690000747,
"learning_rate": 3.679601723656205e-06,
"loss": 0.377,
"step": 134
},
{
"epoch": 5.1923076923076925,
"grad_norm": 0.4894480201337239,
"learning_rate": 3.661807930109422e-06,
"loss": 0.4202,
"step": 135
},
{
"epoch": 5.230769230769231,
"grad_norm": 0.48711144702694403,
"learning_rate": 3.643938748510989e-06,
"loss": 0.4178,
"step": 136
},
{
"epoch": 5.269230769230769,
"grad_norm": 0.40010645794899213,
"learning_rate": 3.6259953383664924e-06,
"loss": 0.5813,
"step": 137
},
{
"epoch": 5.3076923076923075,
"grad_norm": 0.46290191921273705,
"learning_rate": 3.607978863998104e-06,
"loss": 0.3735,
"step": 138
},
{
"epoch": 5.346153846153846,
"grad_norm": 0.4572988531725457,
"learning_rate": 3.5898904944690256e-06,
"loss": 0.2532,
"step": 139
},
{
"epoch": 5.384615384615385,
"grad_norm": 0.3937342666166495,
"learning_rate": 3.5717314035076355e-06,
"loss": 0.4399,
"step": 140
},
{
"epoch": 5.423076923076923,
"grad_norm": 0.5865553694046071,
"learning_rate": 3.5535027694313233e-06,
"loss": 0.4118,
"step": 141
},
{
"epoch": 5.461538461538462,
"grad_norm": 0.44868948609399,
"learning_rate": 3.53520577507003e-06,
"loss": 0.5487,
"step": 142
},
{
"epoch": 5.5,
"grad_norm": 0.46918871672194645,
"learning_rate": 3.516841607689501e-06,
"loss": 0.3151,
"step": 143
},
{
"epoch": 5.538461538461538,
"grad_norm": 0.4103213544946049,
"learning_rate": 3.4984114589142388e-06,
"loss": 0.3456,
"step": 144
},
{
"epoch": 5.576923076923077,
"grad_norm": 0.4269140275443733,
"learning_rate": 3.479916524650188e-06,
"loss": 0.3889,
"step": 145
},
{
"epoch": 5.615384615384615,
"grad_norm": 0.4390888322307832,
"learning_rate": 3.461358005007128e-06,
"loss": 0.3875,
"step": 146
},
{
"epoch": 5.653846153846154,
"grad_norm": 0.4836520353054927,
"learning_rate": 3.442737104220801e-06,
"loss": 0.4952,
"step": 147
},
{
"epoch": 5.6923076923076925,
"grad_norm": 0.44749171068850996,
"learning_rate": 3.4240550305747776e-06,
"loss": 0.2631,
"step": 148
},
{
"epoch": 5.730769230769231,
"grad_norm": 0.4397924186863106,
"learning_rate": 3.4053129963220423e-06,
"loss": 0.4175,
"step": 149
},
{
"epoch": 5.769230769230769,
"grad_norm": 0.44733865616762203,
"learning_rate": 3.386512217606339e-06,
"loss": 0.2536,
"step": 150
},
{
"epoch": 5.8076923076923075,
"grad_norm": 0.4416324576002063,
"learning_rate": 3.3676539143832577e-06,
"loss": 0.4575,
"step": 151
},
{
"epoch": 5.846153846153846,
"grad_norm": 0.41183773131044676,
"learning_rate": 3.3487393103410683e-06,
"loss": 0.4202,
"step": 152
},
{
"epoch": 5.884615384615385,
"grad_norm": 0.4177204473083171,
"learning_rate": 3.3297696328213215e-06,
"loss": 0.5144,
"step": 153
},
{
"epoch": 5.923076923076923,
"grad_norm": 0.5632867664875632,
"learning_rate": 3.3107461127392072e-06,
"loss": 0.4375,
"step": 154
},
{
"epoch": 5.961538461538462,
"grad_norm": 0.5035767522506955,
"learning_rate": 3.291669984503682e-06,
"loss": 0.3978,
"step": 155
},
{
"epoch": 6.0,
"grad_norm": 0.38032531941618825,
"learning_rate": 3.272542485937369e-06,
"loss": 0.5376,
"step": 156
},
{
"epoch": 6.038461538461538,
"grad_norm": 0.7171824151944847,
"learning_rate": 3.25336485819624e-06,
"loss": 0.4286,
"step": 157
},
{
"epoch": 6.076923076923077,
"grad_norm": 0.8186367401766156,
"learning_rate": 3.2341383456890776e-06,
"loss": 0.4859,
"step": 158
},
{
"epoch": 6.115384615384615,
"grad_norm": 0.4282358834280859,
"learning_rate": 3.214864195996723e-06,
"loss": 0.3245,
"step": 159
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.5299932389205944,
"learning_rate": 3.195543659791132e-06,
"loss": 0.3781,
"step": 160
},
{
"epoch": 6.1923076923076925,
"grad_norm": 0.7312028043937328,
"learning_rate": 3.17617799075421e-06,
"loss": 0.4593,
"step": 161
},
{
"epoch": 6.230769230769231,
"grad_norm": 0.4438755365149523,
"learning_rate": 3.1567684454964674e-06,
"loss": 0.3754,
"step": 162
},
{
"epoch": 6.269230769230769,
"grad_norm": 0.49986780180998563,
"learning_rate": 3.1373162834754835e-06,
"loss": 0.3928,
"step": 163
},
{
"epoch": 6.3076923076923075,
"grad_norm": 0.5018972246724045,
"learning_rate": 3.117822766914174e-06,
"loss": 0.2548,
"step": 164
},
{
"epoch": 6.346153846153846,
"grad_norm": 0.5483100007115429,
"learning_rate": 3.0982891607188948e-06,
"loss": 0.277,
"step": 165
},
{
"epoch": 6.384615384615385,
"grad_norm": 0.49689281955934633,
"learning_rate": 3.0787167323973584e-06,
"loss": 0.2597,
"step": 166
},
{
"epoch": 6.423076923076923,
"grad_norm": 0.4027937354358392,
"learning_rate": 3.0591067519763894e-06,
"loss": 0.3835,
"step": 167
},
{
"epoch": 6.461538461538462,
"grad_norm": 0.5118060463198152,
"learning_rate": 3.039460491919516e-06,
"loss": 0.3728,
"step": 168
},
{
"epoch": 6.5,
"grad_norm": 0.47925684071917685,
"learning_rate": 3.019779227044398e-06,
"loss": 0.4208,
"step": 169
},
{
"epoch": 6.538461538461538,
"grad_norm": 0.4352967699287943,
"learning_rate": 3.0000642344401115e-06,
"loss": 0.508,
"step": 170
},
{
"epoch": 6.576923076923077,
"grad_norm": 0.4550075954830929,
"learning_rate": 2.9803167933842712e-06,
"loss": 0.4579,
"step": 171
},
{
"epoch": 6.615384615384615,
"grad_norm": 0.4177250074477428,
"learning_rate": 2.960538185260029e-06,
"loss": 0.2693,
"step": 172
},
{
"epoch": 6.653846153846154,
"grad_norm": 0.485295005073564,
"learning_rate": 2.9407296934729227e-06,
"loss": 0.3715,
"step": 173
},
{
"epoch": 6.6923076923076925,
"grad_norm": 0.3538134050545686,
"learning_rate": 2.920892603367596e-06,
"loss": 0.3562,
"step": 174
},
{
"epoch": 6.730769230769231,
"grad_norm": 0.447652246093713,
"learning_rate": 2.9010282021444008e-06,
"loss": 0.5187,
"step": 175
},
{
"epoch": 6.769230769230769,
"grad_norm": 0.4997227920641816,
"learning_rate": 2.881137778775864e-06,
"loss": 0.3123,
"step": 176
},
{
"epoch": 6.8076923076923075,
"grad_norm": 0.4348269374773686,
"learning_rate": 2.8612226239230536e-06,
"loss": 0.4705,
"step": 177
},
{
"epoch": 6.846153846153846,
"grad_norm": 0.4358691651373222,
"learning_rate": 2.8412840298518295e-06,
"loss": 0.3094,
"step": 178
},
{
"epoch": 6.884615384615385,
"grad_norm": 0.4084037477612026,
"learning_rate": 2.821323290348987e-06,
"loss": 0.3531,
"step": 179
},
{
"epoch": 6.923076923076923,
"grad_norm": 0.37734080915888196,
"learning_rate": 2.8013417006383078e-06,
"loss": 0.3738,
"step": 180
},
{
"epoch": 6.961538461538462,
"grad_norm": 0.37345514337855473,
"learning_rate": 2.781340557296514e-06,
"loss": 0.3986,
"step": 181
},
{
"epoch": 7.0,
"grad_norm": 0.4661469904131123,
"learning_rate": 2.761321158169134e-06,
"loss": 0.3717,
"step": 182
},
{
"epoch": 7.038461538461538,
"grad_norm": 0.7076797880268779,
"learning_rate": 2.7412848022862883e-06,
"loss": 0.4596,
"step": 183
},
{
"epoch": 7.076923076923077,
"grad_norm": 0.4705027460981129,
"learning_rate": 2.7212327897783963e-06,
"loss": 0.2957,
"step": 184
},
{
"epoch": 7.115384615384615,
"grad_norm": 0.3742202895931818,
"learning_rate": 2.7011664217918154e-06,
"loss": 0.1995,
"step": 185
},
{
"epoch": 7.153846153846154,
"grad_norm": 0.6149190541876877,
"learning_rate": 2.6810870004044065e-06,
"loss": 0.3374,
"step": 186
},
{
"epoch": 7.1923076923076925,
"grad_norm": 0.568239675579952,
"learning_rate": 2.6609958285410488e-06,
"loss": 0.3496,
"step": 187
},
{
"epoch": 7.230769230769231,
"grad_norm": 0.501969212073365,
"learning_rate": 2.6408942098890937e-06,
"loss": 0.3381,
"step": 188
},
{
"epoch": 7.269230769230769,
"grad_norm": 0.3800552771656915,
"learning_rate": 2.620783448813768e-06,
"loss": 0.3114,
"step": 189
},
{
"epoch": 7.3076923076923075,
"grad_norm": 0.5327131952318535,
"learning_rate": 2.6006648502735384e-06,
"loss": 0.282,
"step": 190
},
{
"epoch": 7.346153846153846,
"grad_norm": 0.4985133449111499,
"learning_rate": 2.5805397197354333e-06,
"loss": 0.3596,
"step": 191
},
{
"epoch": 7.384615384615385,
"grad_norm": 0.4330481942035506,
"learning_rate": 2.560409363090331e-06,
"loss": 0.3929,
"step": 192
},
{
"epoch": 7.423076923076923,
"grad_norm": 0.5096890385502361,
"learning_rate": 2.5402750865682283e-06,
"loss": 0.4024,
"step": 193
},
{
"epoch": 7.461538461538462,
"grad_norm": 1.1384220300859678,
"learning_rate": 2.5201381966534748e-06,
"loss": 0.3574,
"step": 194
},
{
"epoch": 7.5,
"grad_norm": 0.45277823692409325,
"learning_rate": 2.5e-06,
"loss": 0.3694,
"step": 195
},
{
"epoch": 7.538461538461538,
"grad_norm": 0.3598366252810026,
"learning_rate": 2.4798618033465256e-06,
"loss": 0.3475,
"step": 196
},
{
"epoch": 7.576923076923077,
"grad_norm": 0.507605650711227,
"learning_rate": 2.459724913431772e-06,
"loss": 0.227,
"step": 197
},
{
"epoch": 7.615384615384615,
"grad_norm": 0.4176653745984533,
"learning_rate": 2.43959063690967e-06,
"loss": 0.3827,
"step": 198
},
{
"epoch": 7.653846153846154,
"grad_norm": 0.357063614895515,
"learning_rate": 2.4194602802645684e-06,
"loss": 0.4291,
"step": 199
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.40598350398883626,
"learning_rate": 2.399335149726463e-06,
"loss": 0.2007,
"step": 200
},
{
"epoch": 7.730769230769231,
"grad_norm": 0.4765396454533307,
"learning_rate": 2.379216551186233e-06,
"loss": 0.3043,
"step": 201
},
{
"epoch": 7.769230769230769,
"grad_norm": 0.40291257531450625,
"learning_rate": 2.3591057901109063e-06,
"loss": 0.425,
"step": 202
},
{
"epoch": 7.8076923076923075,
"grad_norm": 0.37654212040684903,
"learning_rate": 2.3390041714589516e-06,
"loss": 0.2046,
"step": 203
},
{
"epoch": 7.846153846153846,
"grad_norm": 0.42635430764759097,
"learning_rate": 2.3189129995955944e-06,
"loss": 0.3833,
"step": 204
},
{
"epoch": 7.884615384615385,
"grad_norm": 0.5211999817060086,
"learning_rate": 2.2988335782081854e-06,
"loss": 0.5149,
"step": 205
},
{
"epoch": 7.923076923076923,
"grad_norm": 0.3750493471230682,
"learning_rate": 2.2787672102216045e-06,
"loss": 0.3424,
"step": 206
},
{
"epoch": 7.961538461538462,
"grad_norm": 0.45738143535125575,
"learning_rate": 2.258715197713712e-06,
"loss": 0.2011,
"step": 207
},
{
"epoch": 8.0,
"grad_norm": 0.41140646800685055,
"learning_rate": 2.238678841830867e-06,
"loss": 0.6172,
"step": 208
},
{
"epoch": 8.038461538461538,
"grad_norm": 0.4519178763903938,
"learning_rate": 2.2186594427034868e-06,
"loss": 0.2643,
"step": 209
},
{
"epoch": 8.076923076923077,
"grad_norm": 0.39353646673207954,
"learning_rate": 2.1986582993616926e-06,
"loss": 0.3688,
"step": 210
},
{
"epoch": 8.115384615384615,
"grad_norm": 0.3325326790913655,
"learning_rate": 2.178676709651014e-06,
"loss": 0.1834,
"step": 211
},
{
"epoch": 8.153846153846153,
"grad_norm": 1.7571111944550344,
"learning_rate": 2.1587159701481718e-06,
"loss": 0.362,
"step": 212
},
{
"epoch": 8.192307692307692,
"grad_norm": 0.41759821313528867,
"learning_rate": 2.1387773760769477e-06,
"loss": 0.2023,
"step": 213
},
{
"epoch": 8.23076923076923,
"grad_norm": 0.4314676228849499,
"learning_rate": 2.1188622212241366e-06,
"loss": 0.1954,
"step": 214
},
{
"epoch": 8.26923076923077,
"grad_norm": 0.3877676711204905,
"learning_rate": 2.0989717978555992e-06,
"loss": 0.4745,
"step": 215
},
{
"epoch": 8.307692307692308,
"grad_norm": 0.47245327189394,
"learning_rate": 2.079107396632404e-06,
"loss": 0.2973,
"step": 216
},
{
"epoch": 8.346153846153847,
"grad_norm": 0.4052393199704439,
"learning_rate": 2.0592703065270777e-06,
"loss": 0.2097,
"step": 217
},
{
"epoch": 8.384615384615385,
"grad_norm": 0.4313505494454412,
"learning_rate": 2.0394618147399713e-06,
"loss": 0.2667,
"step": 218
},
{
"epoch": 8.423076923076923,
"grad_norm": 0.4454566796379792,
"learning_rate": 2.019683206615729e-06,
"loss": 0.2783,
"step": 219
},
{
"epoch": 8.461538461538462,
"grad_norm": 0.42931541141385865,
"learning_rate": 1.9999357655598894e-06,
"loss": 0.331,
"step": 220
},
{
"epoch": 8.5,
"grad_norm": 0.5368759318605051,
"learning_rate": 1.9802207729556023e-06,
"loss": 0.4092,
"step": 221
},
{
"epoch": 8.538461538461538,
"grad_norm": 0.39319040108908987,
"learning_rate": 1.960539508080485e-06,
"loss": 0.3224,
"step": 222
},
{
"epoch": 8.576923076923077,
"grad_norm": 0.33398610783086696,
"learning_rate": 1.940893248023612e-06,
"loss": 0.3013,
"step": 223
},
{
"epoch": 8.615384615384615,
"grad_norm": 0.41372509168979466,
"learning_rate": 1.921283267602643e-06,
"loss": 0.4431,
"step": 224
},
{
"epoch": 8.653846153846153,
"grad_norm": 0.45779278911697124,
"learning_rate": 1.9017108392811065e-06,
"loss": 0.2524,
"step": 225
},
{
"epoch": 8.692307692307692,
"grad_norm": 0.40817506076516025,
"learning_rate": 1.8821772330858259e-06,
"loss": 0.2038,
"step": 226
},
{
"epoch": 8.73076923076923,
"grad_norm": 0.37963645007803587,
"learning_rate": 1.8626837165245165e-06,
"loss": 0.385,
"step": 227
},
{
"epoch": 8.76923076923077,
"grad_norm": 0.48116684628673423,
"learning_rate": 1.8432315545035328e-06,
"loss": 0.2815,
"step": 228
},
{
"epoch": 8.807692307692308,
"grad_norm": 0.5907769436853136,
"learning_rate": 1.8238220092457909e-06,
"loss": 0.3713,
"step": 229
},
{
"epoch": 8.846153846153847,
"grad_norm": 0.6129928044002855,
"learning_rate": 1.8044563402088686e-06,
"loss": 0.3014,
"step": 230
},
{
"epoch": 8.884615384615385,
"grad_norm": 0.4955121873754913,
"learning_rate": 1.7851358040032773e-06,
"loss": 0.2634,
"step": 231
},
{
"epoch": 8.923076923076923,
"grad_norm": 0.48937225000365864,
"learning_rate": 1.7658616543109237e-06,
"loss": 0.2948,
"step": 232
},
{
"epoch": 8.961538461538462,
"grad_norm": 0.6666285440555229,
"learning_rate": 1.7466351418037608e-06,
"loss": 0.4233,
"step": 233
},
{
"epoch": 9.0,
"grad_norm": 0.370440162908677,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.3876,
"step": 234
}
],
"logging_steps": 1,
"max_steps": 390,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 368952499765248.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}