{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 225,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.044444444444444446,
      "grad_norm": 0.17873810231685638,
      "learning_rate": 8.695652173913044e-06,
      "loss": 0.9284,
      "step": 2
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 0.1875726729631424,
      "learning_rate": 1.739130434782609e-05,
      "loss": 0.9365,
      "step": 4
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 0.2115127593278885,
      "learning_rate": 2.608695652173913e-05,
      "loss": 0.9367,
      "step": 6
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 0.2282930463552475,
      "learning_rate": 3.478260869565218e-05,
      "loss": 0.8894,
      "step": 8
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 0.2383526861667633,
      "learning_rate": 4.347826086956522e-05,
      "loss": 0.9455,
      "step": 10
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 0.23653796315193176,
      "learning_rate": 5.217391304347826e-05,
      "loss": 1.0121,
      "step": 12
    },
    {
      "epoch": 0.3111111111111111,
      "grad_norm": 0.253218412399292,
      "learning_rate": 6.086956521739131e-05,
      "loss": 0.8756,
      "step": 14
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 0.24895112216472626,
      "learning_rate": 6.956521739130436e-05,
      "loss": 0.9091,
      "step": 16
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.2350614368915558,
      "learning_rate": 7.82608695652174e-05,
      "loss": 0.9091,
      "step": 18
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.27106258273124695,
      "learning_rate": 8.695652173913044e-05,
      "loss": 0.8018,
      "step": 20
    },
    {
      "epoch": 0.4888888888888889,
      "grad_norm": 0.3147624731063843,
      "learning_rate": 9.565217391304348e-05,
      "loss": 0.796,
      "step": 22
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.3592516779899597,
      "learning_rate": 9.999395316300748e-05,
      "loss": 0.8837,
      "step": 24
    },
    {
      "epoch": 0.5777777777777777,
      "grad_norm": 0.27557510137557983,
      "learning_rate": 9.994558724213054e-05,
      "loss": 0.8367,
      "step": 26
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 0.2757929861545563,
      "learning_rate": 9.984890219128146e-05,
      "loss": 0.8764,
      "step": 28
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.30082619190216064,
      "learning_rate": 9.970399154700263e-05,
      "loss": 0.8687,
      "step": 30
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 0.26034796237945557,
      "learning_rate": 9.951099550098349e-05,
      "loss": 0.8371,
      "step": 32
    },
    {
      "epoch": 0.7555555555555555,
      "grad_norm": 0.22807522118091583,
      "learning_rate": 9.927010076443407e-05,
      "loss": 0.8085,
      "step": 34
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.22343170642852783,
      "learning_rate": 9.898154038745408e-05,
      "loss": 0.8812,
      "step": 36
    },
    {
      "epoch": 0.8444444444444444,
      "grad_norm": 0.18960116803646088,
      "learning_rate": 9.864559353357187e-05,
      "loss": 0.7953,
      "step": 38
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.21037450432777405,
      "learning_rate": 9.826258520967178e-05,
      "loss": 0.7484,
      "step": 40
    },
    {
      "epoch": 0.9333333333333333,
      "grad_norm": 0.2212020456790924,
      "learning_rate": 9.783288595157098e-05,
      "loss": 0.7931,
      "step": 42
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 0.23480449616909027,
      "learning_rate": 9.735691146555002e-05,
      "loss": 0.8413,
      "step": 44
    },
    {
      "epoch": 1.0222222222222221,
      "grad_norm": 0.22327324748039246,
      "learning_rate": 9.683512222618377e-05,
      "loss": 0.8391,
      "step": 46
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 0.24856683611869812,
      "learning_rate": 9.626802303086208e-05,
      "loss": 0.7997,
      "step": 48
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.2144002616405487,
      "learning_rate": 9.565616251143094e-05,
      "loss": 0.9146,
      "step": 50
    },
    {
      "epoch": 1.1555555555555554,
      "grad_norm": 0.24486525356769562,
      "learning_rate": 9.500013260342651e-05,
      "loss": 0.7813,
      "step": 52
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.23040799796581268,
      "learning_rate": 9.430056797341574e-05,
      "loss": 0.7282,
      "step": 54
    },
    {
      "epoch": 1.2444444444444445,
      "grad_norm": 0.24283307790756226,
      "learning_rate": 9.355814540499752e-05,
      "loss": 0.7598,
      "step": 56
    },
    {
      "epoch": 1.2888888888888888,
      "grad_norm": 0.25237399339675903,
      "learning_rate": 9.27735831440582e-05,
      "loss": 0.7959,
      "step": 58
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.2991330325603485,
      "learning_rate": 9.194764020391506e-05,
      "loss": 0.7853,
      "step": 60
    },
    {
      "epoch": 1.3777777777777778,
      "grad_norm": 0.27033334970474243,
      "learning_rate": 9.108111563102004e-05,
      "loss": 0.7553,
      "step": 62
    },
    {
      "epoch": 1.4222222222222223,
      "grad_norm": 0.32717347145080566,
      "learning_rate": 9.017484773193378e-05,
      "loss": 0.7809,
      "step": 64
    },
    {
      "epoch": 1.4666666666666668,
      "grad_norm": 0.29897943139076233,
      "learning_rate": 8.92297132623183e-05,
      "loss": 0.7628,
      "step": 66
    },
    {
      "epoch": 1.511111111111111,
      "grad_norm": 0.30663371086120605,
      "learning_rate": 8.824662657873239e-05,
      "loss": 0.7763,
      "step": 68
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 0.28747016191482544,
      "learning_rate": 8.722653875405075e-05,
      "loss": 0.737,
      "step": 70
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.30916616320610046,
      "learning_rate": 8.617043665736249e-05,
      "loss": 0.7879,
      "step": 72
    },
    {
      "epoch": 1.6444444444444444,
      "grad_norm": 0.35312747955322266,
      "learning_rate": 8.507934199923884e-05,
      "loss": 0.7707,
      "step": 74
    },
    {
      "epoch": 1.6888888888888889,
      "grad_norm": 0.36939722299575806,
      "learning_rate": 8.39543103432943e-05,
      "loss": 0.727,
      "step": 76
    },
    {
      "epoch": 1.7333333333333334,
      "grad_norm": 0.3665863871574402,
      "learning_rate": 8.2796430084997e-05,
      "loss": 0.7193,
      "step": 78
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.34428849816322327,
      "learning_rate": 8.160682139871633e-05,
      "loss": 0.7573,
      "step": 80
    },
    {
      "epoch": 1.8222222222222222,
      "grad_norm": 0.3433472216129303,
      "learning_rate": 8.03866351540266e-05,
      "loss": 0.7711,
      "step": 82
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 0.3693158030509949,
      "learning_rate": 7.913705180231505e-05,
      "loss": 0.7741,
      "step": 84
    },
    {
      "epoch": 1.911111111111111,
      "grad_norm": 0.4667452573776245,
      "learning_rate": 7.785928023477142e-05,
      "loss": 0.812,
      "step": 86
    },
    {
      "epoch": 1.9555555555555557,
      "grad_norm": 0.36592212319374084,
      "learning_rate": 7.655455661286376e-05,
      "loss": 0.7561,
      "step": 88
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.4149788022041321,
      "learning_rate": 7.5224143172432e-05,
      "loss": 0.7797,
      "step": 90
    },
    {
      "epoch": 2.0444444444444443,
      "grad_norm": 0.39174893498420715,
      "learning_rate": 7.386932700255636e-05,
      "loss": 0.7651,
      "step": 92
    },
    {
      "epoch": 2.088888888888889,
      "grad_norm": 0.4016912877559662,
      "learning_rate": 7.24914188003818e-05,
      "loss": 0.741,
      "step": 94
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 0.46315836906433105,
      "learning_rate": 7.109175160310312e-05,
      "loss": 0.695,
      "step": 96
    },
    {
      "epoch": 2.1777777777777776,
      "grad_norm": 0.48870378732681274,
      "learning_rate": 6.967167949833763e-05,
      "loss": 0.7259,
      "step": 98
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.43603938817977905,
      "learning_rate": 6.823257631413276e-05,
      "loss": 0.7574,
      "step": 100
    },
    {
      "epoch": 2.2666666666666666,
      "grad_norm": 0.46362167596817017,
      "learning_rate": 6.677583428987625e-05,
      "loss": 0.6457,
      "step": 102
    },
    {
      "epoch": 2.311111111111111,
      "grad_norm": 0.5170242786407471,
      "learning_rate": 6.530286272939437e-05,
      "loss": 0.6498,
      "step": 104
    },
    {
      "epoch": 2.3555555555555556,
      "grad_norm": 0.4572562575340271,
      "learning_rate": 6.381508663754153e-05,
      "loss": 0.6389,
      "step": 106
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.5652968287467957,
      "learning_rate": 6.231394534160008e-05,
      "loss": 0.7215,
      "step": 108
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.5415116548538208,
      "learning_rate": 6.0800891098824186e-05,
      "loss": 0.7096,
      "step": 110
    },
    {
      "epoch": 2.488888888888889,
      "grad_norm": 0.493282675743103,
      "learning_rate": 5.9277387691474676e-05,
      "loss": 0.6736,
      "step": 112
    },
    {
      "epoch": 2.533333333333333,
      "grad_norm": 0.6388583183288574,
      "learning_rate": 5.774490901070424e-05,
      "loss": 0.6875,
      "step": 114
    },
    {
      "epoch": 2.5777777777777775,
      "grad_norm": 0.4991196393966675,
      "learning_rate": 5.620493763066297e-05,
      "loss": 0.6023,
      "step": 116
    },
    {
      "epoch": 2.6222222222222222,
      "grad_norm": 0.5749160051345825,
      "learning_rate": 5.465896337420359e-05,
      "loss": 0.6356,
      "step": 118
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.6559845209121704,
      "learning_rate": 5.3108481871574036e-05,
      "loss": 0.6939,
      "step": 120
    },
    {
      "epoch": 2.7111111111111112,
      "grad_norm": 0.6793063879013062,
      "learning_rate": 5.155499311349185e-05,
      "loss": 0.6981,
      "step": 122
    },
    {
      "epoch": 2.7555555555555555,
      "grad_norm": 0.6073561310768127,
      "learning_rate": 5e-05,
      "loss": 0.6733,
      "step": 124
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.6620696187019348,
      "learning_rate": 4.844500688650816e-05,
      "loss": 0.6851,
      "step": 126
    },
    {
      "epoch": 2.8444444444444446,
      "grad_norm": 0.626446545124054,
      "learning_rate": 4.6891518128425976e-05,
      "loss": 0.654,
      "step": 128
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.6012407541275024,
      "learning_rate": 4.534103662579642e-05,
      "loss": 0.6303,
      "step": 130
    },
    {
      "epoch": 2.9333333333333336,
      "grad_norm": 0.6335547566413879,
      "learning_rate": 4.3795062369337034e-05,
      "loss": 0.5819,
      "step": 132
    },
    {
      "epoch": 2.977777777777778,
      "grad_norm": 0.6275166273117065,
      "learning_rate": 4.2255090989295764e-05,
      "loss": 0.6726,
      "step": 134
    },
    {
      "epoch": 3.022222222222222,
      "grad_norm": 0.545259416103363,
      "learning_rate": 4.0722612308525335e-05,
      "loss": 0.6621,
      "step": 136
    },
    {
      "epoch": 3.066666666666667,
      "grad_norm": 0.6598561406135559,
      "learning_rate": 3.919910890117584e-05,
      "loss": 0.5295,
      "step": 138
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 0.6240947842597961,
      "learning_rate": 3.7686054658399935e-05,
      "loss": 0.5466,
      "step": 140
    },
    {
      "epoch": 3.1555555555555554,
      "grad_norm": 0.682789146900177,
      "learning_rate": 3.618491336245849e-05,
      "loss": 0.5419,
      "step": 142
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.7448738217353821,
      "learning_rate": 3.469713727060564e-05,
      "loss": 0.6171,
      "step": 144
    },
    {
      "epoch": 3.2444444444444445,
      "grad_norm": 0.7555476427078247,
      "learning_rate": 3.3224165710123756e-05,
      "loss": 0.5628,
      "step": 146
    },
    {
      "epoch": 3.2888888888888888,
      "grad_norm": 0.7862750887870789,
      "learning_rate": 3.176742368586725e-05,
      "loss": 0.5923,
      "step": 148
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.7729607224464417,
      "learning_rate": 3.032832050166239e-05,
      "loss": 0.5308,
      "step": 150
    },
    {
      "epoch": 3.3777777777777778,
      "grad_norm": 0.7676399350166321,
      "learning_rate": 2.890824839689689e-05,
      "loss": 0.6444,
      "step": 152
    },
    {
      "epoch": 3.422222222222222,
      "grad_norm": 0.8251731991767883,
      "learning_rate": 2.750858119961821e-05,
      "loss": 0.6573,
      "step": 154
    },
    {
      "epoch": 3.466666666666667,
      "grad_norm": 0.8169701099395752,
      "learning_rate": 2.613067299744364e-05,
      "loss": 0.582,
      "step": 156
    },
    {
      "epoch": 3.511111111111111,
      "grad_norm": 0.7788083553314209,
      "learning_rate": 2.4775856827568016e-05,
      "loss": 0.6335,
      "step": 158
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.8565711379051208,
      "learning_rate": 2.3445443387136244e-05,
      "loss": 0.508,
      "step": 160
    },
    {
      "epoch": 3.6,
      "grad_norm": 0.8271569609642029,
      "learning_rate": 2.2140719765228584e-05,
      "loss": 0.5991,
      "step": 162
    },
    {
      "epoch": 3.6444444444444444,
      "grad_norm": 0.812567412853241,
      "learning_rate": 2.0862948197684955e-05,
      "loss": 0.5808,
      "step": 164
    },
    {
      "epoch": 3.688888888888889,
      "grad_norm": 0.7940819263458252,
      "learning_rate": 1.961336484597343e-05,
      "loss": 0.5703,
      "step": 166
    },
    {
      "epoch": 3.7333333333333334,
      "grad_norm": 0.9031268358230591,
      "learning_rate": 1.8393178601283683e-05,
      "loss": 0.5002,
      "step": 168
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 0.8260697722434998,
      "learning_rate": 1.7203569915003005e-05,
      "loss": 0.5269,
      "step": 170
    },
    {
      "epoch": 3.822222222222222,
      "grad_norm": 0.9412121772766113,
      "learning_rate": 1.6045689656705716e-05,
      "loss": 0.4902,
      "step": 172
    },
    {
      "epoch": 3.8666666666666667,
      "grad_norm": 0.9131683707237244,
      "learning_rate": 1.4920658000761174e-05,
      "loss": 0.6185,
      "step": 174
    },
    {
      "epoch": 3.911111111111111,
      "grad_norm": 0.7923269271850586,
      "learning_rate": 1.3829563342637513e-05,
      "loss": 0.537,
      "step": 176
    },
    {
      "epoch": 3.9555555555555557,
      "grad_norm": 0.8013989329338074,
      "learning_rate": 1.2773461245949247e-05,
      "loss": 0.5637,
      "step": 178
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.7401474118232727,
      "learning_rate": 1.1753373421267621e-05,
      "loss": 0.5124,
      "step": 180
    },
    {
      "epoch": 4.044444444444444,
      "grad_norm": 0.8890312910079956,
      "learning_rate": 1.0770286737681701e-05,
      "loss": 0.5488,
      "step": 182
    },
    {
      "epoch": 4.088888888888889,
      "grad_norm": 0.858996570110321,
      "learning_rate": 9.825152268066213e-06,
      "loss": 0.4998,
      "step": 184
    },
    {
      "epoch": 4.133333333333334,
      "grad_norm": 0.755382239818573,
      "learning_rate": 8.91888436897997e-06,
      "loss": 0.4892,
      "step": 186
    },
    {
      "epoch": 4.177777777777778,
      "grad_norm": 0.8359836935997009,
      "learning_rate": 8.052359796084951e-06,
      "loss": 0.5365,
      "step": 188
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.8235160112380981,
      "learning_rate": 7.226416855941814e-06,
      "loss": 0.4963,
      "step": 190
    },
    {
      "epoch": 4.266666666666667,
      "grad_norm": 0.9951562881469727,
      "learning_rate": 6.441854595002477e-06,
      "loss": 0.5367,
      "step": 192
    },
    {
      "epoch": 4.311111111111111,
      "grad_norm": 0.907408595085144,
      "learning_rate": 5.699432026584267e-06,
      "loss": 0.5051,
      "step": 194
    },
    {
      "epoch": 4.355555555555555,
      "grad_norm": 0.9453915357589722,
      "learning_rate": 4.999867396573499e-06,
      "loss": 0.5326,
      "step": 196
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.8306211829185486,
      "learning_rate": 4.343837488569058e-06,
      "loss": 0.4549,
      "step": 198
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.9665538668632507,
      "learning_rate": 3.731976969137929e-06,
      "loss": 0.4968,
      "step": 200
    },
    {
      "epoch": 4.488888888888889,
      "grad_norm": 0.9233301281929016,
      "learning_rate": 3.1648777738162494e-06,
      "loss": 0.4623,
      "step": 202
    },
    {
      "epoch": 4.533333333333333,
      "grad_norm": 0.8156710267066956,
      "learning_rate": 2.6430885344499946e-06,
      "loss": 0.4445,
      "step": 204
    },
    {
      "epoch": 4.5777777777777775,
      "grad_norm": 0.8379063010215759,
      "learning_rate": 2.1671140484290142e-06,
      "loss": 0.4678,
      "step": 206
    },
    {
      "epoch": 4.622222222222222,
      "grad_norm": 0.8422232866287231,
      "learning_rate": 1.7374147903282178e-06,
      "loss": 0.5377,
      "step": 208
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.9653432965278625,
      "learning_rate": 1.3544064664281265e-06,
      "loss": 0.5819,
      "step": 210
    },
    {
      "epoch": 4.711111111111111,
      "grad_norm": 0.8152110576629639,
      "learning_rate": 1.0184596125459135e-06,
      "loss": 0.494,
      "step": 212
    },
    {
      "epoch": 4.7555555555555555,
      "grad_norm": 0.9850694537162781,
      "learning_rate": 7.29899235565934e-07,
      "loss": 0.4983,
      "step": 214
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.9584403038024902,
      "learning_rate": 4.890044990165321e-07,
      "loss": 0.5186,
      "step": 216
    },
    {
      "epoch": 4.844444444444444,
      "grad_norm": 0.948579728603363,
      "learning_rate": 2.9600845299737056e-07,
      "loss": 0.5044,
      "step": 218
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 0.8661581873893738,
      "learning_rate": 1.5109780871853663e-07,
      "loss": 0.5062,
      "step": 220
    },
    {
      "epoch": 4.933333333333334,
      "grad_norm": 0.9714407324790955,
      "learning_rate": 5.4412757869459763e-08,
      "loss": 0.5391,
      "step": 222
    },
    {
      "epoch": 4.977777777777778,
      "grad_norm": 0.7409553527832031,
      "learning_rate": 6.04683699252373e-09,
      "loss": 0.5243,
      "step": 224
    },
    {
      "epoch": 5.0,
      "step": 225,
      "total_flos": 8.831352737326694e+16,
      "train_loss": 0.6794352424144745,
      "train_runtime": 806.654,
      "train_samples_per_second": 8.926,
      "train_steps_per_second": 0.279
    }
  ],
  "logging_steps": 2,
  "max_steps": 225,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.831352737326694e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}