{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7486075009283327,
  "eval_steps": 84,
  "global_step": 252,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.297607958316803,
      "learning_rate": 2e-05,
      "loss": 0.5915,
      "step": 1
    },
    {
      "epoch": 0.0,
      "eval_loss": 1.027875304222107,
      "eval_runtime": 319.0994,
      "eval_samples_per_second": 0.78,
      "eval_steps_per_second": 0.78,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.3444654643535614,
      "learning_rate": 4e-05,
      "loss": 0.5941,
      "step": 2
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.31323298811912537,
      "learning_rate": 6e-05,
      "loss": 0.5986,
      "step": 3
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.3083553612232208,
      "learning_rate": 8e-05,
      "loss": 0.589,
      "step": 4
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.3427445590496063,
      "learning_rate": 0.0001,
      "loss": 0.6559,
      "step": 5
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.26026275753974915,
      "learning_rate": 0.00012,
      "loss": 0.6444,
      "step": 6
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.22449900209903717,
      "learning_rate": 0.00014,
      "loss": 0.5621,
      "step": 7
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.18667733669281006,
      "learning_rate": 0.00016,
      "loss": 0.574,
      "step": 8
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.1876465231180191,
      "learning_rate": 0.00018,
      "loss": 0.6105,
      "step": 9
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.2330338954925537,
      "learning_rate": 0.0002,
      "loss": 0.5793,
      "step": 10
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.18704406917095184,
      "learning_rate": 0.00019999535665248002,
      "loss": 0.5431,
      "step": 11
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.20125611126422882,
      "learning_rate": 0.0001999814270411335,
      "loss": 0.5219,
      "step": 12
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.17054004967212677,
      "learning_rate": 0.000199958212459561,
      "loss": 0.5022,
      "step": 13
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.20665132999420166,
      "learning_rate": 0.00019992571506363,
      "loss": 0.5907,
      "step": 14
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.21675613522529602,
      "learning_rate": 0.00019988393787127441,
      "loss": 0.4667,
      "step": 15
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.20350293815135956,
      "learning_rate": 0.0001998328847622148,
      "loss": 0.5533,
      "step": 16
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.18382889032363892,
      "learning_rate": 0.00019977256047759765,
      "loss": 0.5022,
      "step": 17
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.1696760207414627,
      "learning_rate": 0.00019970297061955533,
      "loss": 0.5381,
      "step": 18
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.19592055678367615,
      "learning_rate": 0.00019962412165068573,
      "loss": 0.5733,
      "step": 19
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.17972029745578766,
      "learning_rate": 0.00019953602089345217,
      "loss": 0.5891,
      "step": 20
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.19014880061149597,
      "learning_rate": 0.0001994386765295032,
      "loss": 0.5629,
      "step": 21
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.15824897587299347,
      "learning_rate": 0.00019933209759891317,
      "loss": 0.5627,
      "step": 22
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.17133353650569916,
      "learning_rate": 0.00019921629399934223,
      "loss": 0.5542,
      "step": 23
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.13738161325454712,
      "learning_rate": 0.00019909127648511755,
      "loss": 0.4443,
      "step": 24
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.16617180407047272,
      "learning_rate": 0.0001989570566662345,
      "loss": 0.5469,
      "step": 25
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.13969440758228302,
      "learning_rate": 0.00019881364700727823,
      "loss": 0.5141,
      "step": 26
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.1598738580942154,
      "learning_rate": 0.0001986610608262665,
      "loss": 0.5598,
      "step": 27
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.14080321788787842,
      "learning_rate": 0.00019849931229341258,
      "loss": 0.4978,
      "step": 28
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.16124503314495087,
      "learning_rate": 0.00019832841642980945,
      "loss": 0.613,
      "step": 29
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.1537931114435196,
      "learning_rate": 0.00019814838910603481,
      "loss": 0.4913,
      "step": 30
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.16199100017547607,
      "learning_rate": 0.00019795924704067721,
      "loss": 0.4872,
      "step": 31
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.13554096221923828,
      "learning_rate": 0.00019776100779878345,
      "loss": 0.4609,
      "step": 32
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.18090932071208954,
      "learning_rate": 0.00019755368979022732,
      "loss": 0.4582,
      "step": 33
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.14535771310329437,
      "learning_rate": 0.00019733731226800015,
      "loss": 0.5359,
      "step": 34
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.1427253782749176,
      "learning_rate": 0.00019711189532642243,
      "loss": 0.5324,
      "step": 35
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.13935472071170807,
      "learning_rate": 0.00019687745989927823,
      "loss": 0.5106,
      "step": 36
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.1388338804244995,
      "learning_rate": 0.00019663402775787066,
      "loss": 0.5634,
      "step": 37
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.13948731124401093,
      "learning_rate": 0.00019638162150900027,
      "loss": 0.5058,
      "step": 38
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.13687050342559814,
      "learning_rate": 0.00019612026459286578,
      "loss": 0.5806,
      "step": 39
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.1311887949705124,
      "learning_rate": 0.00019584998128088684,
      "loss": 0.4552,
      "step": 40
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.15644784271717072,
      "learning_rate": 0.0001955707966734505,
      "loss": 0.4424,
      "step": 41
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.14039994776248932,
      "learning_rate": 0.00019528273669757972,
      "loss": 0.4905,
      "step": 42
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.14709696173667908,
      "learning_rate": 0.0001949858281045261,
      "loss": 0.5655,
      "step": 43
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.1661250740289688,
      "learning_rate": 0.00019468009846728513,
      "loss": 0.5141,
      "step": 44
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.3046470582485199,
      "learning_rate": 0.00019436557617803595,
      "loss": 0.4792,
      "step": 45
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.15159915387630463,
      "learning_rate": 0.00019404229044550433,
      "loss": 0.5077,
      "step": 46
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.15115606784820557,
      "learning_rate": 0.00019371027129225042,
      "loss": 0.4915,
      "step": 47
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.14040274918079376,
      "learning_rate": 0.0001933695495518804,
      "loss": 0.5048,
      "step": 48
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.14947518706321716,
      "learning_rate": 0.00019302015686618326,
      "loss": 0.5124,
      "step": 49
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.13805006444454193,
      "learning_rate": 0.0001926621256821922,
      "loss": 0.4455,
      "step": 50
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.14188264310359955,
      "learning_rate": 0.00019229548924917146,
      "loss": 0.5244,
      "step": 51
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.1314387321472168,
      "learning_rate": 0.00019192028161552847,
      "loss": 0.5251,
      "step": 52
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.1659722477197647,
      "learning_rate": 0.0001915365376256519,
      "loss": 0.4851,
      "step": 53
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.1405472755432129,
      "learning_rate": 0.00019114429291667583,
      "loss": 0.463,
      "step": 54
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.14347247779369354,
      "learning_rate": 0.00019074358391517023,
      "loss": 0.4491,
      "step": 55
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.14897900819778442,
      "learning_rate": 0.00019033444783375804,
      "loss": 0.4942,
      "step": 56
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.13556955754756927,
      "learning_rate": 0.00018991692266765947,
      "loss": 0.4755,
      "step": 57
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.16432009637355804,
      "learning_rate": 0.00018949104719116332,
      "loss": 0.5747,
      "step": 58
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.15541689097881317,
      "learning_rate": 0.00018905686095402647,
      "loss": 0.533,
      "step": 59
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.1607791632413864,
      "learning_rate": 0.0001886144042778006,
      "loss": 0.556,
      "step": 60
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.15379983186721802,
      "learning_rate": 0.00018816371825208789,
      "loss": 0.4549,
      "step": 61
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.15419824421405792,
      "learning_rate": 0.0001877048447307252,
      "loss": 0.4933,
      "step": 62
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.1353635936975479,
      "learning_rate": 0.00018723782632789701,
      "loss": 0.393,
      "step": 63
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.15725383162498474,
      "learning_rate": 0.00018676270641417822,
      "loss": 0.4671,
      "step": 64
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.12909093499183655,
      "learning_rate": 0.0001862795291125063,
      "loss": 0.484,
      "step": 65
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.14394541084766388,
      "learning_rate": 0.0001857883392940837,
      "loss": 0.5232,
      "step": 66
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.1671515554189682,
      "learning_rate": 0.000185289182574211,
      "loss": 0.4683,
      "step": 67
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.1658957600593567,
      "learning_rate": 0.0001847821053080505,
      "loss": 0.5072,
      "step": 68
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.15142019093036652,
      "learning_rate": 0.00018426715458632153,
      "loss": 0.5328,
      "step": 69
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.141921728849411,
      "learning_rate": 0.00018374437823092724,
      "loss": 0.4387,
      "step": 70
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.1730872541666031,
      "learning_rate": 0.00018321382479051347,
      "loss": 0.5323,
      "step": 71
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.13589338958263397,
      "learning_rate": 0.00018267554353596025,
      "loss": 0.4217,
      "step": 72
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.16519547998905182,
      "learning_rate": 0.0001821295844558062,
      "loss": 0.4498,
      "step": 73
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.16089589893817902,
      "learning_rate": 0.0001815759982516061,
      "loss": 0.5204,
      "step": 74
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.16409483551979065,
      "learning_rate": 0.00018101483633322255,
      "loss": 0.4511,
      "step": 75
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.16179721057415009,
      "learning_rate": 0.00018044615081405153,
      "loss": 0.5866,
      "step": 76
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.15137337148189545,
      "learning_rate": 0.00017986999450618295,
      "loss": 0.4728,
      "step": 77
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.14265336096286774,
      "learning_rate": 0.00017928642091549613,
      "loss": 0.4491,
      "step": 78
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.14860045909881592,
      "learning_rate": 0.00017869548423669077,
      "loss": 0.4734,
      "step": 79
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.14790605008602142,
      "learning_rate": 0.00017809723934825405,
      "loss": 0.533,
      "step": 80
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.1581815779209137,
      "learning_rate": 0.00017749174180736442,
      "loss": 0.4661,
      "step": 81
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.1520436853170395,
      "learning_rate": 0.00017687904784473188,
      "loss": 0.5292,
      "step": 82
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.15341107547283173,
      "learning_rate": 0.00017625921435937637,
      "loss": 0.5062,
      "step": 83
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.14248648285865784,
      "learning_rate": 0.00017563229891334338,
      "loss": 0.4249,
      "step": 84
    },
    {
      "epoch": 0.25,
      "eval_loss": 0.9711415767669678,
      "eval_runtime": 320.5008,
      "eval_samples_per_second": 0.777,
      "eval_steps_per_second": 0.777,
      "step": 84
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.16506022214889526,
      "learning_rate": 0.00017499835972635856,
      "loss": 0.4873,
      "step": 85
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.14895877242088318,
      "learning_rate": 0.00017435745567042095,
      "loss": 0.4774,
      "step": 86
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.25245463848114014,
      "learning_rate": 0.00017370964626433567,
      "loss": 0.5324,
      "step": 87
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.14706483483314514,
      "learning_rate": 0.0001730549916681868,
      "loss": 0.4536,
      "step": 88
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.14856815338134766,
      "learning_rate": 0.00017239355267775018,
      "loss": 0.4576,
      "step": 89
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.1513276994228363,
      "learning_rate": 0.0001717253907188477,
      "loss": 0.4286,
      "step": 90
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.1549597978591919,
      "learning_rate": 0.00017105056784164294,
      "loss": 0.4554,
      "step": 91
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.15850332379341125,
      "learning_rate": 0.00017036914671487852,
      "loss": 0.454,
      "step": 92
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.15920379757881165,
      "learning_rate": 0.00016968119062005642,
      "loss": 0.5333,
      "step": 93
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.1652740240097046,
      "learning_rate": 0.00016898676344556118,
      "loss": 0.5272,
      "step": 94
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.15505805611610413,
      "learning_rate": 0.00016828592968072678,
      "loss": 0.5429,
      "step": 95
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.16118398308753967,
      "learning_rate": 0.00016757875440984768,
      "loss": 0.4782,
      "step": 96
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.1549789011478424,
      "learning_rate": 0.0001668653033061347,
      "loss": 0.4594,
      "step": 97
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.1643325537443161,
      "learning_rate": 0.00016614564262561608,
      "loss": 0.5086,
      "step": 98
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.1949179321527481,
      "learning_rate": 0.0001654198392009846,
      "loss": 0.4052,
      "step": 99
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.1303270012140274,
      "learning_rate": 0.0001646879604353908,
      "loss": 0.4845,
      "step": 100
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.13640670478343964,
      "learning_rate": 0.00016395007429618382,
      "loss": 0.4459,
      "step": 101
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.18252772092819214,
      "learning_rate": 0.00016320624930859904,
      "loss": 0.523,
      "step": 102
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.16342373192310333,
      "learning_rate": 0.00016245655454939474,
      "loss": 0.4714,
      "step": 103
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.1418078988790512,
      "learning_rate": 0.00016170105964043695,
      "loss": 0.4841,
      "step": 104
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.15758009254932404,
      "learning_rate": 0.0001609398347422339,
      "loss": 0.3911,
      "step": 105
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.1476665586233139,
      "learning_rate": 0.00016017295054742046,
      "loss": 0.3962,
      "step": 106
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.14230811595916748,
      "learning_rate": 0.00015940047827419303,
      "loss": 0.4391,
      "step": 107
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.15269385278224945,
      "learning_rate": 0.00015862248965969604,
      "loss": 0.4992,
      "step": 108
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.14437031745910645,
      "learning_rate": 0.00015783905695335946,
      "loss": 0.4768,
      "step": 109
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.1448802649974823,
      "learning_rate": 0.0001570502529101896,
      "loss": 0.4469,
      "step": 110
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.1571648269891739,
      "learning_rate": 0.00015625615078401244,
      "loss": 0.4048,
      "step": 111
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.1563793420791626,
      "learning_rate": 0.00015545682432067067,
      "loss": 0.4518,
      "step": 112
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.1441984921693802,
      "learning_rate": 0.0001546523477511754,
      "loss": 0.497,
      "step": 113
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.13513916730880737,
      "learning_rate": 0.00015384279578481221,
      "loss": 0.4923,
      "step": 114
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.1501740962266922,
      "learning_rate": 0.00015302824360220353,
      "loss": 0.4701,
      "step": 115
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.2020605355501175,
      "learning_rate": 0.00015220876684832638,
      "loss": 0.4122,
      "step": 116
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.15191146731376648,
      "learning_rate": 0.0001513844416254879,
      "loss": 0.4432,
      "step": 117
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.1627548784017563,
      "learning_rate": 0.00015055534448625766,
      "loss": 0.4779,
      "step": 118
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.14043453335762024,
      "learning_rate": 0.00014972155242635852,
      "loss": 0.471,
      "step": 119
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.16092036664485931,
      "learning_rate": 0.0001488831428775164,
      "loss": 0.4543,
      "step": 120
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.1478460729122162,
      "learning_rate": 0.00014804019370026926,
      "loss": 0.4521,
      "step": 121
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.15067800879478455,
      "learning_rate": 0.00014719278317673655,
      "loss": 0.5665,
      "step": 122
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.16037791967391968,
      "learning_rate": 0.0001463409900033493,
      "loss": 0.5198,
      "step": 123
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.15850840508937836,
      "learning_rate": 0.00014548489328354195,
      "loss": 0.5183,
      "step": 124
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.13782666623592377,
      "learning_rate": 0.00014462457252040607,
      "loss": 0.4415,
      "step": 125
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.17566777765750885,
      "learning_rate": 0.00014376010760930728,
      "loss": 0.5166,
      "step": 126
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.14671465754508972,
      "learning_rate": 0.00014289157883046568,
      "loss": 0.454,
      "step": 127
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.14827768504619598,
      "learning_rate": 0.0001420190668415002,
      "loss": 0.3743,
      "step": 128
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.16858412325382233,
      "learning_rate": 0.00014114265266993846,
      "loss": 0.5204,
      "step": 129
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.14838969707489014,
      "learning_rate": 0.00014026241770569197,
      "loss": 0.5072,
      "step": 130
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.1676333248615265,
      "learning_rate": 0.00013937844369349734,
      "loss": 0.5026,
      "step": 131
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.1548876315355301,
      "learning_rate": 0.00013849081272532544,
      "loss": 0.4393,
      "step": 132
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.16387146711349487,
      "learning_rate": 0.00013759960723275732,
      "loss": 0.4899,
      "step": 133
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.15013404190540314,
      "learning_rate": 0.00013670490997932922,
      "loss": 0.4398,
      "step": 134
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.1466226577758789,
      "learning_rate": 0.00013580680405284664,
      "loss": 0.5087,
      "step": 135
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.15578770637512207,
      "learning_rate": 0.00013490537285766808,
      "loss": 0.4662,
      "step": 136
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.16825653612613678,
      "learning_rate": 0.00013400070010695966,
      "loss": 0.4826,
      "step": 137
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.15624648332595825,
      "learning_rate": 0.00013309286981492085,
      "loss": 0.4818,
      "step": 138
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.16185308992862701,
      "learning_rate": 0.00013218196628898233,
      "loss": 0.4848,
      "step": 139
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.15799778699874878,
      "learning_rate": 0.00013126807412197665,
      "loss": 0.4456,
      "step": 140
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.15358638763427734,
      "learning_rate": 0.0001303512781842824,
      "loss": 0.434,
      "step": 141
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.14526773989200592,
      "learning_rate": 0.00012943166361594242,
      "loss": 0.4944,
      "step": 142
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.1717992126941681,
      "learning_rate": 0.00012850931581875723,
      "loss": 0.4966,
      "step": 143
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.14694097638130188,
      "learning_rate": 0.00012758432044835392,
      "loss": 0.4055,
      "step": 144
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.17159771919250488,
      "learning_rate": 0.0001266567634062317,
      "loss": 0.4455,
      "step": 145
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.16740797460079193,
      "learning_rate": 0.0001257267308317845,
      "loss": 0.3732,
      "step": 146
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.14464697241783142,
      "learning_rate": 0.00012479430909430108,
      "loss": 0.4976,
      "step": 147
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.15393030643463135,
      "learning_rate": 0.00012385958478494487,
      "loss": 0.3653,
      "step": 148
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.15888682007789612,
      "learning_rate": 0.00012292264470871182,
      "loss": 0.4966,
      "step": 149
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.14429794251918793,
      "learning_rate": 0.00012198357587636957,
      "loss": 0.4821,
      "step": 150
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.15093272924423218,
      "learning_rate": 0.00012104246549637683,
      "loss": 0.4431,
      "step": 151
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.16575871407985687,
      "learning_rate": 0.00012009940096678452,
      "loss": 0.4903,
      "step": 152
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.15208427608013153,
      "learning_rate": 0.00011915446986711953,
      "loss": 0.4465,
      "step": 153
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.16902336478233337,
      "learning_rate": 0.00011820775995025147,
      "loss": 0.4997,
      "step": 154
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.1612280309200287,
      "learning_rate": 0.0001172593591342432,
      "loss": 0.4258,
      "step": 155
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.14114782214164734,
      "learning_rate": 0.00011630935549418627,
      "loss": 0.5262,
      "step": 156
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.16275247931480408,
      "learning_rate": 0.00011535783725402163,
      "loss": 0.4403,
      "step": 157
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.1442529857158661,
      "learning_rate": 0.00011440489277834645,
      "loss": 0.479,
      "step": 158
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.15231800079345703,
      "learning_rate": 0.0001134506105642081,
      "loss": 0.4856,
      "step": 159
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.14390550553798676,
      "learning_rate": 0.00011249507923288562,
      "loss": 0.3665,
      "step": 160
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.19076046347618103,
      "learning_rate": 0.0001115383875216598,
      "loss": 0.4423,
      "step": 161
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.16523972153663635,
      "learning_rate": 0.00011058062427557229,
      "loss": 0.5816,
      "step": 162
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.1553642749786377,
      "learning_rate": 0.00010962187843917497,
      "loss": 0.4793,
      "step": 163
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.14419597387313843,
      "learning_rate": 0.0001086622390482699,
      "loss": 0.4862,
      "step": 164
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.14601647853851318,
      "learning_rate": 0.00010770179522164079,
      "loss": 0.4589,
      "step": 165
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.15500588715076447,
      "learning_rate": 0.0001067406361527768,
      "loss": 0.4754,
      "step": 166
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.16032655537128448,
      "learning_rate": 0.00010577885110158958,
      "loss": 0.4644,
      "step": 167
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.15342958271503448,
      "learning_rate": 0.00010481652938612374,
      "loss": 0.3911,
      "step": 168
    },
    {
      "epoch": 0.5,
      "eval_loss": 0.9589568972587585,
      "eval_runtime": 324.8473,
      "eval_samples_per_second": 0.767,
      "eval_steps_per_second": 0.767,
      "step": 168
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.15607774257659912,
      "learning_rate": 0.00010385376037426226,
      "loss": 0.492,
      "step": 169
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.15384891629219055,
      "learning_rate": 0.00010289063347542726,
      "loss": 0.487,
      "step": 170
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.15275317430496216,
      "learning_rate": 0.00010192723813227672,
      "loss": 0.4475,
      "step": 171
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.1687576025724411,
      "learning_rate": 0.00010096366381239808,
      "loss": 0.4491,
      "step": 172
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.17867827415466309,
      "learning_rate": 0.0001,
      "loss": 0.4197,
      "step": 173
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.14261119067668915,
      "learning_rate": 9.903633618760195e-05,
      "loss": 0.4734,
      "step": 174
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.14787495136260986,
      "learning_rate": 9.807276186772333e-05,
      "loss": 0.404,
      "step": 175
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.15715274214744568,
      "learning_rate": 9.710936652457276e-05,
      "loss": 0.4583,
      "step": 176
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.1578068882226944,
      "learning_rate": 9.614623962573776e-05,
      "loss": 0.4343,
      "step": 177
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.15597595274448395,
      "learning_rate": 9.518347061387628e-05,
      "loss": 0.4454,
      "step": 178
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.142894446849823,
      "learning_rate": 9.422114889841044e-05,
      "loss": 0.4443,
      "step": 179
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.15279816091060638,
      "learning_rate": 9.325936384722321e-05,
      "loss": 0.4428,
      "step": 180
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.1651957929134369,
      "learning_rate": 9.229820477835927e-05,
      "loss": 0.3857,
      "step": 181
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.22555674612522125,
      "learning_rate": 9.133776095173015e-05,
      "loss": 0.4578,
      "step": 182
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.14163288474082947,
      "learning_rate": 9.037812156082504e-05,
      "loss": 0.4409,
      "step": 183
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.1683199107646942,
      "learning_rate": 8.941937572442773e-05,
      "loss": 0.4829,
      "step": 184
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.17766785621643066,
      "learning_rate": 8.846161247834024e-05,
      "loss": 0.4357,
      "step": 185
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.15518875420093536,
      "learning_rate": 8.750492076711439e-05,
      "loss": 0.4681,
      "step": 186
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.13920673727989197,
      "learning_rate": 8.654938943579194e-05,
      "loss": 0.4258,
      "step": 187
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.17552931606769562,
      "learning_rate": 8.55951072216536e-05,
      "loss": 0.3932,
      "step": 188
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.15486915409564972,
      "learning_rate": 8.464216274597838e-05,
      "loss": 0.4679,
      "step": 189
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.1437031477689743,
      "learning_rate": 8.369064450581373e-05,
      "loss": 0.3984,
      "step": 190
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.15499216318130493,
      "learning_rate": 8.274064086575681e-05,
      "loss": 0.5216,
      "step": 191
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.18269003927707672,
      "learning_rate": 8.179224004974857e-05,
      "loss": 0.4853,
      "step": 192
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.1596907526254654,
      "learning_rate": 8.084553013288048e-05,
      "loss": 0.4784,
      "step": 193
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.15115347504615784,
      "learning_rate": 7.990059903321553e-05,
      "loss": 0.4306,
      "step": 194
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.19720065593719482,
      "learning_rate": 7.89575345036232e-05,
      "loss": 0.4782,
      "step": 195
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.14170032739639282,
      "learning_rate": 7.801642412363041e-05,
      "loss": 0.4318,
      "step": 196
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.16026908159255981,
      "learning_rate": 7.707735529128819e-05,
      "loss": 0.4656,
      "step": 197
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.16910114884376526,
      "learning_rate": 7.614041521505517e-05,
      "loss": 0.5014,
      "step": 198
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.18795110285282135,
      "learning_rate": 7.520569090569893e-05,
      "loss": 0.4512,
      "step": 199
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.15903539955615997,
      "learning_rate": 7.427326916821557e-05,
      "loss": 0.5036,
      "step": 200
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.15247230231761932,
      "learning_rate": 7.334323659376829e-05,
      "loss": 0.4369,
      "step": 201
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.1472063809633255,
      "learning_rate": 7.24156795516461e-05,
      "loss": 0.426,
      "step": 202
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.15710321068763733,
      "learning_rate": 7.149068418124281e-05,
      "loss": 0.4034,
      "step": 203
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.15857233107089996,
      "learning_rate": 7.056833638405762e-05,
      "loss": 0.4783,
      "step": 204
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.16299530863761902,
      "learning_rate": 6.964872181571764e-05,
      "loss": 0.4719,
      "step": 205
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.14976787567138672,
      "learning_rate": 6.87319258780234e-05,
      "loss": 0.4543,
      "step": 206
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.14311917126178741,
      "learning_rate": 6.781803371101774e-05,
      "loss": 0.4422,
      "step": 207
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.14494828879833221,
      "learning_rate": 6.690713018507918e-05,
      "loss": 0.4078,
      "step": 208
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.1489972174167633,
      "learning_rate": 6.599929989304035e-05,
      "loss": 0.3908,
      "step": 209
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.15475499629974365,
      "learning_rate": 6.509462714233195e-05,
      "loss": 0.5416,
      "step": 210
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.1536937654018402,
      "learning_rate": 6.419319594715339e-05,
      "loss": 0.4267,
      "step": 211
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.1654786467552185,
      "learning_rate": 6.32950900206708e-05,
      "loss": 0.4622,
      "step": 212
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.14727674424648285,
      "learning_rate": 6.240039276724272e-05,
      "loss": 0.441,
      "step": 213
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.1774049550294876,
      "learning_rate": 6.150918727467455e-05,
      "loss": 0.4956,
      "step": 214
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.14867380261421204,
      "learning_rate": 6.062155630650265e-05,
      "loss": 0.5788,
      "step": 215
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.16996605694293976,
      "learning_rate": 5.973758229430806e-05,
      "loss": 0.588,
      "step": 216
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.15790195763111115,
      "learning_rate": 5.885734733006154e-05,
      "loss": 0.425,
      "step": 217
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.15656080842018127,
      "learning_rate": 5.798093315849984e-05,
      "loss": 0.4193,
      "step": 218
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.143330916762352,
      "learning_rate": 5.710842116953438e-05,
      "loss": 0.4427,
      "step": 219
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.15389469265937805,
      "learning_rate": 5.623989239069275e-05,
      "loss": 0.4265,
      "step": 220
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.16425251960754395,
      "learning_rate": 5.537542747959394e-05,
      "loss": 0.53,
      "step": 221
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.1583469659090042,
      "learning_rate": 5.451510671645807e-05,
      "loss": 0.5009,
      "step": 222
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.16075047850608826,
      "learning_rate": 5.36590099966507e-05,
      "loss": 0.5189,
      "step": 223
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.14854145050048828,
      "learning_rate": 5.2807216823263484e-05,
      "loss": 0.463,
      "step": 224
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.15285861492156982,
      "learning_rate": 5.1959806299730774e-05,
      "loss": 0.4413,
      "step": 225
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.14538119733333588,
      "learning_rate": 5.111685712248364e-05,
      "loss": 0.4736,
      "step": 226
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.15160393714904785,
      "learning_rate": 5.0278447573641495e-05,
      "loss": 0.4298,
      "step": 227
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.14497333765029907,
      "learning_rate": 4.944465551374238e-05,
      "loss": 0.3981,
      "step": 228
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.14342719316482544,
      "learning_rate": 4.861555837451213e-05,
      "loss": 0.4502,
      "step": 229
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.1621183604001999,
      "learning_rate": 4.779123315167362e-05,
      "loss": 0.4322,
      "step": 230
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.15252463519573212,
      "learning_rate": 4.6971756397796504e-05,
      "loss": 0.5072,
      "step": 231
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.17667719721794128,
      "learning_rate": 4.61572042151878e-05,
      "loss": 0.4708,
      "step": 232
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.15273234248161316,
      "learning_rate": 4.5347652248824624e-05,
      "loss": 0.5212,
      "step": 233
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.16372352838516235,
      "learning_rate": 4.4543175679329344e-05,
      "loss": 0.5303,
      "step": 234
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.14327646791934967,
      "learning_rate": 4.3743849215987595e-05,
      "loss": 0.4,
      "step": 235
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.1604086011648178,
      "learning_rate": 4.294974708981041e-05,
      "loss": 0.4508,
      "step": 236
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.23445452749729156,
      "learning_rate": 4.216094304664056e-05,
      "loss": 0.4363,
      "step": 237
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.15834979712963104,
      "learning_rate": 4.137751034030399e-05,
      "loss": 0.5153,
      "step": 238
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.14937692880630493,
      "learning_rate": 4.059952172580694e-05,
      "loss": 0.4423,
      "step": 239
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.16776776313781738,
      "learning_rate": 3.982704945257957e-05,
      "loss": 0.4089,
      "step": 240
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.16948916018009186,
      "learning_rate": 3.906016525776611e-05,
      "loss": 0.562,
      "step": 241
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.16811306774616241,
      "learning_rate": 3.829894035956306e-05,
      "loss": 0.407,
      "step": 242
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.13980595767498016,
      "learning_rate": 3.7543445450605285e-05,
      "loss": 0.4329,
      "step": 243
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.16961558163166046,
      "learning_rate": 3.6793750691400994e-05,
      "loss": 0.4476,
      "step": 244
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.16474933922290802,
      "learning_rate": 3.6049925703816214e-05,
      "loss": 0.4629,
      "step": 245
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.1619129627943039,
      "learning_rate": 3.53120395646092e-05,
      "loss": 0.4648,
      "step": 246
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.20941300690174103,
      "learning_rate": 3.458016079901544e-05,
      "loss": 0.4777,
      "step": 247
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.1691591888666153,
      "learning_rate": 3.38543573743839e-05,
      "loss": 0.4544,
      "step": 248
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.1482505053281784,
      "learning_rate": 3.3134696693865316e-05,
      "loss": 0.4295,
      "step": 249
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.16560567915439606,
      "learning_rate": 3.242124559015234e-05,
      "loss": 0.4568,
      "step": 250
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.15260429680347443,
      "learning_rate": 3.171407031927325e-05,
      "loss": 0.4311,
      "step": 251
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.15556661784648895,
      "learning_rate": 3.101323655443882e-05,
      "loss": 0.4655,
      "step": 252
    },
    {
      "epoch": 0.75,
      "eval_loss": 0.9627882838249207,
      "eval_runtime": 320.8688,
      "eval_samples_per_second": 0.776,
      "eval_steps_per_second": 0.776,
      "step": 252
    }
  ],
  "logging_steps": 1,
  "max_steps": 336,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 84,
  "total_flos": 2.5549257243937997e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}