tinyllava-v2x / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 552,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005434782608695652,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 10.5124,
"step": 1
},
{
"epoch": 0.010869565217391304,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 10.5941,
"step": 2
},
{
"epoch": 0.016304347826086956,
"grad_norm": 8.096252368220718,
"learning_rate": 1.1764705882352942e-06,
"loss": 10.475,
"step": 3
},
{
"epoch": 0.021739130434782608,
"grad_norm": 8.39383943803796,
"learning_rate": 2.3529411764705885e-06,
"loss": 10.4029,
"step": 4
},
{
"epoch": 0.02717391304347826,
"grad_norm": 8.479649232958007,
"learning_rate": 3.529411764705883e-06,
"loss": 10.606,
"step": 5
},
{
"epoch": 0.03260869565217391,
"grad_norm": 8.388175109430223,
"learning_rate": 4.705882352941177e-06,
"loss": 10.4024,
"step": 6
},
{
"epoch": 0.03804347826086957,
"grad_norm": 8.445899787393927,
"learning_rate": 5.882352941176471e-06,
"loss": 10.4772,
"step": 7
},
{
"epoch": 0.043478260869565216,
"grad_norm": 8.405772228388786,
"learning_rate": 7.058823529411766e-06,
"loss": 10.4004,
"step": 8
},
{
"epoch": 0.04891304347826087,
"grad_norm": 8.44764590867685,
"learning_rate": 8.23529411764706e-06,
"loss": 10.1775,
"step": 9
},
{
"epoch": 0.05434782608695652,
"grad_norm": 8.23897507323131,
"learning_rate": 9.411764705882354e-06,
"loss": 10.2434,
"step": 10
},
{
"epoch": 0.059782608695652176,
"grad_norm": 8.118852150518913,
"learning_rate": 1.0588235294117648e-05,
"loss": 9.7644,
"step": 11
},
{
"epoch": 0.06521739130434782,
"grad_norm": 8.570315139494753,
"learning_rate": 1.1764705882352942e-05,
"loss": 9.7751,
"step": 12
},
{
"epoch": 0.07065217391304347,
"grad_norm": 8.622402474140065,
"learning_rate": 1.2941176470588238e-05,
"loss": 9.2685,
"step": 13
},
{
"epoch": 0.07608695652173914,
"grad_norm": 8.736670863686008,
"learning_rate": 1.4117647058823532e-05,
"loss": 8.897,
"step": 14
},
{
"epoch": 0.08152173913043478,
"grad_norm": 9.172468108894085,
"learning_rate": 1.5294117647058822e-05,
"loss": 8.7101,
"step": 15
},
{
"epoch": 0.08695652173913043,
"grad_norm": 10.228378996373296,
"learning_rate": 1.647058823529412e-05,
"loss": 8.3074,
"step": 16
},
{
"epoch": 0.09239130434782608,
"grad_norm": 10.657372840257251,
"learning_rate": 1.7647058823529414e-05,
"loss": 7.8589,
"step": 17
},
{
"epoch": 0.09782608695652174,
"grad_norm": 10.887433964524527,
"learning_rate": 1.8823529411764708e-05,
"loss": 7.4742,
"step": 18
},
{
"epoch": 0.10326086956521739,
"grad_norm": 11.682285639818433,
"learning_rate": 2e-05,
"loss": 6.8416,
"step": 19
},
{
"epoch": 0.10869565217391304,
"grad_norm": 11.901377724871265,
"learning_rate": 1.999982759060109e-05,
"loss": 6.2183,
"step": 20
},
{
"epoch": 0.11413043478260869,
"grad_norm": 11.383373292219964,
"learning_rate": 1.9999310368349344e-05,
"loss": 5.4371,
"step": 21
},
{
"epoch": 0.11956521739130435,
"grad_norm": 9.311596334088138,
"learning_rate": 1.999844835107957e-05,
"loss": 4.7164,
"step": 22
},
{
"epoch": 0.125,
"grad_norm": 8.688635937406437,
"learning_rate": 1.9997241568515742e-05,
"loss": 4.456,
"step": 23
},
{
"epoch": 0.13043478260869565,
"grad_norm": 7.4122077747748305,
"learning_rate": 1.9995690062269985e-05,
"loss": 3.8875,
"step": 24
},
{
"epoch": 0.1358695652173913,
"grad_norm": 6.888182537563505,
"learning_rate": 1.9993793885841157e-05,
"loss": 3.5685,
"step": 25
},
{
"epoch": 0.14130434782608695,
"grad_norm": 6.988607551936095,
"learning_rate": 1.9991553104612982e-05,
"loss": 3.4123,
"step": 26
},
{
"epoch": 0.14673913043478262,
"grad_norm": 7.211548625105269,
"learning_rate": 1.998896779585181e-05,
"loss": 3.0838,
"step": 27
},
{
"epoch": 0.15217391304347827,
"grad_norm": 7.767483170773942,
"learning_rate": 1.998603804870395e-05,
"loss": 2.831,
"step": 28
},
{
"epoch": 0.15760869565217392,
"grad_norm": 7.950559222260086,
"learning_rate": 1.9982763964192586e-05,
"loss": 2.6297,
"step": 29
},
{
"epoch": 0.16304347826086957,
"grad_norm": 8.23795631455961,
"learning_rate": 1.9979145655214306e-05,
"loss": 2.2795,
"step": 30
},
{
"epoch": 0.16847826086956522,
"grad_norm": 8.57956169127235,
"learning_rate": 1.9975183246535212e-05,
"loss": 2.0509,
"step": 31
},
{
"epoch": 0.17391304347826086,
"grad_norm": 8.071070816084118,
"learning_rate": 1.99708768747866e-05,
"loss": 1.8279,
"step": 32
},
{
"epoch": 0.1793478260869565,
"grad_norm": 7.042152882720071,
"learning_rate": 1.9966226688460258e-05,
"loss": 1.3567,
"step": 33
},
{
"epoch": 0.18478260869565216,
"grad_norm": 4.814338676579685,
"learning_rate": 1.996123284790336e-05,
"loss": 0.9542,
"step": 34
},
{
"epoch": 0.19021739130434784,
"grad_norm": 2.9434658655739474,
"learning_rate": 1.9955895525312913e-05,
"loss": 0.8261,
"step": 35
},
{
"epoch": 0.1956521739130435,
"grad_norm": 2.452806110360505,
"learning_rate": 1.995021490472983e-05,
"loss": 0.851,
"step": 36
},
{
"epoch": 0.20108695652173914,
"grad_norm": 1.6789979391543146,
"learning_rate": 1.9944191182032588e-05,
"loss": 0.8265,
"step": 37
},
{
"epoch": 0.20652173913043478,
"grad_norm": 2.0007370440742154,
"learning_rate": 1.9937824564930474e-05,
"loss": 0.8181,
"step": 38
},
{
"epoch": 0.21195652173913043,
"grad_norm": 2.493212508529885,
"learning_rate": 1.9931115272956405e-05,
"loss": 0.767,
"step": 39
},
{
"epoch": 0.21739130434782608,
"grad_norm": 1.9209687838841931,
"learning_rate": 1.992406353745939e-05,
"loss": 0.7196,
"step": 40
},
{
"epoch": 0.22282608695652173,
"grad_norm": 1.8290330319103352,
"learning_rate": 1.9916669601596515e-05,
"loss": 0.7299,
"step": 41
},
{
"epoch": 0.22826086956521738,
"grad_norm": 1.7900648029089992,
"learning_rate": 1.990893372032459e-05,
"loss": 0.7229,
"step": 42
},
{
"epoch": 0.23369565217391305,
"grad_norm": 1.6749799534602232,
"learning_rate": 1.990085616039135e-05,
"loss": 0.7238,
"step": 43
},
{
"epoch": 0.2391304347826087,
"grad_norm": 1.986613572625418,
"learning_rate": 1.989243720032624e-05,
"loss": 0.7332,
"step": 44
},
{
"epoch": 0.24456521739130435,
"grad_norm": 1.8912806129771145,
"learning_rate": 1.9883677130430827e-05,
"loss": 0.5864,
"step": 45
},
{
"epoch": 0.25,
"grad_norm": 1.7750105086017574,
"learning_rate": 1.9874576252768793e-05,
"loss": 0.6124,
"step": 46
},
{
"epoch": 0.2554347826086957,
"grad_norm": 1.2955635391212061,
"learning_rate": 1.9865134881155504e-05,
"loss": 0.6884,
"step": 47
},
{
"epoch": 0.2608695652173913,
"grad_norm": 1.273010141736733,
"learning_rate": 1.98553533411472e-05,
"loss": 0.6484,
"step": 48
},
{
"epoch": 0.266304347826087,
"grad_norm": 2.163538460282388,
"learning_rate": 1.9845231970029774e-05,
"loss": 0.7095,
"step": 49
},
{
"epoch": 0.2717391304347826,
"grad_norm": 1.8775881503442995,
"learning_rate": 1.983477111680712e-05,
"loss": 0.604,
"step": 50
},
{
"epoch": 0.27717391304347827,
"grad_norm": 1.5484748822902972,
"learning_rate": 1.9823971142189126e-05,
"loss": 0.6862,
"step": 51
},
{
"epoch": 0.2826086956521739,
"grad_norm": 1.0946391927116763,
"learning_rate": 1.981283241857922e-05,
"loss": 0.6276,
"step": 52
},
{
"epoch": 0.28804347826086957,
"grad_norm": 1.4879971843628843,
"learning_rate": 1.9801355330061526e-05,
"loss": 0.5763,
"step": 53
},
{
"epoch": 0.29347826086956524,
"grad_norm": 1.8993705185884953,
"learning_rate": 1.978954027238763e-05,
"loss": 0.5908,
"step": 54
},
{
"epoch": 0.29891304347826086,
"grad_norm": 1.6076663483914293,
"learning_rate": 1.9777387652962933e-05,
"loss": 0.5543,
"step": 55
},
{
"epoch": 0.30434782608695654,
"grad_norm": 1.1740894440396383,
"learning_rate": 1.9764897890832597e-05,
"loss": 0.5458,
"step": 56
},
{
"epoch": 0.30978260869565216,
"grad_norm": 1.9838553435397361,
"learning_rate": 1.9752071416667102e-05,
"loss": 0.5046,
"step": 57
},
{
"epoch": 0.31521739130434784,
"grad_norm": 1.0812842728047714,
"learning_rate": 1.973890867274738e-05,
"loss": 0.5609,
"step": 58
},
{
"epoch": 0.32065217391304346,
"grad_norm": 1.723223092822651,
"learning_rate": 1.972541011294959e-05,
"loss": 0.4724,
"step": 59
},
{
"epoch": 0.32608695652173914,
"grad_norm": 1.4887350192643218,
"learning_rate": 1.9711576202729445e-05,
"loss": 0.5168,
"step": 60
},
{
"epoch": 0.33152173913043476,
"grad_norm": 1.533986608527031,
"learning_rate": 1.9697407419106178e-05,
"loss": 0.5374,
"step": 61
},
{
"epoch": 0.33695652173913043,
"grad_norm": 1.283663400004928,
"learning_rate": 1.9682904250646084e-05,
"loss": 0.622,
"step": 62
},
{
"epoch": 0.3423913043478261,
"grad_norm": 1.511070122779534,
"learning_rate": 1.9668067197445662e-05,
"loss": 0.572,
"step": 63
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.843030359662425,
"learning_rate": 1.9652896771114416e-05,
"loss": 0.5449,
"step": 64
},
{
"epoch": 0.3532608695652174,
"grad_norm": 2.2753033401712752,
"learning_rate": 1.9637393494757146e-05,
"loss": 0.6883,
"step": 65
},
{
"epoch": 0.358695652173913,
"grad_norm": 1.1407510209951979,
"learning_rate": 1.962155790295597e-05,
"loss": 0.4357,
"step": 66
},
{
"epoch": 0.3641304347826087,
"grad_norm": 1.351954153650573,
"learning_rate": 1.9605390541751864e-05,
"loss": 0.5109,
"step": 67
},
{
"epoch": 0.3695652173913043,
"grad_norm": 1.2344312626302043,
"learning_rate": 1.9588891968625828e-05,
"loss": 0.5133,
"step": 68
},
{
"epoch": 0.375,
"grad_norm": 3.528171261663953,
"learning_rate": 1.9572062752479684e-05,
"loss": 0.7135,
"step": 69
},
{
"epoch": 0.3804347826086957,
"grad_norm": 1.0283054372439564,
"learning_rate": 1.9554903473616432e-05,
"loss": 0.4934,
"step": 70
},
{
"epoch": 0.3858695652173913,
"grad_norm": 1.2480924815092371,
"learning_rate": 1.953741472372027e-05,
"loss": 0.3846,
"step": 71
},
{
"epoch": 0.391304347826087,
"grad_norm": 1.4701584460006578,
"learning_rate": 1.951959710583616e-05,
"loss": 0.5303,
"step": 72
},
{
"epoch": 0.3967391304347826,
"grad_norm": 2.2396908880712774,
"learning_rate": 1.950145123434907e-05,
"loss": 0.4241,
"step": 73
},
{
"epoch": 0.40217391304347827,
"grad_norm": 1.7904621917947958,
"learning_rate": 1.9482977734962753e-05,
"loss": 0.6144,
"step": 74
},
{
"epoch": 0.4076086956521739,
"grad_norm": 1.650705831140192,
"learning_rate": 1.94641772446782e-05,
"loss": 0.592,
"step": 75
},
{
"epoch": 0.41304347826086957,
"grad_norm": 1.588255971243881,
"learning_rate": 1.9445050411771648e-05,
"loss": 0.5918,
"step": 76
},
{
"epoch": 0.41847826086956524,
"grad_norm": 1.4379861368277966,
"learning_rate": 1.9425597895772257e-05,
"loss": 0.604,
"step": 77
},
{
"epoch": 0.42391304347826086,
"grad_norm": 1.7783069990731366,
"learning_rate": 1.9405820367439343e-05,
"loss": 0.6351,
"step": 78
},
{
"epoch": 0.42934782608695654,
"grad_norm": 1.3451929958729711,
"learning_rate": 1.9385718508739263e-05,
"loss": 0.4487,
"step": 79
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.5631174238633363,
"learning_rate": 1.9365293012821887e-05,
"loss": 0.5412,
"step": 80
},
{
"epoch": 0.44021739130434784,
"grad_norm": 1.7641796531654723,
"learning_rate": 1.934454458399671e-05,
"loss": 0.4606,
"step": 81
},
{
"epoch": 0.44565217391304346,
"grad_norm": 2.007206796904478,
"learning_rate": 1.9323473937708565e-05,
"loss": 0.5409,
"step": 82
},
{
"epoch": 0.45108695652173914,
"grad_norm": 1.6060302211544533,
"learning_rate": 1.9302081800512943e-05,
"loss": 0.5194,
"step": 83
},
{
"epoch": 0.45652173913043476,
"grad_norm": 1.584139057778314,
"learning_rate": 1.9280368910050943e-05,
"loss": 0.4662,
"step": 84
},
{
"epoch": 0.46195652173913043,
"grad_norm": 1.8953323400594193,
"learning_rate": 1.9258336015023847e-05,
"loss": 0.4433,
"step": 85
},
{
"epoch": 0.4673913043478261,
"grad_norm": 1.6067605181621798,
"learning_rate": 1.9235983875167296e-05,
"loss": 0.4255,
"step": 86
},
{
"epoch": 0.47282608695652173,
"grad_norm": 1.4529302278758023,
"learning_rate": 1.9213313261225083e-05,
"loss": 0.4364,
"step": 87
},
{
"epoch": 0.4782608695652174,
"grad_norm": 1.9965642456327142,
"learning_rate": 1.9190324954922594e-05,
"loss": 0.4199,
"step": 88
},
{
"epoch": 0.483695652173913,
"grad_norm": 1.9458245431232768,
"learning_rate": 1.9167019748939847e-05,
"loss": 0.4024,
"step": 89
},
{
"epoch": 0.4891304347826087,
"grad_norm": 2.000159805579825,
"learning_rate": 1.914339844688415e-05,
"loss": 0.4595,
"step": 90
},
{
"epoch": 0.4945652173913043,
"grad_norm": 1.97378975953703,
"learning_rate": 1.91194618632624e-05,
"loss": 0.4917,
"step": 91
},
{
"epoch": 0.5,
"grad_norm": 1.3771983904411074,
"learning_rate": 1.9095210823452997e-05,
"loss": 0.3341,
"step": 92
},
{
"epoch": 0.5054347826086957,
"grad_norm": 1.8123410249166505,
"learning_rate": 1.9070646163677383e-05,
"loss": 0.4285,
"step": 93
},
{
"epoch": 0.5108695652173914,
"grad_norm": 1.7561172390607174,
"learning_rate": 1.9045768730971198e-05,
"loss": 0.3863,
"step": 94
},
{
"epoch": 0.5163043478260869,
"grad_norm": 1.809060828661053,
"learning_rate": 1.9020579383155087e-05,
"loss": 0.3486,
"step": 95
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.541206279317173,
"learning_rate": 1.899507898880512e-05,
"loss": 0.1713,
"step": 96
},
{
"epoch": 0.5271739130434783,
"grad_norm": 2.0502484531232343,
"learning_rate": 1.8969268427222823e-05,
"loss": 0.2059,
"step": 97
},
{
"epoch": 0.532608695652174,
"grad_norm": 1.8524406597388374,
"learning_rate": 1.8943148588404877e-05,
"loss": 0.3856,
"step": 98
},
{
"epoch": 0.5380434782608695,
"grad_norm": 3.385889154621842,
"learning_rate": 1.8916720373012425e-05,
"loss": 0.3027,
"step": 99
},
{
"epoch": 0.5434782608695652,
"grad_norm": 1.2814547066301334,
"learning_rate": 1.8889984692340015e-05,
"loss": 0.1609,
"step": 100
},
{
"epoch": 0.5489130434782609,
"grad_norm": 1.473493575445019,
"learning_rate": 1.8862942468284174e-05,
"loss": 0.1658,
"step": 101
},
{
"epoch": 0.5543478260869565,
"grad_norm": 2.2017906861514125,
"learning_rate": 1.883559463331162e-05,
"loss": 0.2269,
"step": 102
},
{
"epoch": 0.5597826086956522,
"grad_norm": 2.9266092953974345,
"learning_rate": 1.880794213042711e-05,
"loss": 0.2638,
"step": 103
},
{
"epoch": 0.5652173913043478,
"grad_norm": 1.2470192969755443,
"learning_rate": 1.8779985913140927e-05,
"loss": 0.1826,
"step": 104
},
{
"epoch": 0.5706521739130435,
"grad_norm": 1.1329281006012806,
"learning_rate": 1.875172694543599e-05,
"loss": 0.0992,
"step": 105
},
{
"epoch": 0.5760869565217391,
"grad_norm": 1.435458967360399,
"learning_rate": 1.8723166201734626e-05,
"loss": 0.1052,
"step": 106
},
{
"epoch": 0.5815217391304348,
"grad_norm": 2.4406380430615244,
"learning_rate": 1.869430466686497e-05,
"loss": 0.1999,
"step": 107
},
{
"epoch": 0.5869565217391305,
"grad_norm": 1.0271614062096617,
"learning_rate": 1.8665143336027e-05,
"loss": 0.0855,
"step": 108
},
{
"epoch": 0.592391304347826,
"grad_norm": 1.3651592297249626,
"learning_rate": 1.8635683214758213e-05,
"loss": 0.0977,
"step": 109
},
{
"epoch": 0.5978260869565217,
"grad_norm": 0.5945892482638718,
"learning_rate": 1.8605925318898973e-05,
"loss": 0.0337,
"step": 110
},
{
"epoch": 0.6032608695652174,
"grad_norm": 1.194835217639101,
"learning_rate": 1.8575870674557467e-05,
"loss": 0.0722,
"step": 111
},
{
"epoch": 0.6086956521739131,
"grad_norm": 1.762735939201958,
"learning_rate": 1.8545520318074328e-05,
"loss": 0.1228,
"step": 112
},
{
"epoch": 0.6141304347826086,
"grad_norm": 1.017829163872169,
"learning_rate": 1.85148752959869e-05,
"loss": 0.0344,
"step": 113
},
{
"epoch": 0.6195652173913043,
"grad_norm": 1.052690658912748,
"learning_rate": 1.8483936664993152e-05,
"loss": 0.0377,
"step": 114
},
{
"epoch": 0.625,
"grad_norm": 1.7977784022224987,
"learning_rate": 1.8452705491915232e-05,
"loss": 0.141,
"step": 115
},
{
"epoch": 0.6304347826086957,
"grad_norm": 1.8477093237099182,
"learning_rate": 1.8421182853662704e-05,
"loss": 0.0734,
"step": 116
},
{
"epoch": 0.6358695652173914,
"grad_norm": 0.6794730347498438,
"learning_rate": 1.8389369837195387e-05,
"loss": 0.0266,
"step": 117
},
{
"epoch": 0.6413043478260869,
"grad_norm": 0.8818635589659883,
"learning_rate": 1.835726753948589e-05,
"loss": 0.0487,
"step": 118
},
{
"epoch": 0.6467391304347826,
"grad_norm": 1.0608887498751458,
"learning_rate": 1.8324877067481782e-05,
"loss": 0.0275,
"step": 119
},
{
"epoch": 0.6521739130434783,
"grad_norm": 1.3129587931586821,
"learning_rate": 1.829219953806743e-05,
"loss": 0.0642,
"step": 120
},
{
"epoch": 0.657608695652174,
"grad_norm": 1.8948301224723039,
"learning_rate": 1.825923607802547e-05,
"loss": 0.0785,
"step": 121
},
{
"epoch": 0.6630434782608695,
"grad_norm": 0.2518374968408712,
"learning_rate": 1.8225987823997967e-05,
"loss": 0.0111,
"step": 122
},
{
"epoch": 0.6684782608695652,
"grad_norm": 0.25552971144651465,
"learning_rate": 1.8192455922447227e-05,
"loss": 0.0103,
"step": 123
},
{
"epoch": 0.6739130434782609,
"grad_norm": 0.7841302667217214,
"learning_rate": 1.815864152961624e-05,
"loss": 0.0122,
"step": 124
},
{
"epoch": 0.6793478260869565,
"grad_norm": 0.1515291563958561,
"learning_rate": 1.812454581148884e-05,
"loss": 0.0079,
"step": 125
},
{
"epoch": 0.6847826086956522,
"grad_norm": 0.11584834326779594,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.0055,
"step": 126
},
{
"epoch": 0.6902173913043478,
"grad_norm": 0.1740566784478502,
"learning_rate": 1.8055515111742688e-05,
"loss": 0.0069,
"step": 127
},
{
"epoch": 0.6956521739130435,
"grad_norm": 1.5625062014274096,
"learning_rate": 1.8020582510432234e-05,
"loss": 0.0383,
"step": 128
},
{
"epoch": 0.7010869565217391,
"grad_norm": 0.12273159750563628,
"learning_rate": 1.798537334435986e-05,
"loss": 0.0062,
"step": 129
},
{
"epoch": 0.7065217391304348,
"grad_norm": 3.693193027378141,
"learning_rate": 1.7949888827603813e-05,
"loss": 0.1765,
"step": 130
},
{
"epoch": 0.7119565217391305,
"grad_norm": 0.12477337459792677,
"learning_rate": 1.791413018373692e-05,
"loss": 0.0057,
"step": 131
},
{
"epoch": 0.717391304347826,
"grad_norm": 0.8357268279739778,
"learning_rate": 1.7878098645784447e-05,
"loss": 0.0163,
"step": 132
},
{
"epoch": 0.7228260869565217,
"grad_norm": 3.8264656288549985,
"learning_rate": 1.7841795456181556e-05,
"loss": 0.1727,
"step": 133
},
{
"epoch": 0.7282608695652174,
"grad_norm": 0.6387227523871831,
"learning_rate": 1.780522186673046e-05,
"loss": 0.0076,
"step": 134
},
{
"epoch": 0.7336956521739131,
"grad_norm": 0.09079528876022976,
"learning_rate": 1.776837913855728e-05,
"loss": 0.0038,
"step": 135
},
{
"epoch": 0.7391304347826086,
"grad_norm": 1.9001901725953279,
"learning_rate": 1.7731268542068536e-05,
"loss": 0.0208,
"step": 136
},
{
"epoch": 0.7445652173913043,
"grad_norm": 0.21704170005212517,
"learning_rate": 1.7693891356907357e-05,
"loss": 0.007,
"step": 137
},
{
"epoch": 0.75,
"grad_norm": 0.7213653784073487,
"learning_rate": 1.7656248871909346e-05,
"loss": 0.0137,
"step": 138
},
{
"epoch": 0.7554347826086957,
"grad_norm": 0.40110602562720454,
"learning_rate": 1.7618342385058147e-05,
"loss": 0.0099,
"step": 139
},
{
"epoch": 0.7608695652173914,
"grad_norm": 2.026407827233553,
"learning_rate": 1.758017320344068e-05,
"loss": 0.0415,
"step": 140
},
{
"epoch": 0.7663043478260869,
"grad_norm": 1.1169723105563958,
"learning_rate": 1.754174264320208e-05,
"loss": 0.0232,
"step": 141
},
{
"epoch": 0.7717391304347826,
"grad_norm": 0.1746366846193237,
"learning_rate": 1.7503052029500308e-05,
"loss": 0.0052,
"step": 142
},
{
"epoch": 0.7771739130434783,
"grad_norm": 2.3203125623649874,
"learning_rate": 1.7464102696460447e-05,
"loss": 0.2205,
"step": 143
},
{
"epoch": 0.782608695652174,
"grad_norm": 3.9663829407278315,
"learning_rate": 1.7424895987128723e-05,
"loss": 0.223,
"step": 144
},
{
"epoch": 0.7880434782608695,
"grad_norm": 2.9570619026185883,
"learning_rate": 1.738543325342617e-05,
"loss": 0.0697,
"step": 145
},
{
"epoch": 0.7934782608695652,
"grad_norm": 0.07057319843123724,
"learning_rate": 1.7345715856102024e-05,
"loss": 0.0031,
"step": 146
},
{
"epoch": 0.7989130434782609,
"grad_norm": 0.11320521777018241,
"learning_rate": 1.7305745164686816e-05,
"loss": 0.0042,
"step": 147
},
{
"epoch": 0.8043478260869565,
"grad_norm": 1.3124572295306176,
"learning_rate": 1.7265522557445115e-05,
"loss": 0.021,
"step": 148
},
{
"epoch": 0.8097826086956522,
"grad_norm": 0.42701665371399616,
"learning_rate": 1.7225049421328024e-05,
"loss": 0.0091,
"step": 149
},
{
"epoch": 0.8152173913043478,
"grad_norm": 0.6276112813031721,
"learning_rate": 1.7184327151925366e-05,
"loss": 0.0094,
"step": 150
},
{
"epoch": 0.8206521739130435,
"grad_norm": 1.5664524264393311,
"learning_rate": 1.7143357153417533e-05,
"loss": 0.0256,
"step": 151
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.41431375770399115,
"learning_rate": 1.710214083852709e-05,
"loss": 0.0117,
"step": 152
},
{
"epoch": 0.8315217391304348,
"grad_norm": 0.3493269925986,
"learning_rate": 1.7060679628470054e-05,
"loss": 0.0084,
"step": 153
},
{
"epoch": 0.8369565217391305,
"grad_norm": 0.3211404898250956,
"learning_rate": 1.7018974952906885e-05,
"loss": 0.0084,
"step": 154
},
{
"epoch": 0.842391304347826,
"grad_norm": 0.21231254558257762,
"learning_rate": 1.697702824989319e-05,
"loss": 0.0065,
"step": 155
},
{
"epoch": 0.8478260869565217,
"grad_norm": 1.457137599474762,
"learning_rate": 1.693484096583014e-05,
"loss": 0.0226,
"step": 156
},
{
"epoch": 0.8532608695652174,
"grad_norm": 0.19497147073015395,
"learning_rate": 1.6892414555414594e-05,
"loss": 0.0048,
"step": 157
},
{
"epoch": 0.8586956521739131,
"grad_norm": 1.8062131040571878,
"learning_rate": 1.6849750481588936e-05,
"loss": 0.0277,
"step": 158
},
{
"epoch": 0.8641304347826086,
"grad_norm": 1.3188356922598312,
"learning_rate": 1.680685021549063e-05,
"loss": 0.0207,
"step": 159
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.26492812790936593,
"learning_rate": 1.6763715236401493e-05,
"loss": 0.0059,
"step": 160
},
{
"epoch": 0.875,
"grad_norm": 0.3017199408994534,
"learning_rate": 1.672034703169669e-05,
"loss": 0.0076,
"step": 161
},
{
"epoch": 0.8804347826086957,
"grad_norm": 0.1252817764595737,
"learning_rate": 1.667674709679344e-05,
"loss": 0.0041,
"step": 162
},
{
"epoch": 0.8858695652173914,
"grad_norm": 1.1529370223873083,
"learning_rate": 1.663291693509946e-05,
"loss": 0.019,
"step": 163
},
{
"epoch": 0.8913043478260869,
"grad_norm": 0.12063163996672908,
"learning_rate": 1.658885805796111e-05,
"loss": 0.0031,
"step": 164
},
{
"epoch": 0.8967391304347826,
"grad_norm": 0.11125376158368971,
"learning_rate": 1.6544571984611306e-05,
"loss": 0.0034,
"step": 165
},
{
"epoch": 0.9021739130434783,
"grad_norm": 0.19945453640512878,
"learning_rate": 1.6500060242117096e-05,
"loss": 0.0051,
"step": 166
},
{
"epoch": 0.907608695652174,
"grad_norm": 0.07254620014242376,
"learning_rate": 1.6455324365327035e-05,
"loss": 0.0026,
"step": 167
},
{
"epoch": 0.9130434782608695,
"grad_norm": 1.3895686723936829,
"learning_rate": 1.6410365896818253e-05,
"loss": 0.0234,
"step": 168
},
{
"epoch": 0.9184782608695652,
"grad_norm": 0.7517916115731629,
"learning_rate": 1.636518638684325e-05,
"loss": 0.0057,
"step": 169
},
{
"epoch": 0.9239130434782609,
"grad_norm": 0.11708397875230993,
"learning_rate": 1.6319787393276463e-05,
"loss": 0.0036,
"step": 170
},
{
"epoch": 0.9293478260869565,
"grad_norm": 0.027987175186703777,
"learning_rate": 1.6274170481560527e-05,
"loss": 0.0015,
"step": 171
},
{
"epoch": 0.9347826086956522,
"grad_norm": 0.17986790848065237,
"learning_rate": 1.6228337224652307e-05,
"loss": 0.0059,
"step": 172
},
{
"epoch": 0.9402173913043478,
"grad_norm": 0.03867873116439446,
"learning_rate": 1.6182289202968663e-05,
"loss": 0.0017,
"step": 173
},
{
"epoch": 0.9456521739130435,
"grad_norm": 0.057278523890185604,
"learning_rate": 1.613602800433194e-05,
"loss": 0.0024,
"step": 174
},
{
"epoch": 0.9510869565217391,
"grad_norm": 2.728399164781685,
"learning_rate": 1.6089555223915226e-05,
"loss": 0.1588,
"step": 175
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.3768997196852311,
"learning_rate": 1.6042872464187352e-05,
"loss": 0.0054,
"step": 176
},
{
"epoch": 0.9619565217391305,
"grad_norm": 4.011589996542784,
"learning_rate": 1.5995981334857625e-05,
"loss": 0.0702,
"step": 177
},
{
"epoch": 0.967391304347826,
"grad_norm": 0.49004409324214177,
"learning_rate": 1.5948883452820326e-05,
"loss": 0.01,
"step": 178
},
{
"epoch": 0.9728260869565217,
"grad_norm": 0.048813631073329034,
"learning_rate": 1.590158044209897e-05,
"loss": 0.002,
"step": 179
},
{
"epoch": 0.9782608695652174,
"grad_norm": 0.09547901003362863,
"learning_rate": 1.5854073933790277e-05,
"loss": 0.0024,
"step": 180
},
{
"epoch": 0.9836956521739131,
"grad_norm": 2.3086350812363565,
"learning_rate": 1.580636556600796e-05,
"loss": 0.0277,
"step": 181
},
{
"epoch": 0.9891304347826086,
"grad_norm": 2.752485470216331,
"learning_rate": 1.575845698382622e-05,
"loss": 0.0671,
"step": 182
},
{
"epoch": 0.9945652173913043,
"grad_norm": 0.08760080184190135,
"learning_rate": 1.5710349839223034e-05,
"loss": 0.0025,
"step": 183
},
{
"epoch": 1.0,
"grad_norm": 0.052319179757302624,
"learning_rate": 1.566204579102317e-05,
"loss": 0.0016,
"step": 184
},
{
"epoch": 1.0054347826086956,
"grad_norm": 0.20188982483949725,
"learning_rate": 1.561354650484102e-05,
"loss": 0.0054,
"step": 185
},
{
"epoch": 1.0108695652173914,
"grad_norm": 1.214861582615001,
"learning_rate": 1.556485365302313e-05,
"loss": 0.0095,
"step": 186
},
{
"epoch": 1.016304347826087,
"grad_norm": 1.1857810014141275,
"learning_rate": 1.5515968914590568e-05,
"loss": 0.0161,
"step": 187
},
{
"epoch": 1.0217391304347827,
"grad_norm": 0.19290187635263223,
"learning_rate": 1.546689397518101e-05,
"loss": 0.004,
"step": 188
},
{
"epoch": 1.0271739130434783,
"grad_norm": 0.22326269659684472,
"learning_rate": 1.5417630526990613e-05,
"loss": 0.0044,
"step": 189
},
{
"epoch": 1.0326086956521738,
"grad_norm": 0.0690691126927046,
"learning_rate": 1.5368180268715678e-05,
"loss": 0.0022,
"step": 190
},
{
"epoch": 1.0380434782608696,
"grad_norm": 0.519784946142706,
"learning_rate": 1.5318544905494063e-05,
"loss": 0.0075,
"step": 191
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.1210215491547705,
"learning_rate": 1.52687261488464e-05,
"loss": 0.0032,
"step": 192
},
{
"epoch": 1.048913043478261,
"grad_norm": 0.1128182153705411,
"learning_rate": 1.5218725716617062e-05,
"loss": 0.0031,
"step": 193
},
{
"epoch": 1.0543478260869565,
"grad_norm": 0.0917279431010188,
"learning_rate": 1.5168545332914942e-05,
"loss": 0.0032,
"step": 194
},
{
"epoch": 1.059782608695652,
"grad_norm": 0.1599750281188914,
"learning_rate": 1.5118186728054002e-05,
"loss": 0.0034,
"step": 195
},
{
"epoch": 1.065217391304348,
"grad_norm": 3.0052317701428906,
"learning_rate": 1.50676516384936e-05,
"loss": 0.2052,
"step": 196
},
{
"epoch": 1.0706521739130435,
"grad_norm": 0.09347487309598097,
"learning_rate": 1.5016941806778622e-05,
"loss": 0.0024,
"step": 197
},
{
"epoch": 1.0760869565217392,
"grad_norm": 0.6368154943577347,
"learning_rate": 1.496605898147938e-05,
"loss": 0.0112,
"step": 198
},
{
"epoch": 1.0815217391304348,
"grad_norm": 0.08805765943523453,
"learning_rate": 1.4915004917131345e-05,
"loss": 0.0025,
"step": 199
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.05469514003374087,
"learning_rate": 1.4863781374174625e-05,
"loss": 0.002,
"step": 200
},
{
"epoch": 1.0923913043478262,
"grad_norm": 0.10652940546536208,
"learning_rate": 1.4812390118893273e-05,
"loss": 0.0032,
"step": 201
},
{
"epoch": 1.0978260869565217,
"grad_norm": 4.207882558276106,
"learning_rate": 1.4760832923354375e-05,
"loss": 0.0583,
"step": 202
},
{
"epoch": 1.1032608695652173,
"grad_norm": 0.0699647885839302,
"learning_rate": 1.4709111565346948e-05,
"loss": 0.0026,
"step": 203
},
{
"epoch": 1.108695652173913,
"grad_norm": 0.30166623168218903,
"learning_rate": 1.4657227828320637e-05,
"loss": 0.006,
"step": 204
},
{
"epoch": 1.1141304347826086,
"grad_norm": 4.199370993333585,
"learning_rate": 1.4605183501324231e-05,
"loss": 0.0775,
"step": 205
},
{
"epoch": 1.1195652173913044,
"grad_norm": 0.32565218496952747,
"learning_rate": 1.4552980378943953e-05,
"loss": 0.0033,
"step": 206
},
{
"epoch": 1.125,
"grad_norm": 0.0809703234967001,
"learning_rate": 1.4500620261241598e-05,
"loss": 0.0026,
"step": 207
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.06883017026031267,
"learning_rate": 1.4448104953692443e-05,
"loss": 0.0019,
"step": 208
},
{
"epoch": 1.1358695652173914,
"grad_norm": 0.08112137716749798,
"learning_rate": 1.4395436267123017e-05,
"loss": 0.0025,
"step": 209
},
{
"epoch": 1.141304347826087,
"grad_norm": 0.0472362130550949,
"learning_rate": 1.4342616017648632e-05,
"loss": 0.0018,
"step": 210
},
{
"epoch": 1.1467391304347827,
"grad_norm": 0.0884620238410297,
"learning_rate": 1.4289646026610789e-05,
"loss": 0.0021,
"step": 211
},
{
"epoch": 1.1521739130434783,
"grad_norm": 0.04795365977948435,
"learning_rate": 1.423652812051434e-05,
"loss": 0.0017,
"step": 212
},
{
"epoch": 1.1576086956521738,
"grad_norm": 0.02935797027689571,
"learning_rate": 1.4183264130964545e-05,
"loss": 0.0015,
"step": 213
},
{
"epoch": 1.1630434782608696,
"grad_norm": 0.0668820523334726,
"learning_rate": 1.4129855894603885e-05,
"loss": 0.0027,
"step": 214
},
{
"epoch": 1.1684782608695652,
"grad_norm": 0.7758685627388171,
"learning_rate": 1.4076305253048748e-05,
"loss": 0.0105,
"step": 215
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.7009141120346845,
"learning_rate": 1.4022614052825918e-05,
"loss": 0.01,
"step": 216
},
{
"epoch": 1.1793478260869565,
"grad_norm": 0.058294067779879076,
"learning_rate": 1.3968784145308907e-05,
"loss": 0.002,
"step": 217
},
{
"epoch": 1.184782608695652,
"grad_norm": 0.09580668260043325,
"learning_rate": 1.3914817386654112e-05,
"loss": 0.0028,
"step": 218
},
{
"epoch": 1.190217391304348,
"grad_norm": 4.864872194559485,
"learning_rate": 1.3860715637736817e-05,
"loss": 0.1252,
"step": 219
},
{
"epoch": 1.1956521739130435,
"grad_norm": 0.15310828918627564,
"learning_rate": 1.3806480764087027e-05,
"loss": 0.003,
"step": 220
},
{
"epoch": 1.2010869565217392,
"grad_norm": 0.3265801320494785,
"learning_rate": 1.3752114635825138e-05,
"loss": 0.005,
"step": 221
},
{
"epoch": 1.2065217391304348,
"grad_norm": 4.409339908706341,
"learning_rate": 1.369761912759744e-05,
"loss": 0.1368,
"step": 222
},
{
"epoch": 1.2119565217391304,
"grad_norm": 0.09658224964632216,
"learning_rate": 1.3642996118511504e-05,
"loss": 0.0027,
"step": 223
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.13386998342251066,
"learning_rate": 1.358824749207136e-05,
"loss": 0.0029,
"step": 224
},
{
"epoch": 1.2228260869565217,
"grad_norm": 0.058694075695156535,
"learning_rate": 1.3533375136112563e-05,
"loss": 0.0019,
"step": 225
},
{
"epoch": 1.2282608695652173,
"grad_norm": 0.1675736492580823,
"learning_rate": 1.3478380942737097e-05,
"loss": 0.0041,
"step": 226
},
{
"epoch": 1.233695652173913,
"grad_norm": 0.6605378406587118,
"learning_rate": 1.3423266808248123e-05,
"loss": 0.0064,
"step": 227
},
{
"epoch": 1.2391304347826086,
"grad_norm": 0.07582983219640445,
"learning_rate": 1.3368034633084603e-05,
"loss": 0.0021,
"step": 228
},
{
"epoch": 1.2445652173913044,
"grad_norm": 0.11839256459523798,
"learning_rate": 1.331268632175576e-05,
"loss": 0.0033,
"step": 229
},
{
"epoch": 1.25,
"grad_norm": 0.498989993420891,
"learning_rate": 1.3257223782775412e-05,
"loss": 0.0058,
"step": 230
},
{
"epoch": 1.2554347826086958,
"grad_norm": 0.0627689672183379,
"learning_rate": 1.3201648928596164e-05,
"loss": 0.0028,
"step": 231
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.44003082591712833,
"learning_rate": 1.3145963675543451e-05,
"loss": 0.0056,
"step": 232
},
{
"epoch": 1.266304347826087,
"grad_norm": 3.9655617256713556,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.0738,
"step": 233
},
{
"epoch": 1.2717391304347827,
"grad_norm": 0.1490491896911272,
"learning_rate": 1.3034269657086993e-05,
"loss": 0.003,
"step": 234
},
{
"epoch": 1.2771739130434783,
"grad_norm": 0.255678738387853,
"learning_rate": 1.2978264743102964e-05,
"loss": 0.0036,
"step": 235
},
{
"epoch": 1.2826086956521738,
"grad_norm": 0.08658556472142168,
"learning_rate": 1.2922157132952106e-05,
"loss": 0.003,
"step": 236
},
{
"epoch": 1.2880434782608696,
"grad_norm": 0.056388528409829865,
"learning_rate": 1.286594876133028e-05,
"loss": 0.0016,
"step": 237
},
{
"epoch": 1.2934782608695652,
"grad_norm": 1.5398049755885386,
"learning_rate": 1.2809641566407802e-05,
"loss": 0.0378,
"step": 238
},
{
"epoch": 1.2989130434782608,
"grad_norm": 0.036566689298081184,
"learning_rate": 1.27532374897626e-05,
"loss": 0.0012,
"step": 239
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.04920293791313143,
"learning_rate": 1.2696738476313261e-05,
"loss": 0.0017,
"step": 240
},
{
"epoch": 1.309782608695652,
"grad_norm": 0.1402817359911882,
"learning_rate": 1.2640146474251979e-05,
"loss": 0.0036,
"step": 241
},
{
"epoch": 1.315217391304348,
"grad_norm": 0.06831135225959813,
"learning_rate": 1.258346343497736e-05,
"loss": 0.0025,
"step": 242
},
{
"epoch": 1.3206521739130435,
"grad_norm": 0.028285907167631727,
"learning_rate": 1.2526691313027153e-05,
"loss": 0.001,
"step": 243
},
{
"epoch": 1.3260869565217392,
"grad_norm": 0.33707980146121225,
"learning_rate": 1.2469832066010843e-05,
"loss": 0.0074,
"step": 244
},
{
"epoch": 1.3315217391304348,
"grad_norm": 0.02312342530538864,
"learning_rate": 1.2412887654542147e-05,
"loss": 0.001,
"step": 245
},
{
"epoch": 1.3369565217391304,
"grad_norm": 0.026427047059385186,
"learning_rate": 1.2355860042171421e-05,
"loss": 0.0011,
"step": 246
},
{
"epoch": 1.3423913043478262,
"grad_norm": 2.9263468296261164,
"learning_rate": 1.2298751195317935e-05,
"loss": 0.1557,
"step": 247
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.020548021429656328,
"learning_rate": 1.224156308320208e-05,
"loss": 0.0009,
"step": 248
},
{
"epoch": 1.3532608695652173,
"grad_norm": 0.025684644607937637,
"learning_rate": 1.2184297677777463e-05,
"loss": 0.0011,
"step": 249
},
{
"epoch": 1.358695652173913,
"grad_norm": 0.4277199026740869,
"learning_rate": 1.2126956953662914e-05,
"loss": 0.0074,
"step": 250
},
{
"epoch": 1.3641304347826086,
"grad_norm": 0.722362923284817,
"learning_rate": 1.2069542888074386e-05,
"loss": 0.0094,
"step": 251
},
{
"epoch": 1.3695652173913042,
"grad_norm": 0.05042192018129352,
"learning_rate": 1.2012057460756786e-05,
"loss": 0.0016,
"step": 252
},
{
"epoch": 1.375,
"grad_norm": 0.04160962471056512,
"learning_rate": 1.1954502653915704e-05,
"loss": 0.0014,
"step": 253
},
{
"epoch": 1.3804347826086958,
"grad_norm": 0.04523201782339563,
"learning_rate": 1.1896880452149077e-05,
"loss": 0.0016,
"step": 254
},
{
"epoch": 1.3858695652173914,
"grad_norm": 0.023639170674016628,
"learning_rate": 1.1839192842378737e-05,
"loss": 0.0009,
"step": 255
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.04866250108659108,
"learning_rate": 1.1781441813781911e-05,
"loss": 0.0014,
"step": 256
},
{
"epoch": 1.3967391304347827,
"grad_norm": 0.027392748713626538,
"learning_rate": 1.1723629357722622e-05,
"loss": 0.001,
"step": 257
},
{
"epoch": 1.4021739130434783,
"grad_norm": 0.04956045333392312,
"learning_rate": 1.1665757467683025e-05,
"loss": 0.0013,
"step": 258
},
{
"epoch": 1.4076086956521738,
"grad_norm": 0.287445593085176,
"learning_rate": 1.1607828139194683e-05,
"loss": 0.0051,
"step": 259
},
{
"epoch": 1.4130434782608696,
"grad_norm": 0.13531127988753577,
"learning_rate": 1.1549843369769733e-05,
"loss": 0.0023,
"step": 260
},
{
"epoch": 1.4184782608695652,
"grad_norm": 0.16453092649100554,
"learning_rate": 1.1491805158832028e-05,
"loss": 0.0031,
"step": 261
},
{
"epoch": 1.4239130434782608,
"grad_norm": 1.4301870845043336,
"learning_rate": 1.1433715507648173e-05,
"loss": 0.0166,
"step": 262
},
{
"epoch": 1.4293478260869565,
"grad_norm": 0.06079450292325032,
"learning_rate": 1.1375576419258543e-05,
"loss": 0.0016,
"step": 263
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.12935761070271598,
"learning_rate": 1.1317389898408188e-05,
"loss": 0.0022,
"step": 264
},
{
"epoch": 1.440217391304348,
"grad_norm": 0.06441466437879496,
"learning_rate": 1.125915795147773e-05,
"loss": 0.0017,
"step": 265
},
{
"epoch": 1.4456521739130435,
"grad_norm": 0.11938010559111087,
"learning_rate": 1.1200882586414168e-05,
"loss": 0.0021,
"step": 266
},
{
"epoch": 1.4510869565217392,
"grad_norm": 0.14576252527987352,
"learning_rate": 1.114256581266162e-05,
"loss": 0.0032,
"step": 267
},
{
"epoch": 1.4565217391304348,
"grad_norm": 0.8091624068148694,
"learning_rate": 1.1084209641092083e-05,
"loss": 0.0098,
"step": 268
},
{
"epoch": 1.4619565217391304,
"grad_norm": 0.07301592812987565,
"learning_rate": 1.1025816083936036e-05,
"loss": 0.0021,
"step": 269
},
{
"epoch": 1.4673913043478262,
"grad_norm": 0.019465384139083376,
"learning_rate": 1.0967387154713104e-05,
"loss": 0.0008,
"step": 270
},
{
"epoch": 1.4728260869565217,
"grad_norm": 0.02684807806576838,
"learning_rate": 1.0908924868162605e-05,
"loss": 0.0009,
"step": 271
},
{
"epoch": 1.4782608695652173,
"grad_norm": 2.0536809709774086,
"learning_rate": 1.0850431240174066e-05,
"loss": 0.2241,
"step": 272
},
{
"epoch": 1.483695652173913,
"grad_norm": 0.5395466577497267,
"learning_rate": 1.0791908287717744e-05,
"loss": 0.0097,
"step": 273
},
{
"epoch": 1.4891304347826086,
"grad_norm": 3.6218348045652107,
"learning_rate": 1.073335802877504e-05,
"loss": 0.0488,
"step": 274
},
{
"epoch": 1.4945652173913042,
"grad_norm": 0.0346000232826567,
"learning_rate": 1.0674782482268953e-05,
"loss": 0.0013,
"step": 275
},
{
"epoch": 1.5,
"grad_norm": 0.031039844572176237,
"learning_rate": 1.0616183667994435e-05,
"loss": 0.0011,
"step": 276
},
{
"epoch": 1.5054347826086958,
"grad_norm": 1.3869410436009917,
"learning_rate": 1.0557563606548751e-05,
"loss": 0.02,
"step": 277
},
{
"epoch": 1.5108695652173914,
"grad_norm": 0.31857812561228843,
"learning_rate": 1.0498924319261816e-05,
"loss": 0.0046,
"step": 278
},
{
"epoch": 1.516304347826087,
"grad_norm": 0.018901071551922013,
"learning_rate": 1.0440267828126478e-05,
"loss": 0.0007,
"step": 279
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.35747451319055523,
"learning_rate": 1.0381596155728823e-05,
"loss": 0.0077,
"step": 280
},
{
"epoch": 1.5271739130434783,
"grad_norm": 0.038504499041816166,
"learning_rate": 1.0322911325178402e-05,
"loss": 0.0012,
"step": 281
},
{
"epoch": 1.5326086956521738,
"grad_norm": 0.061533456725221265,
"learning_rate": 1.0264215360038483e-05,
"loss": 0.0018,
"step": 282
},
{
"epoch": 1.5380434782608696,
"grad_norm": 0.053405604412389306,
"learning_rate": 1.0205510284256286e-05,
"loss": 0.0014,
"step": 283
},
{
"epoch": 1.5434782608695652,
"grad_norm": 0.1699993644991474,
"learning_rate": 1.0146798122093167e-05,
"loss": 0.0029,
"step": 284
},
{
"epoch": 1.5489130434782608,
"grad_norm": 0.07043260478387495,
"learning_rate": 1.0088080898054852e-05,
"loss": 0.0013,
"step": 285
},
{
"epoch": 1.5543478260869565,
"grad_norm": 0.050883436804006456,
"learning_rate": 1.00293606368216e-05,
"loss": 0.0018,
"step": 286
},
{
"epoch": 1.5597826086956523,
"grad_norm": 0.2015858838482068,
"learning_rate": 9.970639363178401e-06,
"loss": 0.0034,
"step": 287
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.15696624949542315,
"learning_rate": 9.91191910194515e-06,
"loss": 0.0024,
"step": 288
},
{
"epoch": 1.5706521739130435,
"grad_norm": 0.016094697472839387,
"learning_rate": 9.853201877906836e-06,
"loss": 0.0007,
"step": 289
},
{
"epoch": 1.5760869565217392,
"grad_norm": 2.6447259699825225,
"learning_rate": 9.79448971574372e-06,
"loss": 0.0868,
"step": 290
},
{
"epoch": 1.5815217391304348,
"grad_norm": 0.034146999181789345,
"learning_rate": 9.73578463996152e-06,
"loss": 0.001,
"step": 291
},
{
"epoch": 1.5869565217391304,
"grad_norm": 2.3913058327100507,
"learning_rate": 9.677088674821601e-06,
"loss": 0.0933,
"step": 292
},
{
"epoch": 1.5923913043478262,
"grad_norm": 2.7206555164113113,
"learning_rate": 9.618403844271179e-06,
"loss": 0.0834,
"step": 293
},
{
"epoch": 1.5978260869565217,
"grad_norm": 2.04432325341852,
"learning_rate": 9.559732171873524e-06,
"loss": 0.0509,
"step": 294
},
{
"epoch": 1.6032608695652173,
"grad_norm": 3.408481044696874,
"learning_rate": 9.50107568073819e-06,
"loss": 0.1523,
"step": 295
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.15857623535162915,
"learning_rate": 9.442436393451252e-06,
"loss": 0.0037,
"step": 296
},
{
"epoch": 1.6141304347826086,
"grad_norm": 0.48149742863897177,
"learning_rate": 9.383816332005569e-06,
"loss": 0.0066,
"step": 297
},
{
"epoch": 1.6195652173913042,
"grad_norm": 0.43146507945514945,
"learning_rate": 9.325217517731047e-06,
"loss": 0.0063,
"step": 298
},
{
"epoch": 1.625,
"grad_norm": 3.7183270965419526,
"learning_rate": 9.266641971224963e-06,
"loss": 0.0717,
"step": 299
},
{
"epoch": 1.6304347826086958,
"grad_norm": 0.6284145395909966,
"learning_rate": 9.208091712282261e-06,
"loss": 0.0113,
"step": 300
},
{
"epoch": 1.6358695652173914,
"grad_norm": 0.12204274733613643,
"learning_rate": 9.149568759825937e-06,
"loss": 0.003,
"step": 301
},
{
"epoch": 1.641304347826087,
"grad_norm": 1.1716856729713159,
"learning_rate": 9.091075131837399e-06,
"loss": 0.016,
"step": 302
},
{
"epoch": 1.6467391304347827,
"grad_norm": 2.3073801254975743,
"learning_rate": 9.032612845286896e-06,
"loss": 0.0625,
"step": 303
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.24584369141616186,
"learning_rate": 8.974183916063967e-06,
"loss": 0.0038,
"step": 304
},
{
"epoch": 1.6576086956521738,
"grad_norm": 0.896272637025756,
"learning_rate": 8.915790358907924e-06,
"loss": 0.0124,
"step": 305
},
{
"epoch": 1.6630434782608696,
"grad_norm": 3.8696382415332957,
"learning_rate": 8.857434187338381e-06,
"loss": 0.0462,
"step": 306
},
{
"epoch": 1.6684782608695652,
"grad_norm": 0.12503032249914797,
"learning_rate": 8.799117413585836e-06,
"loss": 0.0025,
"step": 307
},
{
"epoch": 1.6739130434782608,
"grad_norm": 0.45154839467695335,
"learning_rate": 8.740842048522268e-06,
"loss": 0.0061,
"step": 308
},
{
"epoch": 1.6793478260869565,
"grad_norm": 0.09419278918622512,
"learning_rate": 8.682610101591813e-06,
"loss": 0.002,
"step": 309
},
{
"epoch": 1.6847826086956523,
"grad_norm": 0.4958479599321362,
"learning_rate": 8.624423580741462e-06,
"loss": 0.0086,
"step": 310
},
{
"epoch": 1.6902173913043477,
"grad_norm": 0.11770008527271246,
"learning_rate": 8.56628449235183e-06,
"loss": 0.0025,
"step": 311
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.369565128723298,
"learning_rate": 8.508194841167975e-06,
"loss": 0.0059,
"step": 312
},
{
"epoch": 1.7010869565217392,
"grad_norm": 0.06235754588692365,
"learning_rate": 8.450156630230267e-06,
"loss": 0.0019,
"step": 313
},
{
"epoch": 1.7065217391304348,
"grad_norm": 0.02787223131850643,
"learning_rate": 8.39217186080532e-06,
"loss": 0.0012,
"step": 314
},
{
"epoch": 1.7119565217391304,
"grad_norm": 0.03719997929743275,
"learning_rate": 8.334242532316977e-06,
"loss": 0.0012,
"step": 315
},
{
"epoch": 1.7173913043478262,
"grad_norm": 0.42795195182267215,
"learning_rate": 8.276370642277383e-06,
"loss": 0.0048,
"step": 316
},
{
"epoch": 1.7228260869565217,
"grad_norm": 0.9372903840892463,
"learning_rate": 8.21855818621809e-06,
"loss": 0.0203,
"step": 317
},
{
"epoch": 1.7282608695652173,
"grad_norm": 0.13870817101483046,
"learning_rate": 8.160807157621262e-06,
"loss": 0.0025,
"step": 318
},
{
"epoch": 1.733695652173913,
"grad_norm": 0.2445880882562458,
"learning_rate": 8.103119547850924e-06,
"loss": 0.0037,
"step": 319
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.06926518467785787,
"learning_rate": 8.045497346084297e-06,
"loss": 0.002,
"step": 320
},
{
"epoch": 1.7445652173913042,
"grad_norm": 0.029704630377944685,
"learning_rate": 7.98794253924322e-06,
"loss": 0.0011,
"step": 321
},
{
"epoch": 1.75,
"grad_norm": 0.02657434909385738,
"learning_rate": 7.930457111925616e-06,
"loss": 0.0012,
"step": 322
},
{
"epoch": 1.7554347826086958,
"grad_norm": 0.087118861417369,
"learning_rate": 7.873043046337086e-06,
"loss": 0.002,
"step": 323
},
{
"epoch": 1.7608695652173914,
"grad_norm": 0.029028883768708425,
"learning_rate": 7.815702322222539e-06,
"loss": 0.0009,
"step": 324
},
{
"epoch": 1.766304347826087,
"grad_norm": 0.574091822654542,
"learning_rate": 7.758436916797923e-06,
"loss": 0.0092,
"step": 325
},
{
"epoch": 1.7717391304347827,
"grad_norm": 0.043721730276414336,
"learning_rate": 7.701248804682069e-06,
"loss": 0.0014,
"step": 326
},
{
"epoch": 1.7771739130434783,
"grad_norm": 2.4824141009923726,
"learning_rate": 7.64413995782858e-06,
"loss": 0.1501,
"step": 327
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.3656857182755404,
"learning_rate": 7.5871123454578534e-06,
"loss": 0.0055,
"step": 328
},
{
"epoch": 1.7880434782608696,
"grad_norm": 0.030565125424490584,
"learning_rate": 7.530167933989161e-06,
"loss": 0.001,
"step": 329
},
{
"epoch": 1.7934782608695652,
"grad_norm": 0.6771809217496879,
"learning_rate": 7.47330868697285e-06,
"loss": 0.01,
"step": 330
},
{
"epoch": 1.7989130434782608,
"grad_norm": 0.24573870561094346,
"learning_rate": 7.4165365650226425e-06,
"loss": 0.0049,
"step": 331
},
{
"epoch": 1.8043478260869565,
"grad_norm": 0.8696535124002203,
"learning_rate": 7.3598535257480244e-06,
"loss": 0.0126,
"step": 332
},
{
"epoch": 1.8097826086956523,
"grad_norm": 0.02189894312561321,
"learning_rate": 7.30326152368674e-06,
"loss": 0.0008,
"step": 333
},
{
"epoch": 1.8152173913043477,
"grad_norm": 0.031609375803459974,
"learning_rate": 7.246762510237404e-06,
"loss": 0.0011,
"step": 334
},
{
"epoch": 1.8206521739130435,
"grad_norm": 0.020342266321765227,
"learning_rate": 7.1903584335922e-06,
"loss": 0.0008,
"step": 335
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.09248271114619741,
"learning_rate": 7.134051238669722e-06,
"loss": 0.0018,
"step": 336
},
{
"epoch": 1.8315217391304348,
"grad_norm": 0.10061723518020388,
"learning_rate": 7.077842867047897e-06,
"loss": 0.0024,
"step": 337
},
{
"epoch": 1.8369565217391304,
"grad_norm": 0.21992324150498122,
"learning_rate": 7.021735256897035e-06,
"loss": 0.0027,
"step": 338
},
{
"epoch": 1.8423913043478262,
"grad_norm": 0.030816726743244916,
"learning_rate": 6.965730342913011e-06,
"loss": 0.0011,
"step": 339
},
{
"epoch": 1.8478260869565217,
"grad_norm": 0.01683095603625154,
"learning_rate": 6.909830056250527e-06,
"loss": 0.0008,
"step": 340
},
{
"epoch": 1.8532608695652173,
"grad_norm": 0.23379778261250125,
"learning_rate": 6.8540363244565524e-06,
"loss": 0.0043,
"step": 341
},
{
"epoch": 1.858695652173913,
"grad_norm": 0.03675133534148478,
"learning_rate": 6.798351071403839e-06,
"loss": 0.001,
"step": 342
},
{
"epoch": 1.8641304347826086,
"grad_norm": 0.1140408877999425,
"learning_rate": 6.742776217224587e-06,
"loss": 0.0027,
"step": 343
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.02850900102579577,
"learning_rate": 6.687313678244243e-06,
"loss": 0.0009,
"step": 344
},
{
"epoch": 1.875,
"grad_norm": 0.02532716939465366,
"learning_rate": 6.6319653669154e-06,
"loss": 0.001,
"step": 345
},
{
"epoch": 1.8804347826086958,
"grad_norm": 0.10582087034471738,
"learning_rate": 6.576733191751879e-06,
"loss": 0.0029,
"step": 346
},
{
"epoch": 1.8858695652173914,
"grad_norm": 2.4137374896779877,
"learning_rate": 6.521619057262904e-06,
"loss": 0.1004,
"step": 347
},
{
"epoch": 1.891304347826087,
"grad_norm": 2.0298394937535122,
"learning_rate": 6.466624863887437e-06,
"loss": 0.0361,
"step": 348
},
{
"epoch": 1.8967391304347827,
"grad_norm": 0.15424873092333466,
"learning_rate": 6.411752507928643e-06,
"loss": 0.0031,
"step": 349
},
{
"epoch": 1.9021739130434783,
"grad_norm": 0.7343430535593085,
"learning_rate": 6.357003881488499e-06,
"loss": 0.0086,
"step": 350
},
{
"epoch": 1.9076086956521738,
"grad_norm": 0.0169679254906056,
"learning_rate": 6.302380872402562e-06,
"loss": 0.0007,
"step": 351
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.026108663412252976,
"learning_rate": 6.247885364174866e-06,
"loss": 0.001,
"step": 352
},
{
"epoch": 1.9184782608695652,
"grad_norm": 0.022679414032134804,
"learning_rate": 6.193519235912972e-06,
"loss": 0.0008,
"step": 353
},
{
"epoch": 1.9239130434782608,
"grad_norm": 0.02365404382322627,
"learning_rate": 6.139284362263185e-06,
"loss": 0.0008,
"step": 354
},
{
"epoch": 1.9293478260869565,
"grad_norm": 0.014446988115359962,
"learning_rate": 6.085182613345893e-06,
"loss": 0.0006,
"step": 355
},
{
"epoch": 1.9347826086956523,
"grad_norm": 0.016091425374232204,
"learning_rate": 6.031215854691097e-06,
"loss": 0.0007,
"step": 356
},
{
"epoch": 1.9402173913043477,
"grad_norm": 0.01553827774955186,
"learning_rate": 5.977385947174084e-06,
"loss": 0.0007,
"step": 357
},
{
"epoch": 1.9456521739130435,
"grad_norm": 0.17966133137766196,
"learning_rate": 5.923694746951253e-06,
"loss": 0.0028,
"step": 358
},
{
"epoch": 1.9510869565217392,
"grad_norm": 0.02477310360295687,
"learning_rate": 5.8701441053961185e-06,
"loss": 0.0009,
"step": 359
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.025478377542260965,
"learning_rate": 5.816735869035458e-06,
"loss": 0.0009,
"step": 360
},
{
"epoch": 1.9619565217391304,
"grad_norm": 0.01385737253155479,
"learning_rate": 5.7634718794856626e-06,
"loss": 0.0006,
"step": 361
},
{
"epoch": 1.9673913043478262,
"grad_norm": 0.2920694264321747,
"learning_rate": 5.710353973389215e-06,
"loss": 0.003,
"step": 362
},
{
"epoch": 1.9728260869565217,
"grad_norm": 0.0609584809389905,
"learning_rate": 5.657383982351368e-06,
"loss": 0.0014,
"step": 363
},
{
"epoch": 1.9782608695652173,
"grad_norm": 0.014022955163492444,
"learning_rate": 5.604563732876989e-06,
"loss": 0.0006,
"step": 364
},
{
"epoch": 1.983695652173913,
"grad_norm": 0.02973603833790608,
"learning_rate": 5.55189504630756e-06,
"loss": 0.0009,
"step": 365
},
{
"epoch": 1.9891304347826086,
"grad_norm": 0.07663989298851219,
"learning_rate": 5.4993797387584056e-06,
"loss": 0.0015,
"step": 366
},
{
"epoch": 1.9945652173913042,
"grad_norm": 3.723476839809668,
"learning_rate": 5.447019621056049e-06,
"loss": 0.1512,
"step": 367
},
{
"epoch": 2.0,
"grad_norm": 0.023508663828369594,
"learning_rate": 5.394816498675772e-06,
"loss": 0.0008,
"step": 368
},
{
"epoch": 2.005434782608696,
"grad_norm": 0.014915331253251566,
"learning_rate": 5.342772171679364e-06,
"loss": 0.0006,
"step": 369
},
{
"epoch": 2.010869565217391,
"grad_norm": 0.15045045132635565,
"learning_rate": 5.290888434653056e-06,
"loss": 0.0035,
"step": 370
},
{
"epoch": 2.016304347826087,
"grad_norm": 0.02078710490582649,
"learning_rate": 5.239167076645626e-06,
"loss": 0.0009,
"step": 371
},
{
"epoch": 2.0217391304347827,
"grad_norm": 0.08909809356955653,
"learning_rate": 5.187609881106725e-06,
"loss": 0.0021,
"step": 372
},
{
"epoch": 2.027173913043478,
"grad_norm": 0.019002236928891497,
"learning_rate": 5.136218625825374e-06,
"loss": 0.0006,
"step": 373
},
{
"epoch": 2.032608695652174,
"grad_norm": 0.04208850827532741,
"learning_rate": 5.084995082868658e-06,
"loss": 0.0009,
"step": 374
},
{
"epoch": 2.0380434782608696,
"grad_norm": 0.046840875573742065,
"learning_rate": 5.033941018520625e-06,
"loss": 0.0014,
"step": 375
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.1033934706809575,
"learning_rate": 4.983058193221384e-06,
"loss": 0.0019,
"step": 376
},
{
"epoch": 2.0489130434782608,
"grad_norm": 0.1705166302206335,
"learning_rate": 4.932348361506402e-06,
"loss": 0.0033,
"step": 377
},
{
"epoch": 2.0543478260869565,
"grad_norm": 0.028909235733879053,
"learning_rate": 4.881813271946e-06,
"loss": 0.0012,
"step": 378
},
{
"epoch": 2.0597826086956523,
"grad_norm": 0.3030377695298429,
"learning_rate": 4.831454667085059e-06,
"loss": 0.0039,
"step": 379
},
{
"epoch": 2.0652173913043477,
"grad_norm": 0.0477055277967709,
"learning_rate": 4.781274283382941e-06,
"loss": 0.001,
"step": 380
},
{
"epoch": 2.0706521739130435,
"grad_norm": 0.0199106085983902,
"learning_rate": 4.7312738511536035e-06,
"loss": 0.0008,
"step": 381
},
{
"epoch": 2.0760869565217392,
"grad_norm": 0.027962787198971308,
"learning_rate": 4.681455094505938e-06,
"loss": 0.001,
"step": 382
},
{
"epoch": 2.0815217391304346,
"grad_norm": 0.0382934899009715,
"learning_rate": 4.631819731284323e-06,
"loss": 0.0011,
"step": 383
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.013418670608056855,
"learning_rate": 4.58236947300939e-06,
"loss": 0.0006,
"step": 384
},
{
"epoch": 2.092391304347826,
"grad_norm": 0.04141147762016092,
"learning_rate": 4.5331060248189924e-06,
"loss": 0.0013,
"step": 385
},
{
"epoch": 2.097826086956522,
"grad_norm": 0.029823878767931914,
"learning_rate": 4.4840310854094335e-06,
"loss": 0.001,
"step": 386
},
{
"epoch": 2.1032608695652173,
"grad_norm": 0.2181034359186816,
"learning_rate": 4.435146346976873e-06,
"loss": 0.004,
"step": 387
},
{
"epoch": 2.108695652173913,
"grad_norm": 0.36490526428814946,
"learning_rate": 4.386453495158983e-06,
"loss": 0.0042,
"step": 388
},
{
"epoch": 2.114130434782609,
"grad_norm": 0.0743305865977075,
"learning_rate": 4.33795420897683e-06,
"loss": 0.0011,
"step": 389
},
{
"epoch": 2.119565217391304,
"grad_norm": 0.3000013681179252,
"learning_rate": 4.289650160776967e-06,
"loss": 0.0046,
"step": 390
},
{
"epoch": 2.125,
"grad_norm": 0.05973611485866258,
"learning_rate": 4.241543016173778e-06,
"loss": 0.0011,
"step": 391
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.02140783876818863,
"learning_rate": 4.19363443399204e-06,
"loss": 0.0008,
"step": 392
},
{
"epoch": 2.135869565217391,
"grad_norm": 0.01680791379596923,
"learning_rate": 4.1459260662097235e-06,
"loss": 0.0007,
"step": 393
},
{
"epoch": 2.141304347826087,
"grad_norm": 0.5362708346340234,
"learning_rate": 4.098419557901036e-06,
"loss": 0.0077,
"step": 394
},
{
"epoch": 2.1467391304347827,
"grad_norm": 0.016360773071928784,
"learning_rate": 4.051116547179677e-06,
"loss": 0.0007,
"step": 395
},
{
"epoch": 2.1521739130434785,
"grad_norm": 0.28985199290673336,
"learning_rate": 4.00401866514238e-06,
"loss": 0.0044,
"step": 396
},
{
"epoch": 2.157608695652174,
"grad_norm": 0.01604718518106245,
"learning_rate": 3.957127535812651e-06,
"loss": 0.0007,
"step": 397
},
{
"epoch": 2.1630434782608696,
"grad_norm": 0.05241001721895836,
"learning_rate": 3.910444776084777e-06,
"loss": 0.0016,
"step": 398
},
{
"epoch": 2.1684782608695654,
"grad_norm": 0.02209678496389779,
"learning_rate": 3.8639719956680624e-06,
"loss": 0.0008,
"step": 399
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.020559716878607803,
"learning_rate": 3.817710797031338e-06,
"loss": 0.0008,
"step": 400
},
{
"epoch": 2.1793478260869565,
"grad_norm": 0.014824391810911752,
"learning_rate": 3.771662775347692e-06,
"loss": 0.0006,
"step": 401
},
{
"epoch": 2.1847826086956523,
"grad_norm": 0.015796576868617806,
"learning_rate": 3.7258295184394743e-06,
"loss": 0.0007,
"step": 402
},
{
"epoch": 2.1902173913043477,
"grad_norm": 1.9188157660999832,
"learning_rate": 3.680212606723542e-06,
"loss": 0.0306,
"step": 403
},
{
"epoch": 2.1956521739130435,
"grad_norm": 0.06391438687127189,
"learning_rate": 3.6348136131567537e-06,
"loss": 0.0019,
"step": 404
},
{
"epoch": 2.2010869565217392,
"grad_norm": 0.17262747887734978,
"learning_rate": 3.5896341031817517e-06,
"loss": 0.0036,
"step": 405
},
{
"epoch": 2.2065217391304346,
"grad_norm": 0.056665382264410494,
"learning_rate": 3.5446756346729673e-06,
"loss": 0.0012,
"step": 406
},
{
"epoch": 2.2119565217391304,
"grad_norm": 1.9642610912379441,
"learning_rate": 3.4999397578829076e-06,
"loss": 0.037,
"step": 407
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.014116778100650137,
"learning_rate": 3.4554280153886967e-06,
"loss": 0.0006,
"step": 408
},
{
"epoch": 2.2228260869565215,
"grad_norm": 0.024488008150664965,
"learning_rate": 3.4111419420388904e-06,
"loss": 0.001,
"step": 409
},
{
"epoch": 2.2282608695652173,
"grad_norm": 0.5674032898921303,
"learning_rate": 3.3670830649005437e-06,
"loss": 0.0041,
"step": 410
},
{
"epoch": 2.233695652173913,
"grad_norm": 0.02286422293729417,
"learning_rate": 3.323252903206562e-06,
"loss": 0.0009,
"step": 411
},
{
"epoch": 2.239130434782609,
"grad_norm": 0.27168054236566974,
"learning_rate": 3.279652968303313e-06,
"loss": 0.0043,
"step": 412
},
{
"epoch": 2.244565217391304,
"grad_norm": 0.1593898805811067,
"learning_rate": 3.236284763598512e-06,
"loss": 0.0035,
"step": 413
},
{
"epoch": 2.25,
"grad_norm": 0.013081366094026997,
"learning_rate": 3.1931497845093753e-06,
"loss": 0.0006,
"step": 414
},
{
"epoch": 2.255434782608696,
"grad_norm": 0.012814297915516075,
"learning_rate": 3.150249518411067e-06,
"loss": 0.0006,
"step": 415
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.07415100436276072,
"learning_rate": 3.1075854445854093e-06,
"loss": 0.0018,
"step": 416
},
{
"epoch": 2.266304347826087,
"grad_norm": 0.027114643295979856,
"learning_rate": 3.0651590341698633e-06,
"loss": 0.0009,
"step": 417
},
{
"epoch": 2.2717391304347827,
"grad_norm": 0.13722514020501544,
"learning_rate": 3.0229717501068133e-06,
"loss": 0.0023,
"step": 418
},
{
"epoch": 2.2771739130434785,
"grad_norm": 0.023053695918606187,
"learning_rate": 2.981025047093118e-06,
"loss": 0.0009,
"step": 419
},
{
"epoch": 2.282608695652174,
"grad_norm": 3.7468189613648253,
"learning_rate": 2.9393203715299477e-06,
"loss": 0.0598,
"step": 420
},
{
"epoch": 2.2880434782608696,
"grad_norm": 0.08634045866789929,
"learning_rate": 2.8978591614729114e-06,
"loss": 0.0015,
"step": 421
},
{
"epoch": 2.2934782608695654,
"grad_norm": 0.13994711242571936,
"learning_rate": 2.856642846582469e-06,
"loss": 0.0019,
"step": 422
},
{
"epoch": 2.2989130434782608,
"grad_norm": 0.0519996408733201,
"learning_rate": 2.8156728480746386e-06,
"loss": 0.0011,
"step": 423
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.01904905289611891,
"learning_rate": 2.77495057867198e-06,
"loss": 0.0007,
"step": 424
},
{
"epoch": 2.3097826086956523,
"grad_norm": 1.2476206988634295,
"learning_rate": 2.7344774425548917e-06,
"loss": 0.0339,
"step": 425
},
{
"epoch": 2.3152173913043477,
"grad_norm": 1.7884596495622582,
"learning_rate": 2.694254835313187e-06,
"loss": 0.1375,
"step": 426
},
{
"epoch": 2.3206521739130435,
"grad_norm": 0.31025512064642874,
"learning_rate": 2.654284143897976e-06,
"loss": 0.0034,
"step": 427
},
{
"epoch": 2.3260869565217392,
"grad_norm": 0.3488873501510679,
"learning_rate": 2.6145667465738333e-06,
"loss": 0.0039,
"step": 428
},
{
"epoch": 2.3315217391304346,
"grad_norm": 0.589409734181312,
"learning_rate": 2.57510401287128e-06,
"loss": 0.0044,
"step": 429
},
{
"epoch": 2.3369565217391304,
"grad_norm": 0.3987654975780055,
"learning_rate": 2.535897303539554e-06,
"loss": 0.0061,
"step": 430
},
{
"epoch": 2.342391304347826,
"grad_norm": 0.015719041310887562,
"learning_rate": 2.4969479704996935e-06,
"loss": 0.0006,
"step": 431
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.015180271606601303,
"learning_rate": 2.4582573567979196e-06,
"loss": 0.0006,
"step": 432
},
{
"epoch": 2.3532608695652173,
"grad_norm": 0.04482488635397311,
"learning_rate": 2.4198267965593224e-06,
"loss": 0.0011,
"step": 433
},
{
"epoch": 2.358695652173913,
"grad_norm": 0.28160845350626884,
"learning_rate": 2.381657614941858e-06,
"loss": 0.005,
"step": 434
},
{
"epoch": 2.364130434782609,
"grad_norm": 0.09873212459265543,
"learning_rate": 2.3437511280906576e-06,
"loss": 0.002,
"step": 435
},
{
"epoch": 2.369565217391304,
"grad_norm": 0.028522981368259783,
"learning_rate": 2.306108643092647e-06,
"loss": 0.0008,
"step": 436
},
{
"epoch": 2.375,
"grad_norm": 0.030887088059580514,
"learning_rate": 2.268731457931467e-06,
"loss": 0.001,
"step": 437
},
{
"epoch": 2.380434782608696,
"grad_norm": 0.2056153085824592,
"learning_rate": 2.2316208614427226e-06,
"loss": 0.003,
"step": 438
},
{
"epoch": 2.385869565217391,
"grad_norm": 0.03316498797260578,
"learning_rate": 2.1947781332695406e-06,
"loss": 0.001,
"step": 439
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.020603866879399167,
"learning_rate": 2.1582045438184464e-06,
"loss": 0.0007,
"step": 440
},
{
"epoch": 2.3967391304347827,
"grad_norm": 0.022416446968247912,
"learning_rate": 2.121901354215553e-06,
"loss": 0.0008,
"step": 441
},
{
"epoch": 2.4021739130434785,
"grad_norm": 1.2759832400444016,
"learning_rate": 2.085869816263081e-06,
"loss": 0.0222,
"step": 442
},
{
"epoch": 2.407608695652174,
"grad_norm": 2.7040121657564558,
"learning_rate": 2.050111172396192e-06,
"loss": 0.0472,
"step": 443
},
{
"epoch": 2.4130434782608696,
"grad_norm": 0.10233992459998235,
"learning_rate": 2.0146266556401405e-06,
"loss": 0.0016,
"step": 444
},
{
"epoch": 2.4184782608695654,
"grad_norm": 0.244848209656816,
"learning_rate": 1.97941748956777e-06,
"loss": 0.004,
"step": 445
},
{
"epoch": 2.4239130434782608,
"grad_norm": 0.05688444318906805,
"learning_rate": 1.944484888257312e-06,
"loss": 0.0013,
"step": 446
},
{
"epoch": 2.4293478260869565,
"grad_norm": 0.5574195380686696,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.0112,
"step": 447
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.0932057849593417,
"learning_rate": 1.8754541885111631e-06,
"loss": 0.0018,
"step": 448
},
{
"epoch": 2.4402173913043477,
"grad_norm": 0.10747253772821316,
"learning_rate": 1.8413584703837618e-06,
"loss": 0.0018,
"step": 449
},
{
"epoch": 2.4456521739130435,
"grad_norm": 0.39067007335009907,
"learning_rate": 1.8075440775527754e-06,
"loss": 0.0063,
"step": 450
},
{
"epoch": 2.4510869565217392,
"grad_norm": 0.028328534672816628,
"learning_rate": 1.7740121760020324e-06,
"loss": 0.001,
"step": 451
},
{
"epoch": 2.4565217391304346,
"grad_norm": 0.12079880404676811,
"learning_rate": 1.740763921974531e-06,
"loss": 0.0024,
"step": 452
},
{
"epoch": 2.4619565217391304,
"grad_norm": 0.10850662346060039,
"learning_rate": 1.7078004619325728e-06,
"loss": 0.0017,
"step": 453
},
{
"epoch": 2.467391304347826,
"grad_norm": 0.2673103325118139,
"learning_rate": 1.6751229325182194e-06,
"loss": 0.0067,
"step": 454
},
{
"epoch": 2.4728260869565215,
"grad_norm": 0.20052250560415452,
"learning_rate": 1.6427324605141125e-06,
"loss": 0.0037,
"step": 455
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.08452549445673675,
"learning_rate": 1.610630162804615e-06,
"loss": 0.0015,
"step": 456
},
{
"epoch": 2.483695652173913,
"grad_norm": 0.01638519542637996,
"learning_rate": 1.578817146337297e-06,
"loss": 0.0006,
"step": 457
},
{
"epoch": 2.489130434782609,
"grad_norm": 0.03107206330508472,
"learning_rate": 1.5472945080847679e-06,
"loss": 0.0008,
"step": 458
},
{
"epoch": 2.494565217391304,
"grad_norm": 0.03654411098415488,
"learning_rate": 1.516063335006851e-06,
"loss": 0.0009,
"step": 459
},
{
"epoch": 2.5,
"grad_norm": 0.07287899663917816,
"learning_rate": 1.485124704013101e-06,
"loss": 0.0017,
"step": 460
},
{
"epoch": 2.505434782608696,
"grad_norm": 0.9588849867572242,
"learning_rate": 1.4544796819256724e-06,
"loss": 0.0086,
"step": 461
},
{
"epoch": 2.5108695652173916,
"grad_norm": 0.02467713549047941,
"learning_rate": 1.4241293254425337e-06,
"loss": 0.0007,
"step": 462
},
{
"epoch": 2.516304347826087,
"grad_norm": 0.04748495142661645,
"learning_rate": 1.3940746811010297e-06,
"loss": 0.0011,
"step": 463
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.03054669361577949,
"learning_rate": 1.3643167852417894e-06,
"loss": 0.001,
"step": 464
},
{
"epoch": 2.5271739130434785,
"grad_norm": 0.027111109257002528,
"learning_rate": 1.3348566639730032e-06,
"loss": 0.0011,
"step": 465
},
{
"epoch": 2.532608695652174,
"grad_norm": 0.04377035701857717,
"learning_rate": 1.3056953331350297e-06,
"loss": 0.001,
"step": 466
},
{
"epoch": 2.5380434782608696,
"grad_norm": 0.08382313642398824,
"learning_rate": 1.2768337982653744e-06,
"loss": 0.0014,
"step": 467
},
{
"epoch": 2.5434782608695654,
"grad_norm": 0.030219514519134735,
"learning_rate": 1.2482730545640133e-06,
"loss": 0.0011,
"step": 468
},
{
"epoch": 2.5489130434782608,
"grad_norm": 0.42539314485494417,
"learning_rate": 1.2200140868590759e-06,
"loss": 0.0063,
"step": 469
},
{
"epoch": 2.5543478260869565,
"grad_norm": 0.025687483062924163,
"learning_rate": 1.1920578695728903e-06,
"loss": 0.0009,
"step": 470
},
{
"epoch": 2.5597826086956523,
"grad_norm": 0.027491319722765094,
"learning_rate": 1.1644053666883803e-06,
"loss": 0.0009,
"step": 471
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.12070804850917503,
"learning_rate": 1.137057531715825e-06,
"loss": 0.0023,
"step": 472
},
{
"epoch": 2.5706521739130435,
"grad_norm": 0.1648819505998384,
"learning_rate": 1.1100153076599862e-06,
"loss": 0.0025,
"step": 473
},
{
"epoch": 2.5760869565217392,
"grad_norm": 0.1168751069545925,
"learning_rate": 1.0832796269875757e-06,
"loss": 0.0023,
"step": 474
},
{
"epoch": 2.5815217391304346,
"grad_norm": 0.030968178239974237,
"learning_rate": 1.0568514115951256e-06,
"loss": 0.001,
"step": 475
},
{
"epoch": 2.5869565217391304,
"grad_norm": 1.2108714841296098,
"learning_rate": 1.0307315727771806e-06,
"loss": 0.0126,
"step": 476
},
{
"epoch": 2.592391304347826,
"grad_norm": 0.027899777268609836,
"learning_rate": 1.0049210111948815e-06,
"loss": 0.0009,
"step": 477
},
{
"epoch": 2.5978260869565215,
"grad_norm": 0.03180410299281123,
"learning_rate": 9.794206168449127e-07,
"loss": 0.0009,
"step": 478
},
{
"epoch": 2.6032608695652173,
"grad_norm": 0.033244233145600086,
"learning_rate": 9.542312690288035e-07,
"loss": 0.0009,
"step": 479
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.03761724722059268,
"learning_rate": 9.293538363226196e-07,
"loss": 0.0013,
"step": 480
},
{
"epoch": 2.6141304347826084,
"grad_norm": 0.09136376989366057,
"learning_rate": 9.04789176547004e-07,
"loss": 0.0018,
"step": 481
},
{
"epoch": 2.619565217391304,
"grad_norm": 0.18059210345284965,
"learning_rate": 8.80538136737602e-07,
"loss": 0.0029,
"step": 482
},
{
"epoch": 2.625,
"grad_norm": 0.030807943380701246,
"learning_rate": 8.566015531158534e-07,
"loss": 0.0008,
"step": 483
},
{
"epoch": 2.630434782608696,
"grad_norm": 0.05710411212363332,
"learning_rate": 8.329802510601559e-07,
"loss": 0.0014,
"step": 484
},
{
"epoch": 2.6358695652173916,
"grad_norm": 0.061848371459409315,
"learning_rate": 8.096750450774071e-07,
"loss": 0.0016,
"step": 485
},
{
"epoch": 2.641304347826087,
"grad_norm": 1.0253370343843025,
"learning_rate": 7.866867387749199e-07,
"loss": 0.0166,
"step": 486
},
{
"epoch": 2.6467391304347827,
"grad_norm": 0.029136594892818037,
"learning_rate": 7.640161248327061e-07,
"loss": 0.001,
"step": 487
},
{
"epoch": 2.6521739130434785,
"grad_norm": 1.092489264260611,
"learning_rate": 7.416639849761531e-07,
"loss": 0.0248,
"step": 488
},
{
"epoch": 2.657608695652174,
"grad_norm": 2.2914238948250363,
"learning_rate": 7.196310899490577e-07,
"loss": 0.0723,
"step": 489
},
{
"epoch": 2.6630434782608696,
"grad_norm": 0.016249601644455224,
"learning_rate": 6.979181994870587e-07,
"loss": 0.0007,
"step": 490
},
{
"epoch": 2.6684782608695654,
"grad_norm": 0.021265124563151435,
"learning_rate": 6.765260622914361e-07,
"loss": 0.0007,
"step": 491
},
{
"epoch": 2.6739130434782608,
"grad_norm": 0.03831610583206101,
"learning_rate": 6.554554160032899e-07,
"loss": 0.001,
"step": 492
},
{
"epoch": 2.6793478260869565,
"grad_norm": 0.03101608692853337,
"learning_rate": 6.347069871781164e-07,
"loss": 0.0009,
"step": 493
},
{
"epoch": 2.6847826086956523,
"grad_norm": 0.01978989576112469,
"learning_rate": 6.142814912607409e-07,
"loss": 0.0008,
"step": 494
},
{
"epoch": 2.6902173913043477,
"grad_norm": 0.3852432741704962,
"learning_rate": 5.941796325606574e-07,
"loss": 0.007,
"step": 495
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.39628033120487305,
"learning_rate": 5.744021042277437e-07,
"loss": 0.0052,
"step": 496
},
{
"epoch": 2.7010869565217392,
"grad_norm": 0.09815745867450933,
"learning_rate": 5.549495882283528e-07,
"loss": 0.0019,
"step": 497
},
{
"epoch": 2.7065217391304346,
"grad_norm": 2.2778045886314655,
"learning_rate": 5.358227553218031e-07,
"loss": 0.0699,
"step": 498
},
{
"epoch": 2.7119565217391304,
"grad_norm": 0.027783255312989117,
"learning_rate": 5.17022265037247e-07,
"loss": 0.0009,
"step": 499
},
{
"epoch": 2.717391304347826,
"grad_norm": 0.04524039432637041,
"learning_rate": 4.985487656509313e-07,
"loss": 0.0013,
"step": 500
},
{
"epoch": 2.7228260869565215,
"grad_norm": 1.8660426088847626,
"learning_rate": 4.804028941638405e-07,
"loss": 0.0379,
"step": 501
},
{
"epoch": 2.7282608695652173,
"grad_norm": 0.05194490259797287,
"learning_rate": 4.6258527627973446e-07,
"loss": 0.0011,
"step": 502
},
{
"epoch": 2.733695652173913,
"grad_norm": 0.5524275731086881,
"learning_rate": 4.450965263835694e-07,
"loss": 0.0059,
"step": 503
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.09638176861935786,
"learning_rate": 4.2793724752031807e-07,
"loss": 0.0014,
"step": 504
},
{
"epoch": 2.744565217391304,
"grad_norm": 1.5902794253403654,
"learning_rate": 4.111080313741711e-07,
"loss": 0.0265,
"step": 505
},
{
"epoch": 2.75,
"grad_norm": 0.027472533837749617,
"learning_rate": 3.9460945824813635e-07,
"loss": 0.0007,
"step": 506
},
{
"epoch": 2.755434782608696,
"grad_norm": 0.1279143225656888,
"learning_rate": 3.7844209704403055e-07,
"loss": 0.0029,
"step": 507
},
{
"epoch": 2.7608695652173916,
"grad_norm": 0.026463459883835142,
"learning_rate": 3.626065052428551e-07,
"loss": 0.0008,
"step": 508
},
{
"epoch": 2.766304347826087,
"grad_norm": 0.27505638314757236,
"learning_rate": 3.471032288855869e-07,
"loss": 0.0041,
"step": 509
},
{
"epoch": 2.7717391304347827,
"grad_norm": 0.03755249242727417,
"learning_rate": 3.3193280255433556e-07,
"loss": 0.0011,
"step": 510
},
{
"epoch": 2.7771739130434785,
"grad_norm": 1.3351363822022542,
"learning_rate": 3.170957493539195e-07,
"loss": 0.0158,
"step": 511
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.02416580008302714,
"learning_rate": 3.0259258089382236e-07,
"loss": 0.0009,
"step": 512
},
{
"epoch": 2.7880434782608696,
"grad_norm": 0.24305894735810873,
"learning_rate": 2.88423797270555e-07,
"loss": 0.0033,
"step": 513
},
{
"epoch": 2.7934782608695654,
"grad_norm": 0.0170002796253045,
"learning_rate": 2.745898870504116e-07,
"loss": 0.0006,
"step": 514
},
{
"epoch": 2.7989130434782608,
"grad_norm": 0.07161898082689806,
"learning_rate": 2.6109132725262166e-07,
"loss": 0.0017,
"step": 515
},
{
"epoch": 2.8043478260869565,
"grad_norm": 1.0122242308252756,
"learning_rate": 2.479285833329015e-07,
"loss": 0.0147,
"step": 516
},
{
"epoch": 2.8097826086956523,
"grad_norm": 0.32571610548502183,
"learning_rate": 2.351021091674044e-07,
"loss": 0.0056,
"step": 517
},
{
"epoch": 2.8152173913043477,
"grad_norm": 0.04130977709089724,
"learning_rate": 2.226123470370689e-07,
"loss": 0.0012,
"step": 518
},
{
"epoch": 2.8206521739130435,
"grad_norm": 1.7181531921107351,
"learning_rate": 2.104597276123721e-07,
"loss": 0.0401,
"step": 519
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.02705959014069028,
"learning_rate": 1.9864466993847808e-07,
"loss": 0.0009,
"step": 520
},
{
"epoch": 2.8315217391304346,
"grad_norm": 0.018768961604310398,
"learning_rate": 1.8716758142078295e-07,
"loss": 0.0007,
"step": 521
},
{
"epoch": 2.8369565217391304,
"grad_norm": 0.018345985510552047,
"learning_rate": 1.7602885781087486e-07,
"loss": 0.0008,
"step": 522
},
{
"epoch": 2.842391304347826,
"grad_norm": 0.02605179058078712,
"learning_rate": 1.6522888319288166e-07,
"loss": 0.0009,
"step": 523
},
{
"epoch": 2.8478260869565215,
"grad_norm": 0.03616601605859018,
"learning_rate": 1.5476802997022812e-07,
"loss": 0.001,
"step": 524
},
{
"epoch": 2.8532608695652173,
"grad_norm": 0.025004543897905514,
"learning_rate": 1.4464665885279948e-07,
"loss": 0.0008,
"step": 525
},
{
"epoch": 2.858695652173913,
"grad_norm": 2.0898366886384756,
"learning_rate": 1.3486511884449827e-07,
"loss": 0.0181,
"step": 526
},
{
"epoch": 2.8641304347826084,
"grad_norm": 1.4763355304020689,
"learning_rate": 1.254237472312092e-07,
"loss": 0.0246,
"step": 527
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.012470194296572264,
"learning_rate": 1.1632286956917427e-07,
"loss": 0.0006,
"step": 528
},
{
"epoch": 2.875,
"grad_norm": 0.01829002489612097,
"learning_rate": 1.075627996737627e-07,
"loss": 0.0008,
"step": 529
},
{
"epoch": 2.880434782608696,
"grad_norm": 0.38603420185863435,
"learning_rate": 9.914383960865081e-08,
"loss": 0.0047,
"step": 530
},
{
"epoch": 2.8858695652173916,
"grad_norm": 0.19367430087338477,
"learning_rate": 9.106627967540915e-08,
"loss": 0.0024,
"step": 531
},
{
"epoch": 2.891304347826087,
"grad_norm": 0.4411181370450418,
"learning_rate": 8.333039840348833e-08,
"loss": 0.0042,
"step": 532
},
{
"epoch": 2.8967391304347827,
"grad_norm": 0.012209939294930026,
"learning_rate": 7.593646254061448e-08,
"loss": 0.0006,
"step": 533
},
{
"epoch": 2.9021739130434785,
"grad_norm": 0.330056623962809,
"learning_rate": 6.888472704359661e-08,
"loss": 0.006,
"step": 534
},
{
"epoch": 2.907608695652174,
"grad_norm": 0.034219507512194006,
"learning_rate": 6.217543506952916e-08,
"loss": 0.001,
"step": 535
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.018554429841025816,
"learning_rate": 5.580881796741322e-08,
"loss": 0.0007,
"step": 536
},
{
"epoch": 2.9184782608695654,
"grad_norm": 0.03648100584653491,
"learning_rate": 4.978509527017283e-08,
"loss": 0.0009,
"step": 537
},
{
"epoch": 2.9239130434782608,
"grad_norm": 0.024703372477637507,
"learning_rate": 4.410447468709001e-08,
"loss": 0.001,
"step": 538
},
{
"epoch": 2.9293478260869565,
"grad_norm": 0.04043268074636393,
"learning_rate": 3.8767152096641504e-08,
"loss": 0.001,
"step": 539
},
{
"epoch": 2.9347826086956523,
"grad_norm": 0.046268704953150705,
"learning_rate": 3.377331153974206e-08,
"loss": 0.0015,
"step": 540
},
{
"epoch": 2.9402173913043477,
"grad_norm": 0.06556550271817785,
"learning_rate": 2.912312521340277e-08,
"loss": 0.001,
"step": 541
},
{
"epoch": 2.9456521739130435,
"grad_norm": 0.11585425133268303,
"learning_rate": 2.4816753464789177e-08,
"loss": 0.0018,
"step": 542
},
{
"epoch": 2.9510869565217392,
"grad_norm": 0.07767672609852741,
"learning_rate": 2.0854344785694593e-08,
"loss": 0.0016,
"step": 543
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.4371414896745222,
"learning_rate": 1.7236035807416397e-08,
"loss": 0.0058,
"step": 544
},
{
"epoch": 2.9619565217391304,
"grad_norm": 0.05437143993551097,
"learning_rate": 1.3961951296053156e-08,
"loss": 0.0012,
"step": 545
},
{
"epoch": 2.967391304347826,
"grad_norm": 0.08967830928285274,
"learning_rate": 1.1032204148191395e-08,
"loss": 0.0015,
"step": 546
},
{
"epoch": 2.9728260869565215,
"grad_norm": 0.0632058201324481,
"learning_rate": 8.446895387019815e-09,
"loss": 0.0013,
"step": 547
},
{
"epoch": 2.9782608695652173,
"grad_norm": 0.5781101930402831,
"learning_rate": 6.206114158845422e-09,
"loss": 0.0104,
"step": 548
},
{
"epoch": 2.983695652173913,
"grad_norm": 0.03599473788037359,
"learning_rate": 4.309937730015978e-09,
"loss": 0.0009,
"step": 549
},
{
"epoch": 2.9891304347826084,
"grad_norm": 0.029545010274494552,
"learning_rate": 2.758431484259916e-09,
"loss": 0.001,
"step": 550
},
{
"epoch": 2.994565217391304,
"grad_norm": 0.04112363341260623,
"learning_rate": 1.5516489204303598e-09,
"loss": 0.001,
"step": 551
},
{
"epoch": 3.0,
"grad_norm": 0.28259585170188845,
"learning_rate": 6.896316506554979e-10,
"loss": 0.0056,
"step": 552
},
{
"epoch": 3.0,
"step": 552,
"total_flos": 4395674998272.0,
"train_loss": 0.49586228307948593,
"train_runtime": 3133.0691,
"train_samples_per_second": 2.813,
"train_steps_per_second": 0.176
}
],
"logging_steps": 1.0,
"max_steps": 552,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50000,
"total_flos": 4395674998272.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}