{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005434782608695652, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 10.5124, "step": 1 }, { "epoch": 0.010869565217391304, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 10.5941, "step": 2 }, { "epoch": 0.016304347826086956, "grad_norm": 8.096252368220718, "learning_rate": 1.1764705882352942e-06, "loss": 10.475, "step": 3 }, { "epoch": 0.021739130434782608, "grad_norm": 8.39383943803796, "learning_rate": 2.3529411764705885e-06, "loss": 10.4029, "step": 4 }, { "epoch": 0.02717391304347826, "grad_norm": 8.479649232958007, "learning_rate": 3.529411764705883e-06, "loss": 10.606, "step": 5 }, { "epoch": 0.03260869565217391, "grad_norm": 8.388175109430223, "learning_rate": 4.705882352941177e-06, "loss": 10.4024, "step": 6 }, { "epoch": 0.03804347826086957, "grad_norm": 8.445899787393927, "learning_rate": 5.882352941176471e-06, "loss": 10.4772, "step": 7 }, { "epoch": 0.043478260869565216, "grad_norm": 8.405772228388786, "learning_rate": 7.058823529411766e-06, "loss": 10.4004, "step": 8 }, { "epoch": 0.04891304347826087, "grad_norm": 8.44764590867685, "learning_rate": 8.23529411764706e-06, "loss": 10.1775, "step": 9 }, { "epoch": 0.05434782608695652, "grad_norm": 8.23897507323131, "learning_rate": 9.411764705882354e-06, "loss": 10.2434, "step": 10 }, { "epoch": 0.059782608695652176, "grad_norm": 8.118852150518913, "learning_rate": 1.0588235294117648e-05, "loss": 9.7644, "step": 11 }, { "epoch": 0.06521739130434782, "grad_norm": 8.570315139494753, "learning_rate": 1.1764705882352942e-05, "loss": 9.7751, "step": 12 }, { "epoch": 0.07065217391304347, "grad_norm": 8.622402474140065, "learning_rate": 1.2941176470588238e-05, "loss": 9.2685, "step": 13 }, { "epoch": 0.07608695652173914, "grad_norm": 8.736670863686008, "learning_rate": 1.4117647058823532e-05, "loss": 8.897, "step": 14 }, { "epoch": 0.08152173913043478, "grad_norm": 9.172468108894085, "learning_rate": 1.5294117647058822e-05, "loss": 8.7101, "step": 15 }, { "epoch": 0.08695652173913043, "grad_norm": 10.228378996373296, "learning_rate": 1.647058823529412e-05, "loss": 8.3074, "step": 16 }, { "epoch": 0.09239130434782608, "grad_norm": 10.657372840257251, "learning_rate": 1.7647058823529414e-05, "loss": 7.8589, "step": 17 }, { "epoch": 0.09782608695652174, "grad_norm": 10.887433964524527, "learning_rate": 1.8823529411764708e-05, "loss": 7.4742, "step": 18 }, { "epoch": 0.10326086956521739, "grad_norm": 11.682285639818433, "learning_rate": 2e-05, "loss": 6.8416, "step": 19 }, { "epoch": 0.10869565217391304, "grad_norm": 11.901377724871265, "learning_rate": 1.999982759060109e-05, "loss": 6.2183, "step": 20 }, { "epoch": 0.11413043478260869, "grad_norm": 11.383373292219964, "learning_rate": 1.9999310368349344e-05, "loss": 5.4371, "step": 21 }, { "epoch": 0.11956521739130435, "grad_norm": 9.311596334088138, "learning_rate": 1.999844835107957e-05, "loss": 4.7164, "step": 22 }, { "epoch": 0.125, "grad_norm": 8.688635937406437, "learning_rate": 1.9997241568515742e-05, "loss": 4.456, "step": 23 }, { "epoch": 0.13043478260869565, "grad_norm": 7.4122077747748305, "learning_rate": 1.9995690062269985e-05, "loss": 3.8875, "step": 24 }, { "epoch": 0.1358695652173913, "grad_norm": 6.888182537563505, "learning_rate": 1.9993793885841157e-05, "loss": 3.5685, "step": 25 }, { "epoch": 0.14130434782608695, "grad_norm": 6.988607551936095, "learning_rate": 1.9991553104612982e-05, "loss": 3.4123, "step": 26 }, { "epoch": 0.14673913043478262, "grad_norm": 7.211548625105269, "learning_rate": 1.998896779585181e-05, "loss": 3.0838, "step": 27 }, { "epoch": 0.15217391304347827, "grad_norm": 7.767483170773942, "learning_rate": 1.998603804870395e-05, "loss": 2.831, "step": 28 }, { "epoch": 0.15760869565217392, "grad_norm": 7.950559222260086, "learning_rate": 1.9982763964192586e-05, "loss": 2.6297, "step": 29 }, { "epoch": 0.16304347826086957, "grad_norm": 8.23795631455961, "learning_rate": 1.9979145655214306e-05, "loss": 2.2795, "step": 30 }, { "epoch": 0.16847826086956522, "grad_norm": 8.57956169127235, "learning_rate": 1.9975183246535212e-05, "loss": 2.0509, "step": 31 }, { "epoch": 0.17391304347826086, "grad_norm": 8.071070816084118, "learning_rate": 1.99708768747866e-05, "loss": 1.8279, "step": 32 }, { "epoch": 0.1793478260869565, "grad_norm": 7.042152882720071, "learning_rate": 1.9966226688460258e-05, "loss": 1.3567, "step": 33 }, { "epoch": 0.18478260869565216, "grad_norm": 4.814338676579685, "learning_rate": 1.996123284790336e-05, "loss": 0.9542, "step": 34 }, { "epoch": 0.19021739130434784, "grad_norm": 2.9434658655739474, "learning_rate": 1.9955895525312913e-05, "loss": 0.8261, "step": 35 }, { "epoch": 0.1956521739130435, "grad_norm": 2.452806110360505, "learning_rate": 1.995021490472983e-05, "loss": 0.851, "step": 36 }, { "epoch": 0.20108695652173914, "grad_norm": 1.6789979391543146, "learning_rate": 1.9944191182032588e-05, "loss": 0.8265, "step": 37 }, { "epoch": 0.20652173913043478, "grad_norm": 2.0007370440742154, "learning_rate": 1.9937824564930474e-05, "loss": 0.8181, "step": 38 }, { "epoch": 0.21195652173913043, "grad_norm": 2.493212508529885, "learning_rate": 1.9931115272956405e-05, "loss": 0.767, "step": 39 }, { "epoch": 0.21739130434782608, "grad_norm": 1.9209687838841931, "learning_rate": 1.992406353745939e-05, "loss": 0.7196, "step": 40 }, { "epoch": 0.22282608695652173, "grad_norm": 1.8290330319103352, "learning_rate": 1.9916669601596515e-05, "loss": 0.7299, "step": 41 }, { "epoch": 0.22826086956521738, "grad_norm": 1.7900648029089992, "learning_rate": 1.990893372032459e-05, "loss": 0.7229, "step": 42 }, { "epoch": 0.23369565217391305, "grad_norm": 1.6749799534602232, "learning_rate": 1.990085616039135e-05, "loss": 0.7238, "step": 43 }, { "epoch": 0.2391304347826087, "grad_norm": 1.986613572625418, "learning_rate": 1.989243720032624e-05, "loss": 0.7332, "step": 44 }, { "epoch": 0.24456521739130435, "grad_norm": 1.8912806129771145, "learning_rate": 1.9883677130430827e-05, "loss": 0.5864, "step": 45 }, { "epoch": 0.25, "grad_norm": 1.7750105086017574, "learning_rate": 1.9874576252768793e-05, "loss": 0.6124, "step": 46 }, { "epoch": 0.2554347826086957, "grad_norm": 1.2955635391212061, "learning_rate": 1.9865134881155504e-05, "loss": 0.6884, "step": 47 }, { "epoch": 0.2608695652173913, "grad_norm": 1.273010141736733, "learning_rate": 1.98553533411472e-05, "loss": 0.6484, "step": 48 }, { "epoch": 0.266304347826087, "grad_norm": 2.163538460282388, "learning_rate": 1.9845231970029774e-05, "loss": 0.7095, "step": 49 }, { "epoch": 0.2717391304347826, "grad_norm": 1.8775881503442995, "learning_rate": 1.983477111680712e-05, "loss": 0.604, "step": 50 }, { "epoch": 0.27717391304347827, "grad_norm": 1.5484748822902972, "learning_rate": 1.9823971142189126e-05, "loss": 0.6862, "step": 51 }, { "epoch": 0.2826086956521739, "grad_norm": 1.0946391927116763, "learning_rate": 1.981283241857922e-05, "loss": 0.6276, "step": 52 }, { "epoch": 0.28804347826086957, "grad_norm": 1.4879971843628843, "learning_rate": 1.9801355330061526e-05, "loss": 0.5763, "step": 53 }, { "epoch": 0.29347826086956524, "grad_norm": 1.8993705185884953, "learning_rate": 1.978954027238763e-05, "loss": 0.5908, "step": 54 }, { "epoch": 0.29891304347826086, "grad_norm": 1.6076663483914293, "learning_rate": 1.9777387652962933e-05, "loss": 0.5543, "step": 55 }, { "epoch": 0.30434782608695654, "grad_norm": 1.1740894440396383, "learning_rate": 1.9764897890832597e-05, "loss": 0.5458, "step": 56 }, { "epoch": 0.30978260869565216, "grad_norm": 1.9838553435397361, "learning_rate": 1.9752071416667102e-05, "loss": 0.5046, "step": 57 }, { "epoch": 0.31521739130434784, "grad_norm": 1.0812842728047714, "learning_rate": 1.973890867274738e-05, "loss": 0.5609, "step": 58 }, { "epoch": 0.32065217391304346, "grad_norm": 1.723223092822651, "learning_rate": 1.972541011294959e-05, "loss": 0.4724, "step": 59 }, { "epoch": 0.32608695652173914, "grad_norm": 1.4887350192643218, "learning_rate": 1.9711576202729445e-05, "loss": 0.5168, "step": 60 }, { "epoch": 0.33152173913043476, "grad_norm": 1.533986608527031, "learning_rate": 1.9697407419106178e-05, "loss": 0.5374, "step": 61 }, { "epoch": 0.33695652173913043, "grad_norm": 1.283663400004928, "learning_rate": 1.9682904250646084e-05, "loss": 0.622, "step": 62 }, { "epoch": 0.3423913043478261, "grad_norm": 1.511070122779534, "learning_rate": 1.9668067197445662e-05, "loss": 0.572, "step": 63 }, { "epoch": 0.34782608695652173, "grad_norm": 1.843030359662425, "learning_rate": 1.9652896771114416e-05, "loss": 0.5449, "step": 64 }, { "epoch": 0.3532608695652174, "grad_norm": 2.2753033401712752, "learning_rate": 1.9637393494757146e-05, "loss": 0.6883, "step": 65 }, { "epoch": 0.358695652173913, "grad_norm": 1.1407510209951979, "learning_rate": 1.962155790295597e-05, "loss": 0.4357, "step": 66 }, { "epoch": 0.3641304347826087, "grad_norm": 1.351954153650573, "learning_rate": 1.9605390541751864e-05, "loss": 0.5109, "step": 67 }, { "epoch": 0.3695652173913043, "grad_norm": 1.2344312626302043, "learning_rate": 1.9588891968625828e-05, "loss": 0.5133, "step": 68 }, { "epoch": 0.375, "grad_norm": 3.528171261663953, "learning_rate": 1.9572062752479684e-05, "loss": 0.7135, "step": 69 }, { "epoch": 0.3804347826086957, "grad_norm": 1.0283054372439564, "learning_rate": 1.9554903473616432e-05, "loss": 0.4934, "step": 70 }, { "epoch": 0.3858695652173913, "grad_norm": 1.2480924815092371, "learning_rate": 1.953741472372027e-05, "loss": 0.3846, "step": 71 }, { "epoch": 0.391304347826087, "grad_norm": 1.4701584460006578, "learning_rate": 1.951959710583616e-05, "loss": 0.5303, "step": 72 }, { "epoch": 0.3967391304347826, "grad_norm": 2.2396908880712774, "learning_rate": 1.950145123434907e-05, "loss": 0.4241, "step": 73 }, { "epoch": 0.40217391304347827, "grad_norm": 1.7904621917947958, "learning_rate": 1.9482977734962753e-05, "loss": 0.6144, "step": 74 }, { "epoch": 0.4076086956521739, "grad_norm": 1.650705831140192, "learning_rate": 1.94641772446782e-05, "loss": 0.592, "step": 75 }, { "epoch": 0.41304347826086957, "grad_norm": 1.588255971243881, "learning_rate": 1.9445050411771648e-05, "loss": 0.5918, "step": 76 }, { "epoch": 0.41847826086956524, "grad_norm": 1.4379861368277966, "learning_rate": 1.9425597895772257e-05, "loss": 0.604, "step": 77 }, { "epoch": 0.42391304347826086, "grad_norm": 1.7783069990731366, "learning_rate": 1.9405820367439343e-05, "loss": 0.6351, "step": 78 }, { "epoch": 0.42934782608695654, "grad_norm": 1.3451929958729711, "learning_rate": 1.9385718508739263e-05, "loss": 0.4487, "step": 79 }, { "epoch": 0.43478260869565216, "grad_norm": 1.5631174238633363, "learning_rate": 1.9365293012821887e-05, "loss": 0.5412, "step": 80 }, { "epoch": 0.44021739130434784, "grad_norm": 1.7641796531654723, "learning_rate": 1.934454458399671e-05, "loss": 0.4606, "step": 81 }, { "epoch": 0.44565217391304346, "grad_norm": 2.007206796904478, "learning_rate": 1.9323473937708565e-05, "loss": 0.5409, "step": 82 }, { "epoch": 0.45108695652173914, "grad_norm": 1.6060302211544533, "learning_rate": 1.9302081800512943e-05, "loss": 0.5194, "step": 83 }, { "epoch": 0.45652173913043476, "grad_norm": 1.584139057778314, "learning_rate": 1.9280368910050943e-05, "loss": 0.4662, "step": 84 }, { "epoch": 0.46195652173913043, "grad_norm": 1.8953323400594193, "learning_rate": 1.9258336015023847e-05, "loss": 0.4433, "step": 85 }, { "epoch": 0.4673913043478261, "grad_norm": 1.6067605181621798, "learning_rate": 1.9235983875167296e-05, "loss": 0.4255, "step": 86 }, { "epoch": 0.47282608695652173, "grad_norm": 1.4529302278758023, "learning_rate": 1.9213313261225083e-05, "loss": 0.4364, "step": 87 }, { "epoch": 0.4782608695652174, "grad_norm": 1.9965642456327142, "learning_rate": 1.9190324954922594e-05, "loss": 0.4199, "step": 88 }, { "epoch": 0.483695652173913, "grad_norm": 1.9458245431232768, "learning_rate": 1.9167019748939847e-05, "loss": 0.4024, "step": 89 }, { "epoch": 0.4891304347826087, "grad_norm": 2.000159805579825, "learning_rate": 1.914339844688415e-05, "loss": 0.4595, "step": 90 }, { "epoch": 0.4945652173913043, "grad_norm": 1.97378975953703, "learning_rate": 1.91194618632624e-05, "loss": 0.4917, "step": 91 }, { "epoch": 0.5, "grad_norm": 1.3771983904411074, "learning_rate": 1.9095210823452997e-05, "loss": 0.3341, "step": 92 }, { "epoch": 0.5054347826086957, "grad_norm": 1.8123410249166505, "learning_rate": 1.9070646163677383e-05, "loss": 0.4285, "step": 93 }, { "epoch": 0.5108695652173914, "grad_norm": 1.7561172390607174, "learning_rate": 1.9045768730971198e-05, "loss": 0.3863, "step": 94 }, { "epoch": 0.5163043478260869, "grad_norm": 1.809060828661053, "learning_rate": 1.9020579383155087e-05, "loss": 0.3486, "step": 95 }, { "epoch": 0.5217391304347826, "grad_norm": 1.541206279317173, "learning_rate": 1.899507898880512e-05, "loss": 0.1713, "step": 96 }, { "epoch": 0.5271739130434783, "grad_norm": 2.0502484531232343, "learning_rate": 1.8969268427222823e-05, "loss": 0.2059, "step": 97 }, { "epoch": 0.532608695652174, "grad_norm": 1.8524406597388374, "learning_rate": 1.8943148588404877e-05, "loss": 0.3856, "step": 98 }, { "epoch": 0.5380434782608695, "grad_norm": 3.385889154621842, "learning_rate": 1.8916720373012425e-05, "loss": 0.3027, "step": 99 }, { "epoch": 0.5434782608695652, "grad_norm": 1.2814547066301334, "learning_rate": 1.8889984692340015e-05, "loss": 0.1609, "step": 100 }, { "epoch": 0.5489130434782609, "grad_norm": 1.473493575445019, "learning_rate": 1.8862942468284174e-05, "loss": 0.1658, "step": 101 }, { "epoch": 0.5543478260869565, "grad_norm": 2.2017906861514125, "learning_rate": 1.883559463331162e-05, "loss": 0.2269, "step": 102 }, { "epoch": 0.5597826086956522, "grad_norm": 2.9266092953974345, "learning_rate": 1.880794213042711e-05, "loss": 0.2638, "step": 103 }, { "epoch": 0.5652173913043478, "grad_norm": 1.2470192969755443, "learning_rate": 1.8779985913140927e-05, "loss": 0.1826, "step": 104 }, { "epoch": 0.5706521739130435, "grad_norm": 1.1329281006012806, "learning_rate": 1.875172694543599e-05, "loss": 0.0992, "step": 105 }, { "epoch": 0.5760869565217391, "grad_norm": 1.435458967360399, "learning_rate": 1.8723166201734626e-05, "loss": 0.1052, "step": 106 }, { "epoch": 0.5815217391304348, "grad_norm": 2.4406380430615244, "learning_rate": 1.869430466686497e-05, "loss": 0.1999, "step": 107 }, { "epoch": 0.5869565217391305, "grad_norm": 1.0271614062096617, "learning_rate": 1.8665143336027e-05, "loss": 0.0855, "step": 108 }, { "epoch": 0.592391304347826, "grad_norm": 1.3651592297249626, "learning_rate": 1.8635683214758213e-05, "loss": 0.0977, "step": 109 }, { "epoch": 0.5978260869565217, "grad_norm": 0.5945892482638718, "learning_rate": 1.8605925318898973e-05, "loss": 0.0337, "step": 110 }, { "epoch": 0.6032608695652174, "grad_norm": 1.194835217639101, "learning_rate": 1.8575870674557467e-05, "loss": 0.0722, "step": 111 }, { "epoch": 0.6086956521739131, "grad_norm": 1.762735939201958, "learning_rate": 1.8545520318074328e-05, "loss": 0.1228, "step": 112 }, { "epoch": 0.6141304347826086, "grad_norm": 1.017829163872169, "learning_rate": 1.85148752959869e-05, "loss": 0.0344, "step": 113 }, { "epoch": 0.6195652173913043, "grad_norm": 1.052690658912748, "learning_rate": 1.8483936664993152e-05, "loss": 0.0377, "step": 114 }, { "epoch": 0.625, "grad_norm": 1.7977784022224987, "learning_rate": 1.8452705491915232e-05, "loss": 0.141, "step": 115 }, { "epoch": 0.6304347826086957, "grad_norm": 1.8477093237099182, "learning_rate": 1.8421182853662704e-05, "loss": 0.0734, "step": 116 }, { "epoch": 0.6358695652173914, "grad_norm": 0.6794730347498438, "learning_rate": 1.8389369837195387e-05, "loss": 0.0266, "step": 117 }, { "epoch": 0.6413043478260869, "grad_norm": 0.8818635589659883, "learning_rate": 1.835726753948589e-05, "loss": 0.0487, "step": 118 }, { "epoch": 0.6467391304347826, "grad_norm": 1.0608887498751458, "learning_rate": 1.8324877067481782e-05, "loss": 0.0275, "step": 119 }, { "epoch": 0.6521739130434783, "grad_norm": 1.3129587931586821, "learning_rate": 1.829219953806743e-05, "loss": 0.0642, "step": 120 }, { "epoch": 0.657608695652174, "grad_norm": 1.8948301224723039, "learning_rate": 1.825923607802547e-05, "loss": 0.0785, "step": 121 }, { "epoch": 0.6630434782608695, "grad_norm": 0.2518374968408712, "learning_rate": 1.8225987823997967e-05, "loss": 0.0111, "step": 122 }, { "epoch": 0.6684782608695652, "grad_norm": 0.25552971144651465, "learning_rate": 1.8192455922447227e-05, "loss": 0.0103, "step": 123 }, { "epoch": 0.6739130434782609, "grad_norm": 0.7841302667217214, "learning_rate": 1.815864152961624e-05, "loss": 0.0122, "step": 124 }, { "epoch": 0.6793478260869565, "grad_norm": 0.1515291563958561, "learning_rate": 1.812454581148884e-05, "loss": 0.0079, "step": 125 }, { "epoch": 0.6847826086956522, "grad_norm": 0.11584834326779594, "learning_rate": 1.8090169943749477e-05, "loss": 0.0055, "step": 126 }, { "epoch": 0.6902173913043478, "grad_norm": 0.1740566784478502, "learning_rate": 1.8055515111742688e-05, "loss": 0.0069, "step": 127 }, { "epoch": 0.6956521739130435, "grad_norm": 1.5625062014274096, "learning_rate": 1.8020582510432234e-05, "loss": 0.0383, "step": 128 }, { "epoch": 0.7010869565217391, "grad_norm": 0.12273159750563628, "learning_rate": 1.798537334435986e-05, "loss": 0.0062, "step": 129 }, { "epoch": 0.7065217391304348, "grad_norm": 3.693193027378141, "learning_rate": 1.7949888827603813e-05, "loss": 0.1765, "step": 130 }, { "epoch": 0.7119565217391305, "grad_norm": 0.12477337459792677, "learning_rate": 1.791413018373692e-05, "loss": 0.0057, "step": 131 }, { "epoch": 0.717391304347826, "grad_norm": 0.8357268279739778, "learning_rate": 1.7878098645784447e-05, "loss": 0.0163, "step": 132 }, { "epoch": 0.7228260869565217, "grad_norm": 3.8264656288549985, "learning_rate": 1.7841795456181556e-05, "loss": 0.1727, "step": 133 }, { "epoch": 0.7282608695652174, "grad_norm": 0.6387227523871831, "learning_rate": 1.780522186673046e-05, "loss": 0.0076, "step": 134 }, { "epoch": 0.7336956521739131, "grad_norm": 0.09079528876022976, "learning_rate": 1.776837913855728e-05, "loss": 0.0038, "step": 135 }, { "epoch": 0.7391304347826086, "grad_norm": 1.9001901725953279, "learning_rate": 1.7731268542068536e-05, "loss": 0.0208, "step": 136 }, { "epoch": 0.7445652173913043, "grad_norm": 0.21704170005212517, "learning_rate": 1.7693891356907357e-05, "loss": 0.007, "step": 137 }, { "epoch": 0.75, "grad_norm": 0.7213653784073487, "learning_rate": 1.7656248871909346e-05, "loss": 0.0137, "step": 138 }, { "epoch": 0.7554347826086957, "grad_norm": 0.40110602562720454, "learning_rate": 1.7618342385058147e-05, "loss": 0.0099, "step": 139 }, { "epoch": 0.7608695652173914, "grad_norm": 2.026407827233553, "learning_rate": 1.758017320344068e-05, "loss": 0.0415, "step": 140 }, { "epoch": 0.7663043478260869, "grad_norm": 1.1169723105563958, "learning_rate": 1.754174264320208e-05, "loss": 0.0232, "step": 141 }, { "epoch": 0.7717391304347826, "grad_norm": 0.1746366846193237, "learning_rate": 1.7503052029500308e-05, "loss": 0.0052, "step": 142 }, { "epoch": 0.7771739130434783, "grad_norm": 2.3203125623649874, "learning_rate": 1.7464102696460447e-05, "loss": 0.2205, "step": 143 }, { "epoch": 0.782608695652174, "grad_norm": 3.9663829407278315, "learning_rate": 1.7424895987128723e-05, "loss": 0.223, "step": 144 }, { "epoch": 0.7880434782608695, "grad_norm": 2.9570619026185883, "learning_rate": 1.738543325342617e-05, "loss": 0.0697, "step": 145 }, { "epoch": 0.7934782608695652, "grad_norm": 0.07057319843123724, "learning_rate": 1.7345715856102024e-05, "loss": 0.0031, "step": 146 }, { "epoch": 0.7989130434782609, "grad_norm": 0.11320521777018241, "learning_rate": 1.7305745164686816e-05, "loss": 0.0042, "step": 147 }, { "epoch": 0.8043478260869565, "grad_norm": 1.3124572295306176, "learning_rate": 1.7265522557445115e-05, "loss": 0.021, "step": 148 }, { "epoch": 0.8097826086956522, "grad_norm": 0.42701665371399616, "learning_rate": 1.7225049421328024e-05, "loss": 0.0091, "step": 149 }, { "epoch": 0.8152173913043478, "grad_norm": 0.6276112813031721, "learning_rate": 1.7184327151925366e-05, "loss": 0.0094, "step": 150 }, { "epoch": 0.8206521739130435, "grad_norm": 1.5664524264393311, "learning_rate": 1.7143357153417533e-05, "loss": 0.0256, "step": 151 }, { "epoch": 0.8260869565217391, "grad_norm": 0.41431375770399115, "learning_rate": 1.710214083852709e-05, "loss": 0.0117, "step": 152 }, { "epoch": 0.8315217391304348, "grad_norm": 0.3493269925986, "learning_rate": 1.7060679628470054e-05, "loss": 0.0084, "step": 153 }, { "epoch": 0.8369565217391305, "grad_norm": 0.3211404898250956, "learning_rate": 1.7018974952906885e-05, "loss": 0.0084, "step": 154 }, { "epoch": 0.842391304347826, "grad_norm": 0.21231254558257762, "learning_rate": 1.697702824989319e-05, "loss": 0.0065, "step": 155 }, { "epoch": 0.8478260869565217, "grad_norm": 1.457137599474762, "learning_rate": 1.693484096583014e-05, "loss": 0.0226, "step": 156 }, { "epoch": 0.8532608695652174, "grad_norm": 0.19497147073015395, "learning_rate": 1.6892414555414594e-05, "loss": 0.0048, "step": 157 }, { "epoch": 0.8586956521739131, "grad_norm": 1.8062131040571878, "learning_rate": 1.6849750481588936e-05, "loss": 0.0277, "step": 158 }, { "epoch": 0.8641304347826086, "grad_norm": 1.3188356922598312, "learning_rate": 1.680685021549063e-05, "loss": 0.0207, "step": 159 }, { "epoch": 0.8695652173913043, "grad_norm": 0.26492812790936593, "learning_rate": 1.6763715236401493e-05, "loss": 0.0059, "step": 160 }, { "epoch": 0.875, "grad_norm": 0.3017199408994534, "learning_rate": 1.672034703169669e-05, "loss": 0.0076, "step": 161 }, { "epoch": 0.8804347826086957, "grad_norm": 0.1252817764595737, "learning_rate": 1.667674709679344e-05, "loss": 0.0041, "step": 162 }, { "epoch": 0.8858695652173914, "grad_norm": 1.1529370223873083, "learning_rate": 1.663291693509946e-05, "loss": 0.019, "step": 163 }, { "epoch": 0.8913043478260869, "grad_norm": 0.12063163996672908, "learning_rate": 1.658885805796111e-05, "loss": 0.0031, "step": 164 }, { "epoch": 0.8967391304347826, "grad_norm": 0.11125376158368971, "learning_rate": 1.6544571984611306e-05, "loss": 0.0034, "step": 165 }, { "epoch": 0.9021739130434783, "grad_norm": 0.19945453640512878, "learning_rate": 1.6500060242117096e-05, "loss": 0.0051, "step": 166 }, { "epoch": 0.907608695652174, "grad_norm": 0.07254620014242376, "learning_rate": 1.6455324365327035e-05, "loss": 0.0026, "step": 167 }, { "epoch": 0.9130434782608695, "grad_norm": 1.3895686723936829, "learning_rate": 1.6410365896818253e-05, "loss": 0.0234, "step": 168 }, { "epoch": 0.9184782608695652, "grad_norm": 0.7517916115731629, "learning_rate": 1.636518638684325e-05, "loss": 0.0057, "step": 169 }, { "epoch": 0.9239130434782609, "grad_norm": 0.11708397875230993, "learning_rate": 1.6319787393276463e-05, "loss": 0.0036, "step": 170 }, { "epoch": 0.9293478260869565, "grad_norm": 0.027987175186703777, "learning_rate": 1.6274170481560527e-05, "loss": 0.0015, "step": 171 }, { "epoch": 0.9347826086956522, "grad_norm": 0.17986790848065237, "learning_rate": 1.6228337224652307e-05, "loss": 0.0059, "step": 172 }, { "epoch": 0.9402173913043478, "grad_norm": 0.03867873116439446, "learning_rate": 1.6182289202968663e-05, "loss": 0.0017, "step": 173 }, { "epoch": 0.9456521739130435, "grad_norm": 0.057278523890185604, "learning_rate": 1.613602800433194e-05, "loss": 0.0024, "step": 174 }, { "epoch": 0.9510869565217391, "grad_norm": 2.728399164781685, "learning_rate": 1.6089555223915226e-05, "loss": 0.1588, "step": 175 }, { "epoch": 0.9565217391304348, "grad_norm": 0.3768997196852311, "learning_rate": 1.6042872464187352e-05, "loss": 0.0054, "step": 176 }, { "epoch": 0.9619565217391305, "grad_norm": 4.011589996542784, "learning_rate": 1.5995981334857625e-05, "loss": 0.0702, "step": 177 }, { "epoch": 0.967391304347826, "grad_norm": 0.49004409324214177, "learning_rate": 1.5948883452820326e-05, "loss": 0.01, "step": 178 }, { "epoch": 0.9728260869565217, "grad_norm": 0.048813631073329034, "learning_rate": 1.590158044209897e-05, "loss": 0.002, "step": 179 }, { "epoch": 0.9782608695652174, "grad_norm": 0.09547901003362863, "learning_rate": 1.5854073933790277e-05, "loss": 0.0024, "step": 180 }, { "epoch": 0.9836956521739131, "grad_norm": 2.3086350812363565, "learning_rate": 1.580636556600796e-05, "loss": 0.0277, "step": 181 }, { "epoch": 0.9891304347826086, "grad_norm": 2.752485470216331, "learning_rate": 1.575845698382622e-05, "loss": 0.0671, "step": 182 }, { "epoch": 0.9945652173913043, "grad_norm": 0.08760080184190135, "learning_rate": 1.5710349839223034e-05, "loss": 0.0025, "step": 183 }, { "epoch": 1.0, "grad_norm": 0.052319179757302624, "learning_rate": 1.566204579102317e-05, "loss": 0.0016, "step": 184 }, { "epoch": 1.0054347826086956, "grad_norm": 0.20188982483949725, "learning_rate": 1.561354650484102e-05, "loss": 0.0054, "step": 185 }, { "epoch": 1.0108695652173914, "grad_norm": 1.214861582615001, "learning_rate": 1.556485365302313e-05, "loss": 0.0095, "step": 186 }, { "epoch": 1.016304347826087, "grad_norm": 1.1857810014141275, "learning_rate": 1.5515968914590568e-05, "loss": 0.0161, "step": 187 }, { "epoch": 1.0217391304347827, "grad_norm": 0.19290187635263223, "learning_rate": 1.546689397518101e-05, "loss": 0.004, "step": 188 }, { "epoch": 1.0271739130434783, "grad_norm": 0.22326269659684472, "learning_rate": 1.5417630526990613e-05, "loss": 0.0044, "step": 189 }, { "epoch": 1.0326086956521738, "grad_norm": 0.0690691126927046, "learning_rate": 1.5368180268715678e-05, "loss": 0.0022, "step": 190 }, { "epoch": 1.0380434782608696, "grad_norm": 0.519784946142706, "learning_rate": 1.5318544905494063e-05, "loss": 0.0075, "step": 191 }, { "epoch": 1.0434782608695652, "grad_norm": 0.1210215491547705, "learning_rate": 1.52687261488464e-05, "loss": 0.0032, "step": 192 }, { "epoch": 1.048913043478261, "grad_norm": 0.1128182153705411, "learning_rate": 1.5218725716617062e-05, "loss": 0.0031, "step": 193 }, { "epoch": 1.0543478260869565, "grad_norm": 0.0917279431010188, "learning_rate": 1.5168545332914942e-05, "loss": 0.0032, "step": 194 }, { "epoch": 1.059782608695652, "grad_norm": 0.1599750281188914, "learning_rate": 1.5118186728054002e-05, "loss": 0.0034, "step": 195 }, { "epoch": 1.065217391304348, "grad_norm": 3.0052317701428906, "learning_rate": 1.50676516384936e-05, "loss": 0.2052, "step": 196 }, { "epoch": 1.0706521739130435, "grad_norm": 0.09347487309598097, "learning_rate": 1.5016941806778622e-05, "loss": 0.0024, "step": 197 }, { "epoch": 1.0760869565217392, "grad_norm": 0.6368154943577347, "learning_rate": 1.496605898147938e-05, "loss": 0.0112, "step": 198 }, { "epoch": 1.0815217391304348, "grad_norm": 0.08805765943523453, "learning_rate": 1.4915004917131345e-05, "loss": 0.0025, "step": 199 }, { "epoch": 1.0869565217391304, "grad_norm": 0.05469514003374087, "learning_rate": 1.4863781374174625e-05, "loss": 0.002, "step": 200 }, { "epoch": 1.0923913043478262, "grad_norm": 0.10652940546536208, "learning_rate": 1.4812390118893273e-05, "loss": 0.0032, "step": 201 }, { "epoch": 1.0978260869565217, "grad_norm": 4.207882558276106, "learning_rate": 1.4760832923354375e-05, "loss": 0.0583, "step": 202 }, { "epoch": 1.1032608695652173, "grad_norm": 0.0699647885839302, "learning_rate": 1.4709111565346948e-05, "loss": 0.0026, "step": 203 }, { "epoch": 1.108695652173913, "grad_norm": 0.30166623168218903, "learning_rate": 1.4657227828320637e-05, "loss": 0.006, "step": 204 }, { "epoch": 1.1141304347826086, "grad_norm": 4.199370993333585, "learning_rate": 1.4605183501324231e-05, "loss": 0.0775, "step": 205 }, { "epoch": 1.1195652173913044, "grad_norm": 0.32565218496952747, "learning_rate": 1.4552980378943953e-05, "loss": 0.0033, "step": 206 }, { "epoch": 1.125, "grad_norm": 0.0809703234967001, "learning_rate": 1.4500620261241598e-05, "loss": 0.0026, "step": 207 }, { "epoch": 1.1304347826086956, "grad_norm": 0.06883017026031267, "learning_rate": 1.4448104953692443e-05, "loss": 0.0019, "step": 208 }, { "epoch": 1.1358695652173914, "grad_norm": 0.08112137716749798, "learning_rate": 1.4395436267123017e-05, "loss": 0.0025, "step": 209 }, { "epoch": 1.141304347826087, "grad_norm": 0.0472362130550949, "learning_rate": 1.4342616017648632e-05, "loss": 0.0018, "step": 210 }, { "epoch": 1.1467391304347827, "grad_norm": 0.0884620238410297, "learning_rate": 1.4289646026610789e-05, "loss": 0.0021, "step": 211 }, { "epoch": 1.1521739130434783, "grad_norm": 0.04795365977948435, "learning_rate": 1.423652812051434e-05, "loss": 0.0017, "step": 212 }, { "epoch": 1.1576086956521738, "grad_norm": 0.02935797027689571, "learning_rate": 1.4183264130964545e-05, "loss": 0.0015, "step": 213 }, { "epoch": 1.1630434782608696, "grad_norm": 0.0668820523334726, "learning_rate": 1.4129855894603885e-05, "loss": 0.0027, "step": 214 }, { "epoch": 1.1684782608695652, "grad_norm": 0.7758685627388171, "learning_rate": 1.4076305253048748e-05, "loss": 0.0105, "step": 215 }, { "epoch": 1.1739130434782608, "grad_norm": 0.7009141120346845, "learning_rate": 1.4022614052825918e-05, "loss": 0.01, "step": 216 }, { "epoch": 1.1793478260869565, "grad_norm": 0.058294067779879076, "learning_rate": 1.3968784145308907e-05, "loss": 0.002, "step": 217 }, { "epoch": 1.184782608695652, "grad_norm": 0.09580668260043325, "learning_rate": 1.3914817386654112e-05, "loss": 0.0028, "step": 218 }, { "epoch": 1.190217391304348, "grad_norm": 4.864872194559485, "learning_rate": 1.3860715637736817e-05, "loss": 0.1252, "step": 219 }, { "epoch": 1.1956521739130435, "grad_norm": 0.15310828918627564, "learning_rate": 1.3806480764087027e-05, "loss": 0.003, "step": 220 }, { "epoch": 1.2010869565217392, "grad_norm": 0.3265801320494785, "learning_rate": 1.3752114635825138e-05, "loss": 0.005, "step": 221 }, { "epoch": 1.2065217391304348, "grad_norm": 4.409339908706341, "learning_rate": 1.369761912759744e-05, "loss": 0.1368, "step": 222 }, { "epoch": 1.2119565217391304, "grad_norm": 0.09658224964632216, "learning_rate": 1.3642996118511504e-05, "loss": 0.0027, "step": 223 }, { "epoch": 1.2173913043478262, "grad_norm": 0.13386998342251066, "learning_rate": 1.358824749207136e-05, "loss": 0.0029, "step": 224 }, { "epoch": 1.2228260869565217, "grad_norm": 0.058694075695156535, "learning_rate": 1.3533375136112563e-05, "loss": 0.0019, "step": 225 }, { "epoch": 1.2282608695652173, "grad_norm": 0.1675736492580823, "learning_rate": 1.3478380942737097e-05, "loss": 0.0041, "step": 226 }, { "epoch": 1.233695652173913, "grad_norm": 0.6605378406587118, "learning_rate": 1.3423266808248123e-05, "loss": 0.0064, "step": 227 }, { "epoch": 1.2391304347826086, "grad_norm": 0.07582983219640445, "learning_rate": 1.3368034633084603e-05, "loss": 0.0021, "step": 228 }, { "epoch": 1.2445652173913044, "grad_norm": 0.11839256459523798, "learning_rate": 1.331268632175576e-05, "loss": 0.0033, "step": 229 }, { "epoch": 1.25, "grad_norm": 0.498989993420891, "learning_rate": 1.3257223782775412e-05, "loss": 0.0058, "step": 230 }, { "epoch": 1.2554347826086958, "grad_norm": 0.0627689672183379, "learning_rate": 1.3201648928596164e-05, "loss": 0.0028, "step": 231 }, { "epoch": 1.2608695652173914, "grad_norm": 0.44003082591712833, "learning_rate": 1.3145963675543451e-05, "loss": 0.0056, "step": 232 }, { "epoch": 1.266304347826087, "grad_norm": 3.9655617256713556, "learning_rate": 1.3090169943749475e-05, "loss": 0.0738, "step": 233 }, { "epoch": 1.2717391304347827, "grad_norm": 0.1490491896911272, "learning_rate": 1.3034269657086993e-05, "loss": 0.003, "step": 234 }, { "epoch": 1.2771739130434783, "grad_norm": 0.255678738387853, "learning_rate": 1.2978264743102964e-05, "loss": 0.0036, "step": 235 }, { "epoch": 1.2826086956521738, "grad_norm": 0.08658556472142168, "learning_rate": 1.2922157132952106e-05, "loss": 0.003, "step": 236 }, { "epoch": 1.2880434782608696, "grad_norm": 0.056388528409829865, "learning_rate": 1.286594876133028e-05, "loss": 0.0016, "step": 237 }, { "epoch": 1.2934782608695652, "grad_norm": 1.5398049755885386, "learning_rate": 1.2809641566407802e-05, "loss": 0.0378, "step": 238 }, { "epoch": 1.2989130434782608, "grad_norm": 0.036566689298081184, "learning_rate": 1.27532374897626e-05, "loss": 0.0012, "step": 239 }, { "epoch": 1.3043478260869565, "grad_norm": 0.04920293791313143, "learning_rate": 1.2696738476313261e-05, "loss": 0.0017, "step": 240 }, { "epoch": 1.309782608695652, "grad_norm": 0.1402817359911882, "learning_rate": 1.2640146474251979e-05, "loss": 0.0036, "step": 241 }, { "epoch": 1.315217391304348, "grad_norm": 0.06831135225959813, "learning_rate": 1.258346343497736e-05, "loss": 0.0025, "step": 242 }, { "epoch": 1.3206521739130435, "grad_norm": 0.028285907167631727, "learning_rate": 1.2526691313027153e-05, "loss": 0.001, "step": 243 }, { "epoch": 1.3260869565217392, "grad_norm": 0.33707980146121225, "learning_rate": 1.2469832066010843e-05, "loss": 0.0074, "step": 244 }, { "epoch": 1.3315217391304348, "grad_norm": 0.02312342530538864, "learning_rate": 1.2412887654542147e-05, "loss": 0.001, "step": 245 }, { "epoch": 1.3369565217391304, "grad_norm": 0.026427047059385186, "learning_rate": 1.2355860042171421e-05, "loss": 0.0011, "step": 246 }, { "epoch": 1.3423913043478262, "grad_norm": 2.9263468296261164, "learning_rate": 1.2298751195317935e-05, "loss": 0.1557, "step": 247 }, { "epoch": 1.3478260869565217, "grad_norm": 0.020548021429656328, "learning_rate": 1.224156308320208e-05, "loss": 0.0009, "step": 248 }, { "epoch": 1.3532608695652173, "grad_norm": 0.025684644607937637, "learning_rate": 1.2184297677777463e-05, "loss": 0.0011, "step": 249 }, { "epoch": 1.358695652173913, "grad_norm": 0.4277199026740869, "learning_rate": 1.2126956953662914e-05, "loss": 0.0074, "step": 250 }, { "epoch": 1.3641304347826086, "grad_norm": 0.722362923284817, "learning_rate": 1.2069542888074386e-05, "loss": 0.0094, "step": 251 }, { "epoch": 1.3695652173913042, "grad_norm": 0.05042192018129352, "learning_rate": 1.2012057460756786e-05, "loss": 0.0016, "step": 252 }, { "epoch": 1.375, "grad_norm": 0.04160962471056512, "learning_rate": 1.1954502653915704e-05, "loss": 0.0014, "step": 253 }, { "epoch": 1.3804347826086958, "grad_norm": 0.04523201782339563, "learning_rate": 1.1896880452149077e-05, "loss": 0.0016, "step": 254 }, { "epoch": 1.3858695652173914, "grad_norm": 0.023639170674016628, "learning_rate": 1.1839192842378737e-05, "loss": 0.0009, "step": 255 }, { "epoch": 1.391304347826087, "grad_norm": 0.04866250108659108, "learning_rate": 1.1781441813781911e-05, "loss": 0.0014, "step": 256 }, { "epoch": 1.3967391304347827, "grad_norm": 0.027392748713626538, "learning_rate": 1.1723629357722622e-05, "loss": 0.001, "step": 257 }, { "epoch": 1.4021739130434783, "grad_norm": 0.04956045333392312, "learning_rate": 1.1665757467683025e-05, "loss": 0.0013, "step": 258 }, { "epoch": 1.4076086956521738, "grad_norm": 0.287445593085176, "learning_rate": 1.1607828139194683e-05, "loss": 0.0051, "step": 259 }, { "epoch": 1.4130434782608696, "grad_norm": 0.13531127988753577, "learning_rate": 1.1549843369769733e-05, "loss": 0.0023, "step": 260 }, { "epoch": 1.4184782608695652, "grad_norm": 0.16453092649100554, "learning_rate": 1.1491805158832028e-05, "loss": 0.0031, "step": 261 }, { "epoch": 1.4239130434782608, "grad_norm": 1.4301870845043336, "learning_rate": 1.1433715507648173e-05, "loss": 0.0166, "step": 262 }, { "epoch": 1.4293478260869565, "grad_norm": 0.06079450292325032, "learning_rate": 1.1375576419258543e-05, "loss": 0.0016, "step": 263 }, { "epoch": 1.434782608695652, "grad_norm": 0.12935761070271598, "learning_rate": 1.1317389898408188e-05, "loss": 0.0022, "step": 264 }, { "epoch": 1.440217391304348, "grad_norm": 0.06441466437879496, "learning_rate": 1.125915795147773e-05, "loss": 0.0017, "step": 265 }, { "epoch": 1.4456521739130435, "grad_norm": 0.11938010559111087, "learning_rate": 1.1200882586414168e-05, "loss": 0.0021, "step": 266 }, { "epoch": 1.4510869565217392, "grad_norm": 0.14576252527987352, "learning_rate": 1.114256581266162e-05, "loss": 0.0032, "step": 267 }, { "epoch": 1.4565217391304348, "grad_norm": 0.8091624068148694, "learning_rate": 1.1084209641092083e-05, "loss": 0.0098, "step": 268 }, { "epoch": 1.4619565217391304, "grad_norm": 0.07301592812987565, "learning_rate": 1.1025816083936036e-05, "loss": 0.0021, "step": 269 }, { "epoch": 1.4673913043478262, "grad_norm": 0.019465384139083376, "learning_rate": 1.0967387154713104e-05, "loss": 0.0008, "step": 270 }, { "epoch": 1.4728260869565217, "grad_norm": 0.02684807806576838, "learning_rate": 1.0908924868162605e-05, "loss": 0.0009, "step": 271 }, { "epoch": 1.4782608695652173, "grad_norm": 2.0536809709774086, "learning_rate": 1.0850431240174066e-05, "loss": 0.2241, "step": 272 }, { "epoch": 1.483695652173913, "grad_norm": 0.5395466577497267, "learning_rate": 1.0791908287717744e-05, "loss": 0.0097, "step": 273 }, { "epoch": 1.4891304347826086, "grad_norm": 3.6218348045652107, "learning_rate": 1.073335802877504e-05, "loss": 0.0488, "step": 274 }, { "epoch": 1.4945652173913042, "grad_norm": 0.0346000232826567, "learning_rate": 1.0674782482268953e-05, "loss": 0.0013, "step": 275 }, { "epoch": 1.5, "grad_norm": 0.031039844572176237, "learning_rate": 1.0616183667994435e-05, "loss": 0.0011, "step": 276 }, { "epoch": 1.5054347826086958, "grad_norm": 1.3869410436009917, "learning_rate": 1.0557563606548751e-05, "loss": 0.02, "step": 277 }, { "epoch": 1.5108695652173914, "grad_norm": 0.31857812561228843, "learning_rate": 1.0498924319261816e-05, "loss": 0.0046, "step": 278 }, { "epoch": 1.516304347826087, "grad_norm": 0.018901071551922013, "learning_rate": 1.0440267828126478e-05, "loss": 0.0007, "step": 279 }, { "epoch": 1.5217391304347827, "grad_norm": 0.35747451319055523, "learning_rate": 1.0381596155728823e-05, "loss": 0.0077, "step": 280 }, { "epoch": 1.5271739130434783, "grad_norm": 0.038504499041816166, "learning_rate": 1.0322911325178402e-05, "loss": 0.0012, "step": 281 }, { "epoch": 1.5326086956521738, "grad_norm": 0.061533456725221265, "learning_rate": 1.0264215360038483e-05, "loss": 0.0018, "step": 282 }, { "epoch": 1.5380434782608696, "grad_norm": 0.053405604412389306, "learning_rate": 1.0205510284256286e-05, "loss": 0.0014, "step": 283 }, { "epoch": 1.5434782608695652, "grad_norm": 0.1699993644991474, "learning_rate": 1.0146798122093167e-05, "loss": 0.0029, "step": 284 }, { "epoch": 1.5489130434782608, "grad_norm": 0.07043260478387495, "learning_rate": 1.0088080898054852e-05, "loss": 0.0013, "step": 285 }, { "epoch": 1.5543478260869565, "grad_norm": 0.050883436804006456, "learning_rate": 1.00293606368216e-05, "loss": 0.0018, "step": 286 }, { "epoch": 1.5597826086956523, "grad_norm": 0.2015858838482068, "learning_rate": 9.970639363178401e-06, "loss": 0.0034, "step": 287 }, { "epoch": 1.5652173913043477, "grad_norm": 0.15696624949542315, "learning_rate": 9.91191910194515e-06, "loss": 0.0024, "step": 288 }, { "epoch": 1.5706521739130435, "grad_norm": 0.016094697472839387, "learning_rate": 9.853201877906836e-06, "loss": 0.0007, "step": 289 }, { "epoch": 1.5760869565217392, "grad_norm": 2.6447259699825225, "learning_rate": 9.79448971574372e-06, "loss": 0.0868, "step": 290 }, { "epoch": 1.5815217391304348, "grad_norm": 0.034146999181789345, "learning_rate": 9.73578463996152e-06, "loss": 0.001, "step": 291 }, { "epoch": 1.5869565217391304, "grad_norm": 2.3913058327100507, "learning_rate": 9.677088674821601e-06, "loss": 0.0933, "step": 292 }, { "epoch": 1.5923913043478262, "grad_norm": 2.7206555164113113, "learning_rate": 9.618403844271179e-06, "loss": 0.0834, "step": 293 }, { "epoch": 1.5978260869565217, "grad_norm": 2.04432325341852, "learning_rate": 9.559732171873524e-06, "loss": 0.0509, "step": 294 }, { "epoch": 1.6032608695652173, "grad_norm": 3.408481044696874, "learning_rate": 9.50107568073819e-06, "loss": 0.1523, "step": 295 }, { "epoch": 1.608695652173913, "grad_norm": 0.15857623535162915, "learning_rate": 9.442436393451252e-06, "loss": 0.0037, "step": 296 }, { "epoch": 1.6141304347826086, "grad_norm": 0.48149742863897177, "learning_rate": 9.383816332005569e-06, "loss": 0.0066, "step": 297 }, { "epoch": 1.6195652173913042, "grad_norm": 0.43146507945514945, "learning_rate": 9.325217517731047e-06, "loss": 0.0063, "step": 298 }, { "epoch": 1.625, "grad_norm": 3.7183270965419526, "learning_rate": 9.266641971224963e-06, "loss": 0.0717, "step": 299 }, { "epoch": 1.6304347826086958, "grad_norm": 0.6284145395909966, "learning_rate": 9.208091712282261e-06, "loss": 0.0113, "step": 300 }, { "epoch": 1.6358695652173914, "grad_norm": 0.12204274733613643, "learning_rate": 9.149568759825937e-06, "loss": 0.003, "step": 301 }, { "epoch": 1.641304347826087, "grad_norm": 1.1716856729713159, "learning_rate": 9.091075131837399e-06, "loss": 0.016, "step": 302 }, { "epoch": 1.6467391304347827, "grad_norm": 2.3073801254975743, "learning_rate": 9.032612845286896e-06, "loss": 0.0625, "step": 303 }, { "epoch": 1.6521739130434783, "grad_norm": 0.24584369141616186, "learning_rate": 8.974183916063967e-06, "loss": 0.0038, "step": 304 }, { "epoch": 1.6576086956521738, "grad_norm": 0.896272637025756, "learning_rate": 8.915790358907924e-06, "loss": 0.0124, "step": 305 }, { "epoch": 1.6630434782608696, "grad_norm": 3.8696382415332957, "learning_rate": 8.857434187338381e-06, "loss": 0.0462, "step": 306 }, { "epoch": 1.6684782608695652, "grad_norm": 0.12503032249914797, "learning_rate": 8.799117413585836e-06, "loss": 0.0025, "step": 307 }, { "epoch": 1.6739130434782608, "grad_norm": 0.45154839467695335, "learning_rate": 8.740842048522268e-06, "loss": 0.0061, "step": 308 }, { "epoch": 1.6793478260869565, "grad_norm": 0.09419278918622512, "learning_rate": 8.682610101591813e-06, "loss": 0.002, "step": 309 }, { "epoch": 1.6847826086956523, "grad_norm": 0.4958479599321362, "learning_rate": 8.624423580741462e-06, "loss": 0.0086, "step": 310 }, { "epoch": 1.6902173913043477, "grad_norm": 0.11770008527271246, "learning_rate": 8.56628449235183e-06, "loss": 0.0025, "step": 311 }, { "epoch": 1.6956521739130435, "grad_norm": 0.369565128723298, "learning_rate": 8.508194841167975e-06, "loss": 0.0059, "step": 312 }, { "epoch": 1.7010869565217392, "grad_norm": 0.06235754588692365, "learning_rate": 8.450156630230267e-06, "loss": 0.0019, "step": 313 }, { "epoch": 1.7065217391304348, "grad_norm": 0.02787223131850643, "learning_rate": 8.39217186080532e-06, "loss": 0.0012, "step": 314 }, { "epoch": 1.7119565217391304, "grad_norm": 0.03719997929743275, "learning_rate": 8.334242532316977e-06, "loss": 0.0012, "step": 315 }, { "epoch": 1.7173913043478262, "grad_norm": 0.42795195182267215, "learning_rate": 8.276370642277383e-06, "loss": 0.0048, "step": 316 }, { "epoch": 1.7228260869565217, "grad_norm": 0.9372903840892463, "learning_rate": 8.21855818621809e-06, "loss": 0.0203, "step": 317 }, { "epoch": 1.7282608695652173, "grad_norm": 0.13870817101483046, "learning_rate": 8.160807157621262e-06, "loss": 0.0025, "step": 318 }, { "epoch": 1.733695652173913, "grad_norm": 0.2445880882562458, "learning_rate": 8.103119547850924e-06, "loss": 0.0037, "step": 319 }, { "epoch": 1.7391304347826086, "grad_norm": 0.06926518467785787, "learning_rate": 8.045497346084297e-06, "loss": 0.002, "step": 320 }, { "epoch": 1.7445652173913042, "grad_norm": 0.029704630377944685, "learning_rate": 7.98794253924322e-06, "loss": 0.0011, "step": 321 }, { "epoch": 1.75, "grad_norm": 0.02657434909385738, "learning_rate": 7.930457111925616e-06, "loss": 0.0012, "step": 322 }, { "epoch": 1.7554347826086958, "grad_norm": 0.087118861417369, "learning_rate": 7.873043046337086e-06, "loss": 0.002, "step": 323 }, { "epoch": 1.7608695652173914, "grad_norm": 0.029028883768708425, "learning_rate": 7.815702322222539e-06, "loss": 0.0009, "step": 324 }, { "epoch": 1.766304347826087, "grad_norm": 0.574091822654542, "learning_rate": 7.758436916797923e-06, "loss": 0.0092, "step": 325 }, { "epoch": 1.7717391304347827, "grad_norm": 0.043721730276414336, "learning_rate": 7.701248804682069e-06, "loss": 0.0014, "step": 326 }, { "epoch": 1.7771739130434783, "grad_norm": 2.4824141009923726, "learning_rate": 7.64413995782858e-06, "loss": 0.1501, "step": 327 }, { "epoch": 1.7826086956521738, "grad_norm": 0.3656857182755404, "learning_rate": 7.5871123454578534e-06, "loss": 0.0055, "step": 328 }, { "epoch": 1.7880434782608696, "grad_norm": 0.030565125424490584, "learning_rate": 7.530167933989161e-06, "loss": 0.001, "step": 329 }, { "epoch": 1.7934782608695652, "grad_norm": 0.6771809217496879, "learning_rate": 7.47330868697285e-06, "loss": 0.01, "step": 330 }, { "epoch": 1.7989130434782608, "grad_norm": 0.24573870561094346, "learning_rate": 7.4165365650226425e-06, "loss": 0.0049, "step": 331 }, { "epoch": 1.8043478260869565, "grad_norm": 0.8696535124002203, "learning_rate": 7.3598535257480244e-06, "loss": 0.0126, "step": 332 }, { "epoch": 1.8097826086956523, "grad_norm": 0.02189894312561321, "learning_rate": 7.30326152368674e-06, "loss": 0.0008, "step": 333 }, { "epoch": 1.8152173913043477, "grad_norm": 0.031609375803459974, "learning_rate": 7.246762510237404e-06, "loss": 0.0011, "step": 334 }, { "epoch": 1.8206521739130435, "grad_norm": 0.020342266321765227, "learning_rate": 7.1903584335922e-06, "loss": 0.0008, "step": 335 }, { "epoch": 1.8260869565217392, "grad_norm": 0.09248271114619741, "learning_rate": 7.134051238669722e-06, "loss": 0.0018, "step": 336 }, { "epoch": 1.8315217391304348, "grad_norm": 0.10061723518020388, "learning_rate": 7.077842867047897e-06, "loss": 0.0024, "step": 337 }, { "epoch": 1.8369565217391304, "grad_norm": 0.21992324150498122, "learning_rate": 7.021735256897035e-06, "loss": 0.0027, "step": 338 }, { "epoch": 1.8423913043478262, "grad_norm": 0.030816726743244916, "learning_rate": 6.965730342913011e-06, "loss": 0.0011, "step": 339 }, { "epoch": 1.8478260869565217, "grad_norm": 0.01683095603625154, "learning_rate": 6.909830056250527e-06, "loss": 0.0008, "step": 340 }, { "epoch": 1.8532608695652173, "grad_norm": 0.23379778261250125, "learning_rate": 6.8540363244565524e-06, "loss": 0.0043, "step": 341 }, { "epoch": 1.858695652173913, "grad_norm": 0.03675133534148478, "learning_rate": 6.798351071403839e-06, "loss": 0.001, "step": 342 }, { "epoch": 1.8641304347826086, "grad_norm": 0.1140408877999425, "learning_rate": 6.742776217224587e-06, "loss": 0.0027, "step": 343 }, { "epoch": 1.8695652173913042, "grad_norm": 0.02850900102579577, "learning_rate": 6.687313678244243e-06, "loss": 0.0009, "step": 344 }, { "epoch": 1.875, "grad_norm": 0.02532716939465366, "learning_rate": 6.6319653669154e-06, "loss": 0.001, "step": 345 }, { "epoch": 1.8804347826086958, "grad_norm": 0.10582087034471738, "learning_rate": 6.576733191751879e-06, "loss": 0.0029, "step": 346 }, { "epoch": 1.8858695652173914, "grad_norm": 2.4137374896779877, "learning_rate": 6.521619057262904e-06, "loss": 0.1004, "step": 347 }, { "epoch": 1.891304347826087, "grad_norm": 2.0298394937535122, "learning_rate": 6.466624863887437e-06, "loss": 0.0361, "step": 348 }, { "epoch": 1.8967391304347827, "grad_norm": 0.15424873092333466, "learning_rate": 6.411752507928643e-06, "loss": 0.0031, "step": 349 }, { "epoch": 1.9021739130434783, "grad_norm": 0.7343430535593085, "learning_rate": 6.357003881488499e-06, "loss": 0.0086, "step": 350 }, { "epoch": 1.9076086956521738, "grad_norm": 0.0169679254906056, "learning_rate": 6.302380872402562e-06, "loss": 0.0007, "step": 351 }, { "epoch": 1.9130434782608696, "grad_norm": 0.026108663412252976, "learning_rate": 6.247885364174866e-06, "loss": 0.001, "step": 352 }, { "epoch": 1.9184782608695652, "grad_norm": 0.022679414032134804, "learning_rate": 6.193519235912972e-06, "loss": 0.0008, "step": 353 }, { "epoch": 1.9239130434782608, "grad_norm": 0.02365404382322627, "learning_rate": 6.139284362263185e-06, "loss": 0.0008, "step": 354 }, { "epoch": 1.9293478260869565, "grad_norm": 0.014446988115359962, "learning_rate": 6.085182613345893e-06, "loss": 0.0006, "step": 355 }, { "epoch": 1.9347826086956523, "grad_norm": 0.016091425374232204, "learning_rate": 6.031215854691097e-06, "loss": 0.0007, "step": 356 }, { "epoch": 1.9402173913043477, "grad_norm": 0.01553827774955186, "learning_rate": 5.977385947174084e-06, "loss": 0.0007, "step": 357 }, { "epoch": 1.9456521739130435, "grad_norm": 0.17966133137766196, "learning_rate": 5.923694746951253e-06, "loss": 0.0028, "step": 358 }, { "epoch": 1.9510869565217392, "grad_norm": 0.02477310360295687, "learning_rate": 5.8701441053961185e-06, "loss": 0.0009, "step": 359 }, { "epoch": 1.9565217391304348, "grad_norm": 0.025478377542260965, "learning_rate": 5.816735869035458e-06, "loss": 0.0009, "step": 360 }, { "epoch": 1.9619565217391304, "grad_norm": 0.01385737253155479, "learning_rate": 5.7634718794856626e-06, "loss": 0.0006, "step": 361 }, { "epoch": 1.9673913043478262, "grad_norm": 0.2920694264321747, "learning_rate": 5.710353973389215e-06, "loss": 0.003, "step": 362 }, { "epoch": 1.9728260869565217, "grad_norm": 0.0609584809389905, "learning_rate": 5.657383982351368e-06, "loss": 0.0014, "step": 363 }, { "epoch": 1.9782608695652173, "grad_norm": 0.014022955163492444, "learning_rate": 5.604563732876989e-06, "loss": 0.0006, "step": 364 }, { "epoch": 1.983695652173913, "grad_norm": 0.02973603833790608, "learning_rate": 5.55189504630756e-06, "loss": 0.0009, "step": 365 }, { "epoch": 1.9891304347826086, "grad_norm": 0.07663989298851219, "learning_rate": 5.4993797387584056e-06, "loss": 0.0015, "step": 366 }, { "epoch": 1.9945652173913042, "grad_norm": 3.723476839809668, "learning_rate": 5.447019621056049e-06, "loss": 0.1512, "step": 367 }, { "epoch": 2.0, "grad_norm": 0.023508663828369594, "learning_rate": 5.394816498675772e-06, "loss": 0.0008, "step": 368 }, { "epoch": 2.005434782608696, "grad_norm": 0.014915331253251566, "learning_rate": 5.342772171679364e-06, "loss": 0.0006, "step": 369 }, { "epoch": 2.010869565217391, "grad_norm": 0.15045045132635565, "learning_rate": 5.290888434653056e-06, "loss": 0.0035, "step": 370 }, { "epoch": 2.016304347826087, "grad_norm": 0.02078710490582649, "learning_rate": 5.239167076645626e-06, "loss": 0.0009, "step": 371 }, { "epoch": 2.0217391304347827, "grad_norm": 0.08909809356955653, "learning_rate": 5.187609881106725e-06, "loss": 0.0021, "step": 372 }, { "epoch": 2.027173913043478, "grad_norm": 0.019002236928891497, "learning_rate": 5.136218625825374e-06, "loss": 0.0006, "step": 373 }, { "epoch": 2.032608695652174, "grad_norm": 0.04208850827532741, "learning_rate": 5.084995082868658e-06, "loss": 0.0009, "step": 374 }, { "epoch": 2.0380434782608696, "grad_norm": 0.046840875573742065, "learning_rate": 5.033941018520625e-06, "loss": 0.0014, "step": 375 }, { "epoch": 2.0434782608695654, "grad_norm": 0.1033934706809575, "learning_rate": 4.983058193221384e-06, "loss": 0.0019, "step": 376 }, { "epoch": 2.0489130434782608, "grad_norm": 0.1705166302206335, "learning_rate": 4.932348361506402e-06, "loss": 0.0033, "step": 377 }, { "epoch": 2.0543478260869565, "grad_norm": 0.028909235733879053, "learning_rate": 4.881813271946e-06, "loss": 0.0012, "step": 378 }, { "epoch": 2.0597826086956523, "grad_norm": 0.3030377695298429, "learning_rate": 4.831454667085059e-06, "loss": 0.0039, "step": 379 }, { "epoch": 2.0652173913043477, "grad_norm": 0.0477055277967709, "learning_rate": 4.781274283382941e-06, "loss": 0.001, "step": 380 }, { "epoch": 2.0706521739130435, "grad_norm": 0.0199106085983902, "learning_rate": 4.7312738511536035e-06, "loss": 0.0008, "step": 381 }, { "epoch": 2.0760869565217392, "grad_norm": 0.027962787198971308, "learning_rate": 4.681455094505938e-06, "loss": 0.001, "step": 382 }, { "epoch": 2.0815217391304346, "grad_norm": 0.0382934899009715, "learning_rate": 4.631819731284323e-06, "loss": 0.0011, "step": 383 }, { "epoch": 2.0869565217391304, "grad_norm": 0.013418670608056855, "learning_rate": 4.58236947300939e-06, "loss": 0.0006, "step": 384 }, { "epoch": 2.092391304347826, "grad_norm": 0.04141147762016092, "learning_rate": 4.5331060248189924e-06, "loss": 0.0013, "step": 385 }, { "epoch": 2.097826086956522, "grad_norm": 0.029823878767931914, "learning_rate": 4.4840310854094335e-06, "loss": 0.001, "step": 386 }, { "epoch": 2.1032608695652173, "grad_norm": 0.2181034359186816, "learning_rate": 4.435146346976873e-06, "loss": 0.004, "step": 387 }, { "epoch": 2.108695652173913, "grad_norm": 0.36490526428814946, "learning_rate": 4.386453495158983e-06, "loss": 0.0042, "step": 388 }, { "epoch": 2.114130434782609, "grad_norm": 0.0743305865977075, "learning_rate": 4.33795420897683e-06, "loss": 0.0011, "step": 389 }, { "epoch": 2.119565217391304, "grad_norm": 0.3000013681179252, "learning_rate": 4.289650160776967e-06, "loss": 0.0046, "step": 390 }, { "epoch": 2.125, "grad_norm": 0.05973611485866258, "learning_rate": 4.241543016173778e-06, "loss": 0.0011, "step": 391 }, { "epoch": 2.130434782608696, "grad_norm": 0.02140783876818863, "learning_rate": 4.19363443399204e-06, "loss": 0.0008, "step": 392 }, { "epoch": 2.135869565217391, "grad_norm": 0.01680791379596923, "learning_rate": 4.1459260662097235e-06, "loss": 0.0007, "step": 393 }, { "epoch": 2.141304347826087, "grad_norm": 0.5362708346340234, "learning_rate": 4.098419557901036e-06, "loss": 0.0077, "step": 394 }, { "epoch": 2.1467391304347827, "grad_norm": 0.016360773071928784, "learning_rate": 4.051116547179677e-06, "loss": 0.0007, "step": 395 }, { "epoch": 2.1521739130434785, "grad_norm": 0.28985199290673336, "learning_rate": 4.00401866514238e-06, "loss": 0.0044, "step": 396 }, { "epoch": 2.157608695652174, "grad_norm": 0.01604718518106245, "learning_rate": 3.957127535812651e-06, "loss": 0.0007, "step": 397 }, { "epoch": 2.1630434782608696, "grad_norm": 0.05241001721895836, "learning_rate": 3.910444776084777e-06, "loss": 0.0016, "step": 398 }, { "epoch": 2.1684782608695654, "grad_norm": 0.02209678496389779, "learning_rate": 3.8639719956680624e-06, "loss": 0.0008, "step": 399 }, { "epoch": 2.1739130434782608, "grad_norm": 0.020559716878607803, "learning_rate": 3.817710797031338e-06, "loss": 0.0008, "step": 400 }, { "epoch": 2.1793478260869565, "grad_norm": 0.014824391810911752, "learning_rate": 3.771662775347692e-06, "loss": 0.0006, "step": 401 }, { "epoch": 2.1847826086956523, "grad_norm": 0.015796576868617806, "learning_rate": 3.7258295184394743e-06, "loss": 0.0007, "step": 402 }, { "epoch": 2.1902173913043477, "grad_norm": 1.9188157660999832, "learning_rate": 3.680212606723542e-06, "loss": 0.0306, "step": 403 }, { "epoch": 2.1956521739130435, "grad_norm": 0.06391438687127189, "learning_rate": 3.6348136131567537e-06, "loss": 0.0019, "step": 404 }, { "epoch": 2.2010869565217392, "grad_norm": 0.17262747887734978, "learning_rate": 3.5896341031817517e-06, "loss": 0.0036, "step": 405 }, { "epoch": 2.2065217391304346, "grad_norm": 0.056665382264410494, "learning_rate": 3.5446756346729673e-06, "loss": 0.0012, "step": 406 }, { "epoch": 2.2119565217391304, "grad_norm": 1.9642610912379441, "learning_rate": 3.4999397578829076e-06, "loss": 0.037, "step": 407 }, { "epoch": 2.217391304347826, "grad_norm": 0.014116778100650137, "learning_rate": 3.4554280153886967e-06, "loss": 0.0006, "step": 408 }, { "epoch": 2.2228260869565215, "grad_norm": 0.024488008150664965, "learning_rate": 3.4111419420388904e-06, "loss": 0.001, "step": 409 }, { "epoch": 2.2282608695652173, "grad_norm": 0.5674032898921303, "learning_rate": 3.3670830649005437e-06, "loss": 0.0041, "step": 410 }, { "epoch": 2.233695652173913, "grad_norm": 0.02286422293729417, "learning_rate": 3.323252903206562e-06, "loss": 0.0009, "step": 411 }, { "epoch": 2.239130434782609, "grad_norm": 0.27168054236566974, "learning_rate": 3.279652968303313e-06, "loss": 0.0043, "step": 412 }, { "epoch": 2.244565217391304, "grad_norm": 0.1593898805811067, "learning_rate": 3.236284763598512e-06, "loss": 0.0035, "step": 413 }, { "epoch": 2.25, "grad_norm": 0.013081366094026997, "learning_rate": 3.1931497845093753e-06, "loss": 0.0006, "step": 414 }, { "epoch": 2.255434782608696, "grad_norm": 0.012814297915516075, "learning_rate": 3.150249518411067e-06, "loss": 0.0006, "step": 415 }, { "epoch": 2.260869565217391, "grad_norm": 0.07415100436276072, "learning_rate": 3.1075854445854093e-06, "loss": 0.0018, "step": 416 }, { "epoch": 2.266304347826087, "grad_norm": 0.027114643295979856, "learning_rate": 3.0651590341698633e-06, "loss": 0.0009, "step": 417 }, { "epoch": 2.2717391304347827, "grad_norm": 0.13722514020501544, "learning_rate": 3.0229717501068133e-06, "loss": 0.0023, "step": 418 }, { "epoch": 2.2771739130434785, "grad_norm": 0.023053695918606187, "learning_rate": 2.981025047093118e-06, "loss": 0.0009, "step": 419 }, { "epoch": 2.282608695652174, "grad_norm": 3.7468189613648253, "learning_rate": 2.9393203715299477e-06, "loss": 0.0598, "step": 420 }, { "epoch": 2.2880434782608696, "grad_norm": 0.08634045866789929, "learning_rate": 2.8978591614729114e-06, "loss": 0.0015, "step": 421 }, { "epoch": 2.2934782608695654, "grad_norm": 0.13994711242571936, "learning_rate": 2.856642846582469e-06, "loss": 0.0019, "step": 422 }, { "epoch": 2.2989130434782608, "grad_norm": 0.0519996408733201, "learning_rate": 2.8156728480746386e-06, "loss": 0.0011, "step": 423 }, { "epoch": 2.3043478260869565, "grad_norm": 0.01904905289611891, "learning_rate": 2.77495057867198e-06, "loss": 0.0007, "step": 424 }, { "epoch": 2.3097826086956523, "grad_norm": 1.2476206988634295, "learning_rate": 2.7344774425548917e-06, "loss": 0.0339, "step": 425 }, { "epoch": 2.3152173913043477, "grad_norm": 1.7884596495622582, "learning_rate": 2.694254835313187e-06, "loss": 0.1375, "step": 426 }, { "epoch": 2.3206521739130435, "grad_norm": 0.31025512064642874, "learning_rate": 2.654284143897976e-06, "loss": 0.0034, "step": 427 }, { "epoch": 2.3260869565217392, "grad_norm": 0.3488873501510679, "learning_rate": 2.6145667465738333e-06, "loss": 0.0039, "step": 428 }, { "epoch": 2.3315217391304346, "grad_norm": 0.589409734181312, "learning_rate": 2.57510401287128e-06, "loss": 0.0044, "step": 429 }, { "epoch": 2.3369565217391304, "grad_norm": 0.3987654975780055, "learning_rate": 2.535897303539554e-06, "loss": 0.0061, "step": 430 }, { "epoch": 2.342391304347826, "grad_norm": 0.015719041310887562, "learning_rate": 2.4969479704996935e-06, "loss": 0.0006, "step": 431 }, { "epoch": 2.3478260869565215, "grad_norm": 0.015180271606601303, "learning_rate": 2.4582573567979196e-06, "loss": 0.0006, "step": 432 }, { "epoch": 2.3532608695652173, "grad_norm": 0.04482488635397311, "learning_rate": 2.4198267965593224e-06, "loss": 0.0011, "step": 433 }, { "epoch": 2.358695652173913, "grad_norm": 0.28160845350626884, "learning_rate": 2.381657614941858e-06, "loss": 0.005, "step": 434 }, { "epoch": 2.364130434782609, "grad_norm": 0.09873212459265543, "learning_rate": 2.3437511280906576e-06, "loss": 0.002, "step": 435 }, { "epoch": 2.369565217391304, "grad_norm": 0.028522981368259783, "learning_rate": 2.306108643092647e-06, "loss": 0.0008, "step": 436 }, { "epoch": 2.375, "grad_norm": 0.030887088059580514, "learning_rate": 2.268731457931467e-06, "loss": 0.001, "step": 437 }, { "epoch": 2.380434782608696, "grad_norm": 0.2056153085824592, "learning_rate": 2.2316208614427226e-06, "loss": 0.003, "step": 438 }, { "epoch": 2.385869565217391, "grad_norm": 0.03316498797260578, "learning_rate": 2.1947781332695406e-06, "loss": 0.001, "step": 439 }, { "epoch": 2.391304347826087, "grad_norm": 0.020603866879399167, "learning_rate": 2.1582045438184464e-06, "loss": 0.0007, "step": 440 }, { "epoch": 2.3967391304347827, "grad_norm": 0.022416446968247912, "learning_rate": 2.121901354215553e-06, "loss": 0.0008, "step": 441 }, { "epoch": 2.4021739130434785, "grad_norm": 1.2759832400444016, "learning_rate": 2.085869816263081e-06, "loss": 0.0222, "step": 442 }, { "epoch": 2.407608695652174, "grad_norm": 2.7040121657564558, "learning_rate": 2.050111172396192e-06, "loss": 0.0472, "step": 443 }, { "epoch": 2.4130434782608696, "grad_norm": 0.10233992459998235, "learning_rate": 2.0146266556401405e-06, "loss": 0.0016, "step": 444 }, { "epoch": 2.4184782608695654, "grad_norm": 0.244848209656816, "learning_rate": 1.97941748956777e-06, "loss": 0.004, "step": 445 }, { "epoch": 2.4239130434782608, "grad_norm": 0.05688444318906805, "learning_rate": 1.944484888257312e-06, "loss": 0.0013, "step": 446 }, { "epoch": 2.4293478260869565, "grad_norm": 0.5574195380686696, "learning_rate": 1.9098300562505266e-06, "loss": 0.0112, "step": 447 }, { "epoch": 2.4347826086956523, "grad_norm": 0.0932057849593417, "learning_rate": 1.8754541885111631e-06, "loss": 0.0018, "step": 448 }, { "epoch": 2.4402173913043477, "grad_norm": 0.10747253772821316, "learning_rate": 1.8413584703837618e-06, "loss": 0.0018, "step": 449 }, { "epoch": 2.4456521739130435, "grad_norm": 0.39067007335009907, "learning_rate": 1.8075440775527754e-06, "loss": 0.0063, "step": 450 }, { "epoch": 2.4510869565217392, "grad_norm": 0.028328534672816628, "learning_rate": 1.7740121760020324e-06, "loss": 0.001, "step": 451 }, { "epoch": 2.4565217391304346, "grad_norm": 0.12079880404676811, "learning_rate": 1.740763921974531e-06, "loss": 0.0024, "step": 452 }, { "epoch": 2.4619565217391304, "grad_norm": 0.10850662346060039, "learning_rate": 1.7078004619325728e-06, "loss": 0.0017, "step": 453 }, { "epoch": 2.467391304347826, "grad_norm": 0.2673103325118139, "learning_rate": 1.6751229325182194e-06, "loss": 0.0067, "step": 454 }, { "epoch": 2.4728260869565215, "grad_norm": 0.20052250560415452, "learning_rate": 1.6427324605141125e-06, "loss": 0.0037, "step": 455 }, { "epoch": 2.4782608695652173, "grad_norm": 0.08452549445673675, "learning_rate": 1.610630162804615e-06, "loss": 0.0015, "step": 456 }, { "epoch": 2.483695652173913, "grad_norm": 0.01638519542637996, "learning_rate": 1.578817146337297e-06, "loss": 0.0006, "step": 457 }, { "epoch": 2.489130434782609, "grad_norm": 0.03107206330508472, "learning_rate": 1.5472945080847679e-06, "loss": 0.0008, "step": 458 }, { "epoch": 2.494565217391304, "grad_norm": 0.03654411098415488, "learning_rate": 1.516063335006851e-06, "loss": 0.0009, "step": 459 }, { "epoch": 2.5, "grad_norm": 0.07287899663917816, "learning_rate": 1.485124704013101e-06, "loss": 0.0017, "step": 460 }, { "epoch": 2.505434782608696, "grad_norm": 0.9588849867572242, "learning_rate": 1.4544796819256724e-06, "loss": 0.0086, "step": 461 }, { "epoch": 2.5108695652173916, "grad_norm": 0.02467713549047941, "learning_rate": 1.4241293254425337e-06, "loss": 0.0007, "step": 462 }, { "epoch": 2.516304347826087, "grad_norm": 0.04748495142661645, "learning_rate": 1.3940746811010297e-06, "loss": 0.0011, "step": 463 }, { "epoch": 2.5217391304347827, "grad_norm": 0.03054669361577949, "learning_rate": 1.3643167852417894e-06, "loss": 0.001, "step": 464 }, { "epoch": 2.5271739130434785, "grad_norm": 0.027111109257002528, "learning_rate": 1.3348566639730032e-06, "loss": 0.0011, "step": 465 }, { "epoch": 2.532608695652174, "grad_norm": 0.04377035701857717, "learning_rate": 1.3056953331350297e-06, "loss": 0.001, "step": 466 }, { "epoch": 2.5380434782608696, "grad_norm": 0.08382313642398824, "learning_rate": 1.2768337982653744e-06, "loss": 0.0014, "step": 467 }, { "epoch": 2.5434782608695654, "grad_norm": 0.030219514519134735, "learning_rate": 1.2482730545640133e-06, "loss": 0.0011, "step": 468 }, { "epoch": 2.5489130434782608, "grad_norm": 0.42539314485494417, "learning_rate": 1.2200140868590759e-06, "loss": 0.0063, "step": 469 }, { "epoch": 2.5543478260869565, "grad_norm": 0.025687483062924163, "learning_rate": 1.1920578695728903e-06, "loss": 0.0009, "step": 470 }, { "epoch": 2.5597826086956523, "grad_norm": 0.027491319722765094, "learning_rate": 1.1644053666883803e-06, "loss": 0.0009, "step": 471 }, { "epoch": 2.5652173913043477, "grad_norm": 0.12070804850917503, "learning_rate": 1.137057531715825e-06, "loss": 0.0023, "step": 472 }, { "epoch": 2.5706521739130435, "grad_norm": 0.1648819505998384, "learning_rate": 1.1100153076599862e-06, "loss": 0.0025, "step": 473 }, { "epoch": 2.5760869565217392, "grad_norm": 0.1168751069545925, "learning_rate": 1.0832796269875757e-06, "loss": 0.0023, "step": 474 }, { "epoch": 2.5815217391304346, "grad_norm": 0.030968178239974237, "learning_rate": 1.0568514115951256e-06, "loss": 0.001, "step": 475 }, { "epoch": 2.5869565217391304, "grad_norm": 1.2108714841296098, "learning_rate": 1.0307315727771806e-06, "loss": 0.0126, "step": 476 }, { "epoch": 2.592391304347826, "grad_norm": 0.027899777268609836, "learning_rate": 1.0049210111948815e-06, "loss": 0.0009, "step": 477 }, { "epoch": 2.5978260869565215, "grad_norm": 0.03180410299281123, "learning_rate": 9.794206168449127e-07, "loss": 0.0009, "step": 478 }, { "epoch": 2.6032608695652173, "grad_norm": 0.033244233145600086, "learning_rate": 9.542312690288035e-07, "loss": 0.0009, "step": 479 }, { "epoch": 2.608695652173913, "grad_norm": 0.03761724722059268, "learning_rate": 9.293538363226196e-07, "loss": 0.0013, "step": 480 }, { "epoch": 2.6141304347826084, "grad_norm": 0.09136376989366057, "learning_rate": 9.04789176547004e-07, "loss": 0.0018, "step": 481 }, { "epoch": 2.619565217391304, "grad_norm": 0.18059210345284965, "learning_rate": 8.80538136737602e-07, "loss": 0.0029, "step": 482 }, { "epoch": 2.625, "grad_norm": 0.030807943380701246, "learning_rate": 8.566015531158534e-07, "loss": 0.0008, "step": 483 }, { "epoch": 2.630434782608696, "grad_norm": 0.05710411212363332, "learning_rate": 8.329802510601559e-07, "loss": 0.0014, "step": 484 }, { "epoch": 2.6358695652173916, "grad_norm": 0.061848371459409315, "learning_rate": 8.096750450774071e-07, "loss": 0.0016, "step": 485 }, { "epoch": 2.641304347826087, "grad_norm": 1.0253370343843025, "learning_rate": 7.866867387749199e-07, "loss": 0.0166, "step": 486 }, { "epoch": 2.6467391304347827, "grad_norm": 0.029136594892818037, "learning_rate": 7.640161248327061e-07, "loss": 0.001, "step": 487 }, { "epoch": 2.6521739130434785, "grad_norm": 1.092489264260611, "learning_rate": 7.416639849761531e-07, "loss": 0.0248, "step": 488 }, { "epoch": 2.657608695652174, "grad_norm": 2.2914238948250363, "learning_rate": 7.196310899490577e-07, "loss": 0.0723, "step": 489 }, { "epoch": 2.6630434782608696, "grad_norm": 0.016249601644455224, "learning_rate": 6.979181994870587e-07, "loss": 0.0007, "step": 490 }, { "epoch": 2.6684782608695654, "grad_norm": 0.021265124563151435, "learning_rate": 6.765260622914361e-07, "loss": 0.0007, "step": 491 }, { "epoch": 2.6739130434782608, "grad_norm": 0.03831610583206101, "learning_rate": 6.554554160032899e-07, "loss": 0.001, "step": 492 }, { "epoch": 2.6793478260869565, "grad_norm": 0.03101608692853337, "learning_rate": 6.347069871781164e-07, "loss": 0.0009, "step": 493 }, { "epoch": 2.6847826086956523, "grad_norm": 0.01978989576112469, "learning_rate": 6.142814912607409e-07, "loss": 0.0008, "step": 494 }, { "epoch": 2.6902173913043477, "grad_norm": 0.3852432741704962, "learning_rate": 5.941796325606574e-07, "loss": 0.007, "step": 495 }, { "epoch": 2.6956521739130435, "grad_norm": 0.39628033120487305, "learning_rate": 5.744021042277437e-07, "loss": 0.0052, "step": 496 }, { "epoch": 2.7010869565217392, "grad_norm": 0.09815745867450933, "learning_rate": 5.549495882283528e-07, "loss": 0.0019, "step": 497 }, { "epoch": 2.7065217391304346, "grad_norm": 2.2778045886314655, "learning_rate": 5.358227553218031e-07, "loss": 0.0699, "step": 498 }, { "epoch": 2.7119565217391304, "grad_norm": 0.027783255312989117, "learning_rate": 5.17022265037247e-07, "loss": 0.0009, "step": 499 }, { "epoch": 2.717391304347826, "grad_norm": 0.04524039432637041, "learning_rate": 4.985487656509313e-07, "loss": 0.0013, "step": 500 }, { "epoch": 2.7228260869565215, "grad_norm": 1.8660426088847626, "learning_rate": 4.804028941638405e-07, "loss": 0.0379, "step": 501 }, { "epoch": 2.7282608695652173, "grad_norm": 0.05194490259797287, "learning_rate": 4.6258527627973446e-07, "loss": 0.0011, "step": 502 }, { "epoch": 2.733695652173913, "grad_norm": 0.5524275731086881, "learning_rate": 4.450965263835694e-07, "loss": 0.0059, "step": 503 }, { "epoch": 2.7391304347826084, "grad_norm": 0.09638176861935786, "learning_rate": 4.2793724752031807e-07, "loss": 0.0014, "step": 504 }, { "epoch": 2.744565217391304, "grad_norm": 1.5902794253403654, "learning_rate": 4.111080313741711e-07, "loss": 0.0265, "step": 505 }, { "epoch": 2.75, "grad_norm": 0.027472533837749617, "learning_rate": 3.9460945824813635e-07, "loss": 0.0007, "step": 506 }, { "epoch": 2.755434782608696, "grad_norm": 0.1279143225656888, "learning_rate": 3.7844209704403055e-07, "loss": 0.0029, "step": 507 }, { "epoch": 2.7608695652173916, "grad_norm": 0.026463459883835142, "learning_rate": 3.626065052428551e-07, "loss": 0.0008, "step": 508 }, { "epoch": 2.766304347826087, "grad_norm": 0.27505638314757236, "learning_rate": 3.471032288855869e-07, "loss": 0.0041, "step": 509 }, { "epoch": 2.7717391304347827, "grad_norm": 0.03755249242727417, "learning_rate": 3.3193280255433556e-07, "loss": 0.0011, "step": 510 }, { "epoch": 2.7771739130434785, "grad_norm": 1.3351363822022542, "learning_rate": 3.170957493539195e-07, "loss": 0.0158, "step": 511 }, { "epoch": 2.782608695652174, "grad_norm": 0.02416580008302714, "learning_rate": 3.0259258089382236e-07, "loss": 0.0009, "step": 512 }, { "epoch": 2.7880434782608696, "grad_norm": 0.24305894735810873, "learning_rate": 2.88423797270555e-07, "loss": 0.0033, "step": 513 }, { "epoch": 2.7934782608695654, "grad_norm": 0.0170002796253045, "learning_rate": 2.745898870504116e-07, "loss": 0.0006, "step": 514 }, { "epoch": 2.7989130434782608, "grad_norm": 0.07161898082689806, "learning_rate": 2.6109132725262166e-07, "loss": 0.0017, "step": 515 }, { "epoch": 2.8043478260869565, "grad_norm": 1.0122242308252756, "learning_rate": 2.479285833329015e-07, "loss": 0.0147, "step": 516 }, { "epoch": 2.8097826086956523, "grad_norm": 0.32571610548502183, "learning_rate": 2.351021091674044e-07, "loss": 0.0056, "step": 517 }, { "epoch": 2.8152173913043477, "grad_norm": 0.04130977709089724, "learning_rate": 2.226123470370689e-07, "loss": 0.0012, "step": 518 }, { "epoch": 2.8206521739130435, "grad_norm": 1.7181531921107351, "learning_rate": 2.104597276123721e-07, "loss": 0.0401, "step": 519 }, { "epoch": 2.8260869565217392, "grad_norm": 0.02705959014069028, "learning_rate": 1.9864466993847808e-07, "loss": 0.0009, "step": 520 }, { "epoch": 2.8315217391304346, "grad_norm": 0.018768961604310398, "learning_rate": 1.8716758142078295e-07, "loss": 0.0007, "step": 521 }, { "epoch": 2.8369565217391304, "grad_norm": 0.018345985510552047, "learning_rate": 1.7602885781087486e-07, "loss": 0.0008, "step": 522 }, { "epoch": 2.842391304347826, "grad_norm": 0.02605179058078712, "learning_rate": 1.6522888319288166e-07, "loss": 0.0009, "step": 523 }, { "epoch": 2.8478260869565215, "grad_norm": 0.03616601605859018, "learning_rate": 1.5476802997022812e-07, "loss": 0.001, "step": 524 }, { "epoch": 2.8532608695652173, "grad_norm": 0.025004543897905514, "learning_rate": 1.4464665885279948e-07, "loss": 0.0008, "step": 525 }, { "epoch": 2.858695652173913, "grad_norm": 2.0898366886384756, "learning_rate": 1.3486511884449827e-07, "loss": 0.0181, "step": 526 }, { "epoch": 2.8641304347826084, "grad_norm": 1.4763355304020689, "learning_rate": 1.254237472312092e-07, "loss": 0.0246, "step": 527 }, { "epoch": 2.869565217391304, "grad_norm": 0.012470194296572264, "learning_rate": 1.1632286956917427e-07, "loss": 0.0006, "step": 528 }, { "epoch": 2.875, "grad_norm": 0.01829002489612097, "learning_rate": 1.075627996737627e-07, "loss": 0.0008, "step": 529 }, { "epoch": 2.880434782608696, "grad_norm": 0.38603420185863435, "learning_rate": 9.914383960865081e-08, "loss": 0.0047, "step": 530 }, { "epoch": 2.8858695652173916, "grad_norm": 0.19367430087338477, "learning_rate": 9.106627967540915e-08, "loss": 0.0024, "step": 531 }, { "epoch": 2.891304347826087, "grad_norm": 0.4411181370450418, "learning_rate": 8.333039840348833e-08, "loss": 0.0042, "step": 532 }, { "epoch": 2.8967391304347827, "grad_norm": 0.012209939294930026, "learning_rate": 7.593646254061448e-08, "loss": 0.0006, "step": 533 }, { "epoch": 2.9021739130434785, "grad_norm": 0.330056623962809, "learning_rate": 6.888472704359661e-08, "loss": 0.006, "step": 534 }, { "epoch": 2.907608695652174, "grad_norm": 0.034219507512194006, "learning_rate": 6.217543506952916e-08, "loss": 0.001, "step": 535 }, { "epoch": 2.9130434782608696, "grad_norm": 0.018554429841025816, "learning_rate": 5.580881796741322e-08, "loss": 0.0007, "step": 536 }, { "epoch": 2.9184782608695654, "grad_norm": 0.03648100584653491, "learning_rate": 4.978509527017283e-08, "loss": 0.0009, "step": 537 }, { "epoch": 2.9239130434782608, "grad_norm": 0.024703372477637507, "learning_rate": 4.410447468709001e-08, "loss": 0.001, "step": 538 }, { "epoch": 2.9293478260869565, "grad_norm": 0.04043268074636393, "learning_rate": 3.8767152096641504e-08, "loss": 0.001, "step": 539 }, { "epoch": 2.9347826086956523, "grad_norm": 0.046268704953150705, "learning_rate": 3.377331153974206e-08, "loss": 0.0015, "step": 540 }, { "epoch": 2.9402173913043477, "grad_norm": 0.06556550271817785, "learning_rate": 2.912312521340277e-08, "loss": 0.001, "step": 541 }, { "epoch": 2.9456521739130435, "grad_norm": 0.11585425133268303, "learning_rate": 2.4816753464789177e-08, "loss": 0.0018, "step": 542 }, { "epoch": 2.9510869565217392, "grad_norm": 0.07767672609852741, "learning_rate": 2.0854344785694593e-08, "loss": 0.0016, "step": 543 }, { "epoch": 2.9565217391304346, "grad_norm": 0.4371414896745222, "learning_rate": 1.7236035807416397e-08, "loss": 0.0058, "step": 544 }, { "epoch": 2.9619565217391304, "grad_norm": 0.05437143993551097, "learning_rate": 1.3961951296053156e-08, "loss": 0.0012, "step": 545 }, { "epoch": 2.967391304347826, "grad_norm": 0.08967830928285274, "learning_rate": 1.1032204148191395e-08, "loss": 0.0015, "step": 546 }, { "epoch": 2.9728260869565215, "grad_norm": 0.0632058201324481, "learning_rate": 8.446895387019815e-09, "loss": 0.0013, "step": 547 }, { "epoch": 2.9782608695652173, "grad_norm": 0.5781101930402831, "learning_rate": 6.206114158845422e-09, "loss": 0.0104, "step": 548 }, { "epoch": 2.983695652173913, "grad_norm": 0.03599473788037359, "learning_rate": 4.309937730015978e-09, "loss": 0.0009, "step": 549 }, { "epoch": 2.9891304347826084, "grad_norm": 0.029545010274494552, "learning_rate": 2.758431484259916e-09, "loss": 0.001, "step": 550 }, { "epoch": 2.994565217391304, "grad_norm": 0.04112363341260623, "learning_rate": 1.5516489204303598e-09, "loss": 0.001, "step": 551 }, { "epoch": 3.0, "grad_norm": 0.28259585170188845, "learning_rate": 6.896316506554979e-10, "loss": 0.0056, "step": 552 }, { "epoch": 3.0, "step": 552, "total_flos": 4395674998272.0, "train_loss": 0.49586228307948593, "train_runtime": 3133.0691, "train_samples_per_second": 2.813, "train_steps_per_second": 0.176 } ], "logging_steps": 1.0, "max_steps": 552, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000, "total_flos": 4395674998272.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }