Mamba-370m-88F1-45000 / trainer_state.json
ChlorophyllChampion's picture
Upload 11 files
82f0515 verified
raw
history blame
No virus
72.8 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9820824512777984,
"eval_steps": 500,
"global_step": 45000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.288641095161438,
"learning_rate": 9.97817594552716e-06,
"loss": 5.7446,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 0.8790096044540405,
"learning_rate": 9.956351891054321e-06,
"loss": 5.5963,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 0.7012779712677002,
"learning_rate": 9.93452783658148e-06,
"loss": 5.4268,
"step": 300
},
{
"epoch": 0.01,
"grad_norm": 0.5707438588142395,
"learning_rate": 9.912703782108642e-06,
"loss": 5.3015,
"step": 400
},
{
"epoch": 0.01,
"grad_norm": 0.4737650454044342,
"learning_rate": 9.890879727635801e-06,
"loss": 5.191,
"step": 500
},
{
"epoch": 0.01,
"grad_norm": 0.383696585893631,
"learning_rate": 9.86905567316296e-06,
"loss": 5.0995,
"step": 600
},
{
"epoch": 0.02,
"grad_norm": 0.4450148940086365,
"learning_rate": 9.847231618690121e-06,
"loss": 5.0545,
"step": 700
},
{
"epoch": 0.02,
"grad_norm": 0.40447548031806946,
"learning_rate": 9.82540756421728e-06,
"loss": 5.0217,
"step": 800
},
{
"epoch": 0.02,
"grad_norm": 0.44727349281311035,
"learning_rate": 9.80358350974444e-06,
"loss": 4.9963,
"step": 900
},
{
"epoch": 0.02,
"grad_norm": 0.3116198778152466,
"learning_rate": 9.781759455271601e-06,
"loss": 4.9777,
"step": 1000
},
{
"epoch": 0.02,
"grad_norm": 0.2814035415649414,
"learning_rate": 9.75993540079876e-06,
"loss": 4.9637,
"step": 1100
},
{
"epoch": 0.03,
"grad_norm": 0.49883121252059937,
"learning_rate": 9.738111346325922e-06,
"loss": 4.9526,
"step": 1200
},
{
"epoch": 0.03,
"grad_norm": 0.22215589880943298,
"learning_rate": 9.716287291853081e-06,
"loss": 4.9413,
"step": 1300
},
{
"epoch": 0.03,
"grad_norm": 0.2715405225753784,
"learning_rate": 9.694463237380242e-06,
"loss": 4.934,
"step": 1400
},
{
"epoch": 0.03,
"grad_norm": 0.1865217238664627,
"learning_rate": 9.672639182907401e-06,
"loss": 4.9255,
"step": 1500
},
{
"epoch": 0.03,
"grad_norm": 0.5209760069847107,
"learning_rate": 9.65081512843456e-06,
"loss": 4.9237,
"step": 1600
},
{
"epoch": 0.04,
"grad_norm": 0.20881299674510956,
"learning_rate": 9.62899107396172e-06,
"loss": 4.9139,
"step": 1700
},
{
"epoch": 0.04,
"grad_norm": 0.2848042845726013,
"learning_rate": 9.607167019488881e-06,
"loss": 4.9177,
"step": 1800
},
{
"epoch": 0.04,
"grad_norm": 0.49689555168151855,
"learning_rate": 9.58534296501604e-06,
"loss": 4.907,
"step": 1900
},
{
"epoch": 0.04,
"grad_norm": 0.37812674045562744,
"learning_rate": 9.563518910543202e-06,
"loss": 4.9057,
"step": 2000
},
{
"epoch": 0.05,
"grad_norm": 0.2938688099384308,
"learning_rate": 9.541694856070361e-06,
"loss": 4.899,
"step": 2100
},
{
"epoch": 0.05,
"grad_norm": 0.8247494101524353,
"learning_rate": 9.519870801597522e-06,
"loss": 4.8931,
"step": 2200
},
{
"epoch": 0.05,
"grad_norm": 0.3141603469848633,
"learning_rate": 9.498046747124681e-06,
"loss": 4.8942,
"step": 2300
},
{
"epoch": 0.05,
"grad_norm": 0.35256093740463257,
"learning_rate": 9.476222692651842e-06,
"loss": 4.8915,
"step": 2400
},
{
"epoch": 0.05,
"grad_norm": 0.2673836946487427,
"learning_rate": 9.454398638179002e-06,
"loss": 4.8863,
"step": 2500
},
{
"epoch": 0.06,
"grad_norm": 0.3463526964187622,
"learning_rate": 9.432574583706163e-06,
"loss": 4.8847,
"step": 2600
},
{
"epoch": 0.06,
"grad_norm": 0.35548537969589233,
"learning_rate": 9.41075052923332e-06,
"loss": 4.8811,
"step": 2700
},
{
"epoch": 0.06,
"grad_norm": 0.3124917447566986,
"learning_rate": 9.388926474760482e-06,
"loss": 4.8783,
"step": 2800
},
{
"epoch": 0.06,
"grad_norm": 0.294010192155838,
"learning_rate": 9.367102420287641e-06,
"loss": 4.8777,
"step": 2900
},
{
"epoch": 0.07,
"grad_norm": 0.36635321378707886,
"learning_rate": 9.345278365814802e-06,
"loss": 4.8754,
"step": 3000
},
{
"epoch": 0.07,
"grad_norm": 0.5457169413566589,
"learning_rate": 9.323454311341961e-06,
"loss": 4.8722,
"step": 3100
},
{
"epoch": 0.07,
"grad_norm": 0.2539723217487335,
"learning_rate": 9.301630256869122e-06,
"loss": 4.8691,
"step": 3200
},
{
"epoch": 0.07,
"grad_norm": 0.29804033041000366,
"learning_rate": 9.279806202396282e-06,
"loss": 4.867,
"step": 3300
},
{
"epoch": 0.07,
"grad_norm": 0.30564549565315247,
"learning_rate": 9.257982147923443e-06,
"loss": 4.8638,
"step": 3400
},
{
"epoch": 0.08,
"grad_norm": 0.25882017612457275,
"learning_rate": 9.236158093450602e-06,
"loss": 4.8631,
"step": 3500
},
{
"epoch": 0.08,
"grad_norm": 0.49584251642227173,
"learning_rate": 9.214334038977763e-06,
"loss": 4.8632,
"step": 3600
},
{
"epoch": 0.08,
"grad_norm": 0.5460361838340759,
"learning_rate": 9.192509984504923e-06,
"loss": 4.8606,
"step": 3700
},
{
"epoch": 0.08,
"grad_norm": 0.386802613735199,
"learning_rate": 9.170685930032082e-06,
"loss": 4.8601,
"step": 3800
},
{
"epoch": 0.09,
"grad_norm": 0.31271764636039734,
"learning_rate": 9.148861875559241e-06,
"loss": 4.8585,
"step": 3900
},
{
"epoch": 0.09,
"grad_norm": 0.2538485825061798,
"learning_rate": 9.127037821086402e-06,
"loss": 4.8567,
"step": 4000
},
{
"epoch": 0.09,
"grad_norm": 0.27854135632514954,
"learning_rate": 9.105213766613562e-06,
"loss": 4.8532,
"step": 4100
},
{
"epoch": 0.09,
"grad_norm": 0.5394597053527832,
"learning_rate": 9.083389712140723e-06,
"loss": 4.8536,
"step": 4200
},
{
"epoch": 0.09,
"grad_norm": 0.2850706875324249,
"learning_rate": 9.061565657667882e-06,
"loss": 4.853,
"step": 4300
},
{
"epoch": 0.1,
"grad_norm": 0.7254090309143066,
"learning_rate": 9.039741603195043e-06,
"loss": 4.8502,
"step": 4400
},
{
"epoch": 0.1,
"grad_norm": 0.6587111353874207,
"learning_rate": 9.017917548722203e-06,
"loss": 4.847,
"step": 4500
},
{
"epoch": 0.1,
"grad_norm": 0.7545880079269409,
"learning_rate": 8.996093494249362e-06,
"loss": 4.8501,
"step": 4600
},
{
"epoch": 0.1,
"grad_norm": 0.5480329990386963,
"learning_rate": 8.974269439776523e-06,
"loss": 4.8504,
"step": 4700
},
{
"epoch": 0.1,
"grad_norm": 0.26011794805526733,
"learning_rate": 8.952445385303682e-06,
"loss": 4.8478,
"step": 4800
},
{
"epoch": 0.11,
"grad_norm": 0.9462392926216125,
"learning_rate": 8.930621330830842e-06,
"loss": 4.8464,
"step": 4900
},
{
"epoch": 0.11,
"grad_norm": 0.526184618473053,
"learning_rate": 8.908797276358003e-06,
"loss": 4.8423,
"step": 5000
},
{
"epoch": 0.11,
"grad_norm": 1.1413213014602661,
"learning_rate": 8.886973221885162e-06,
"loss": 4.8433,
"step": 5100
},
{
"epoch": 0.11,
"grad_norm": 0.3846365511417389,
"learning_rate": 8.865149167412323e-06,
"loss": 4.8422,
"step": 5200
},
{
"epoch": 0.12,
"grad_norm": 0.4587865471839905,
"learning_rate": 8.843325112939482e-06,
"loss": 4.8427,
"step": 5300
},
{
"epoch": 0.12,
"grad_norm": 1.0439590215682983,
"learning_rate": 8.821501058466644e-06,
"loss": 4.8374,
"step": 5400
},
{
"epoch": 0.12,
"grad_norm": 0.9531126022338867,
"learning_rate": 8.799677003993803e-06,
"loss": 4.8381,
"step": 5500
},
{
"epoch": 0.12,
"grad_norm": 0.8820663690567017,
"learning_rate": 8.777852949520962e-06,
"loss": 4.8388,
"step": 5600
},
{
"epoch": 0.12,
"grad_norm": 1.2842873334884644,
"learning_rate": 8.756028895048123e-06,
"loss": 4.8375,
"step": 5700
},
{
"epoch": 0.13,
"grad_norm": 0.4219115972518921,
"learning_rate": 8.734204840575283e-06,
"loss": 4.8368,
"step": 5800
},
{
"epoch": 0.13,
"grad_norm": 0.27131006121635437,
"learning_rate": 8.712380786102442e-06,
"loss": 4.8358,
"step": 5900
},
{
"epoch": 0.13,
"grad_norm": 0.4501149654388428,
"learning_rate": 8.690556731629603e-06,
"loss": 4.8328,
"step": 6000
},
{
"epoch": 0.13,
"grad_norm": 1.1851238012313843,
"learning_rate": 8.668732677156762e-06,
"loss": 4.8353,
"step": 6100
},
{
"epoch": 0.14,
"grad_norm": 0.2990129292011261,
"learning_rate": 8.646908622683924e-06,
"loss": 4.8308,
"step": 6200
},
{
"epoch": 0.14,
"grad_norm": 0.3613658547401428,
"learning_rate": 8.625084568211083e-06,
"loss": 4.8318,
"step": 6300
},
{
"epoch": 0.14,
"grad_norm": 1.5732179880142212,
"learning_rate": 8.603260513738242e-06,
"loss": 4.8318,
"step": 6400
},
{
"epoch": 0.14,
"grad_norm": 0.8767447471618652,
"learning_rate": 8.581436459265403e-06,
"loss": 4.8299,
"step": 6500
},
{
"epoch": 0.14,
"grad_norm": 0.6536487340927124,
"learning_rate": 8.559612404792563e-06,
"loss": 4.8337,
"step": 6600
},
{
"epoch": 0.15,
"grad_norm": 0.5771917104721069,
"learning_rate": 8.537788350319724e-06,
"loss": 4.83,
"step": 6700
},
{
"epoch": 0.15,
"grad_norm": 1.4667840003967285,
"learning_rate": 8.515964295846883e-06,
"loss": 4.8282,
"step": 6800
},
{
"epoch": 0.15,
"grad_norm": 0.6794707775115967,
"learning_rate": 8.494140241374044e-06,
"loss": 4.8276,
"step": 6900
},
{
"epoch": 0.15,
"grad_norm": 0.7510731220245361,
"learning_rate": 8.472316186901203e-06,
"loss": 4.8286,
"step": 7000
},
{
"epoch": 0.15,
"grad_norm": 0.8489832282066345,
"learning_rate": 8.450492132428363e-06,
"loss": 4.8293,
"step": 7100
},
{
"epoch": 0.16,
"grad_norm": 0.6585586071014404,
"learning_rate": 8.428668077955522e-06,
"loss": 4.8287,
"step": 7200
},
{
"epoch": 0.16,
"grad_norm": 0.7326397895812988,
"learning_rate": 8.406844023482683e-06,
"loss": 4.8275,
"step": 7300
},
{
"epoch": 0.16,
"grad_norm": 1.032246708869934,
"learning_rate": 8.385019969009843e-06,
"loss": 4.8242,
"step": 7400
},
{
"epoch": 0.16,
"grad_norm": 0.7991085648536682,
"learning_rate": 8.363195914537004e-06,
"loss": 4.8244,
"step": 7500
},
{
"epoch": 0.17,
"grad_norm": 0.5138424634933472,
"learning_rate": 8.341371860064163e-06,
"loss": 4.8233,
"step": 7600
},
{
"epoch": 0.17,
"grad_norm": 0.4894898235797882,
"learning_rate": 8.319547805591324e-06,
"loss": 4.824,
"step": 7700
},
{
"epoch": 0.17,
"grad_norm": 0.3397706151008606,
"learning_rate": 8.297723751118483e-06,
"loss": 4.8236,
"step": 7800
},
{
"epoch": 0.17,
"grad_norm": 1.1616212129592896,
"learning_rate": 8.275899696645645e-06,
"loss": 4.824,
"step": 7900
},
{
"epoch": 0.17,
"grad_norm": 0.5268859267234802,
"learning_rate": 8.254075642172802e-06,
"loss": 4.8226,
"step": 8000
},
{
"epoch": 0.18,
"grad_norm": 0.6282734274864197,
"learning_rate": 8.232251587699963e-06,
"loss": 4.8224,
"step": 8100
},
{
"epoch": 0.18,
"grad_norm": 0.3148588240146637,
"learning_rate": 8.210427533227123e-06,
"loss": 4.8238,
"step": 8200
},
{
"epoch": 0.18,
"grad_norm": 0.6932478547096252,
"learning_rate": 8.188603478754284e-06,
"loss": 4.8237,
"step": 8300
},
{
"epoch": 0.18,
"grad_norm": 0.8093019723892212,
"learning_rate": 8.166779424281443e-06,
"loss": 4.8189,
"step": 8400
},
{
"epoch": 0.19,
"grad_norm": 1.4945412874221802,
"learning_rate": 8.144955369808604e-06,
"loss": 4.8182,
"step": 8500
},
{
"epoch": 0.19,
"grad_norm": 1.250691294670105,
"learning_rate": 8.123131315335763e-06,
"loss": 4.8236,
"step": 8600
},
{
"epoch": 0.19,
"grad_norm": 1.7631908655166626,
"learning_rate": 8.101307260862924e-06,
"loss": 4.8212,
"step": 8700
},
{
"epoch": 0.19,
"grad_norm": 1.1759347915649414,
"learning_rate": 8.079483206390084e-06,
"loss": 4.8222,
"step": 8800
},
{
"epoch": 0.19,
"grad_norm": 0.7231265306472778,
"learning_rate": 8.057659151917245e-06,
"loss": 4.8199,
"step": 8900
},
{
"epoch": 0.2,
"grad_norm": 0.3531375527381897,
"learning_rate": 8.035835097444404e-06,
"loss": 4.8185,
"step": 9000
},
{
"epoch": 0.2,
"grad_norm": 0.43936291337013245,
"learning_rate": 8.014011042971564e-06,
"loss": 4.8204,
"step": 9100
},
{
"epoch": 0.2,
"grad_norm": 0.5637614727020264,
"learning_rate": 7.992186988498723e-06,
"loss": 4.8175,
"step": 9200
},
{
"epoch": 0.2,
"grad_norm": 1.217545509338379,
"learning_rate": 7.970362934025884e-06,
"loss": 4.8201,
"step": 9300
},
{
"epoch": 0.21,
"grad_norm": 0.44672998785972595,
"learning_rate": 7.948538879553043e-06,
"loss": 4.8185,
"step": 9400
},
{
"epoch": 0.21,
"grad_norm": 1.0661417245864868,
"learning_rate": 7.926714825080204e-06,
"loss": 4.8185,
"step": 9500
},
{
"epoch": 0.21,
"grad_norm": 1.0079288482666016,
"learning_rate": 7.904890770607364e-06,
"loss": 4.8196,
"step": 9600
},
{
"epoch": 0.21,
"grad_norm": 1.2175804376602173,
"learning_rate": 7.883066716134525e-06,
"loss": 4.8164,
"step": 9700
},
{
"epoch": 0.21,
"grad_norm": 0.90774005651474,
"learning_rate": 7.861242661661684e-06,
"loss": 4.8163,
"step": 9800
},
{
"epoch": 0.22,
"grad_norm": 0.6773905754089355,
"learning_rate": 7.839418607188845e-06,
"loss": 4.8173,
"step": 9900
},
{
"epoch": 0.22,
"grad_norm": 0.4682576060295105,
"learning_rate": 7.817594552716005e-06,
"loss": 4.8157,
"step": 10000
},
{
"epoch": 0.22,
"grad_norm": 1.6193821430206299,
"learning_rate": 7.795770498243164e-06,
"loss": 4.8138,
"step": 10100
},
{
"epoch": 0.22,
"grad_norm": 0.4081961214542389,
"learning_rate": 7.773946443770323e-06,
"loss": 4.8123,
"step": 10200
},
{
"epoch": 0.22,
"grad_norm": 0.4450671374797821,
"learning_rate": 7.752122389297484e-06,
"loss": 4.8148,
"step": 10300
},
{
"epoch": 0.23,
"grad_norm": 1.6656301021575928,
"learning_rate": 7.730298334824644e-06,
"loss": 4.8142,
"step": 10400
},
{
"epoch": 0.23,
"grad_norm": 0.447218120098114,
"learning_rate": 7.708474280351805e-06,
"loss": 4.8143,
"step": 10500
},
{
"epoch": 0.23,
"grad_norm": 0.8786011338233948,
"learning_rate": 7.686650225878964e-06,
"loss": 4.8165,
"step": 10600
},
{
"epoch": 0.23,
"grad_norm": 0.8793469071388245,
"learning_rate": 7.664826171406125e-06,
"loss": 4.8126,
"step": 10700
},
{
"epoch": 0.24,
"grad_norm": 0.6492193937301636,
"learning_rate": 7.643002116933285e-06,
"loss": 4.8106,
"step": 10800
},
{
"epoch": 0.24,
"grad_norm": 1.1710741519927979,
"learning_rate": 7.621178062460445e-06,
"loss": 4.8131,
"step": 10900
},
{
"epoch": 0.24,
"grad_norm": 0.5375717282295227,
"learning_rate": 7.599354007987605e-06,
"loss": 4.8103,
"step": 11000
},
{
"epoch": 0.24,
"grad_norm": 0.38140830397605896,
"learning_rate": 7.577529953514765e-06,
"loss": 4.8127,
"step": 11100
},
{
"epoch": 0.24,
"grad_norm": 0.7099872827529907,
"learning_rate": 7.555705899041924e-06,
"loss": 4.8104,
"step": 11200
},
{
"epoch": 0.25,
"grad_norm": 2.029292583465576,
"learning_rate": 7.533881844569084e-06,
"loss": 4.8127,
"step": 11300
},
{
"epoch": 0.25,
"grad_norm": 0.3873797357082367,
"learning_rate": 7.512057790096244e-06,
"loss": 4.812,
"step": 11400
},
{
"epoch": 0.25,
"grad_norm": 1.0038777589797974,
"learning_rate": 7.490233735623404e-06,
"loss": 4.8082,
"step": 11500
},
{
"epoch": 0.25,
"grad_norm": 0.9627403616905212,
"learning_rate": 7.4684096811505646e-06,
"loss": 4.8084,
"step": 11600
},
{
"epoch": 0.26,
"grad_norm": 1.518621563911438,
"learning_rate": 7.446585626677725e-06,
"loss": 4.8094,
"step": 11700
},
{
"epoch": 0.26,
"grad_norm": 1.7235260009765625,
"learning_rate": 7.424761572204885e-06,
"loss": 4.8074,
"step": 11800
},
{
"epoch": 0.26,
"grad_norm": 0.3688587248325348,
"learning_rate": 7.402937517732045e-06,
"loss": 4.8097,
"step": 11900
},
{
"epoch": 0.26,
"grad_norm": 0.4294319748878479,
"learning_rate": 7.381113463259205e-06,
"loss": 4.8102,
"step": 12000
},
{
"epoch": 0.26,
"grad_norm": 0.3772117495536804,
"learning_rate": 7.359289408786366e-06,
"loss": 4.8091,
"step": 12100
},
{
"epoch": 0.27,
"grad_norm": 1.3151636123657227,
"learning_rate": 7.337465354313526e-06,
"loss": 4.809,
"step": 12200
},
{
"epoch": 0.27,
"grad_norm": 1.2710403203964233,
"learning_rate": 7.315641299840684e-06,
"loss": 4.807,
"step": 12300
},
{
"epoch": 0.27,
"grad_norm": 0.9449421763420105,
"learning_rate": 7.2938172453678445e-06,
"loss": 4.8092,
"step": 12400
},
{
"epoch": 0.27,
"grad_norm": 1.3880749940872192,
"learning_rate": 7.271993190895005e-06,
"loss": 4.8069,
"step": 12500
},
{
"epoch": 0.27,
"grad_norm": 0.5149086117744446,
"learning_rate": 7.250169136422165e-06,
"loss": 4.8114,
"step": 12600
},
{
"epoch": 0.28,
"grad_norm": 2.83113431930542,
"learning_rate": 7.228345081949325e-06,
"loss": 4.8091,
"step": 12700
},
{
"epoch": 0.28,
"grad_norm": 0.9617094397544861,
"learning_rate": 7.206521027476485e-06,
"loss": 4.8045,
"step": 12800
},
{
"epoch": 0.28,
"grad_norm": 1.3411821126937866,
"learning_rate": 7.1846969730036456e-06,
"loss": 4.8057,
"step": 12900
},
{
"epoch": 0.28,
"grad_norm": 0.40530508756637573,
"learning_rate": 7.162872918530806e-06,
"loss": 4.8063,
"step": 13000
},
{
"epoch": 0.29,
"grad_norm": 0.9460883736610413,
"learning_rate": 7.141048864057966e-06,
"loss": 4.8064,
"step": 13100
},
{
"epoch": 0.29,
"grad_norm": 2.0108609199523926,
"learning_rate": 7.119224809585125e-06,
"loss": 4.8069,
"step": 13200
},
{
"epoch": 0.29,
"grad_norm": 1.5277504920959473,
"learning_rate": 7.0974007551122855e-06,
"loss": 4.8068,
"step": 13300
},
{
"epoch": 0.29,
"grad_norm": 0.932159960269928,
"learning_rate": 7.075576700639445e-06,
"loss": 4.8018,
"step": 13400
},
{
"epoch": 0.29,
"grad_norm": 1.497610092163086,
"learning_rate": 7.053752646166605e-06,
"loss": 4.803,
"step": 13500
},
{
"epoch": 0.3,
"grad_norm": 1.3723843097686768,
"learning_rate": 7.031928591693765e-06,
"loss": 4.8015,
"step": 13600
},
{
"epoch": 0.3,
"grad_norm": 0.7923598885536194,
"learning_rate": 7.0101045372209255e-06,
"loss": 4.8079,
"step": 13700
},
{
"epoch": 0.3,
"grad_norm": 1.1815203428268433,
"learning_rate": 6.988280482748086e-06,
"loss": 4.8027,
"step": 13800
},
{
"epoch": 0.3,
"grad_norm": 1.163899540901184,
"learning_rate": 6.966456428275246e-06,
"loss": 4.8029,
"step": 13900
},
{
"epoch": 0.31,
"grad_norm": 0.4986168146133423,
"learning_rate": 6.944632373802406e-06,
"loss": 4.8076,
"step": 14000
},
{
"epoch": 0.31,
"grad_norm": 0.6901636123657227,
"learning_rate": 6.9228083193295655e-06,
"loss": 4.8044,
"step": 14100
},
{
"epoch": 0.31,
"grad_norm": 0.4600536525249481,
"learning_rate": 6.900984264856726e-06,
"loss": 4.8058,
"step": 14200
},
{
"epoch": 0.31,
"grad_norm": 1.3702683448791504,
"learning_rate": 6.879160210383886e-06,
"loss": 4.8022,
"step": 14300
},
{
"epoch": 0.31,
"grad_norm": 0.6329184770584106,
"learning_rate": 6.857336155911045e-06,
"loss": 4.8018,
"step": 14400
},
{
"epoch": 0.32,
"grad_norm": 0.4246502220630646,
"learning_rate": 6.8355121014382055e-06,
"loss": 4.8045,
"step": 14500
},
{
"epoch": 0.32,
"grad_norm": 0.6552340388298035,
"learning_rate": 6.813688046965366e-06,
"loss": 4.8023,
"step": 14600
},
{
"epoch": 0.32,
"grad_norm": 0.6138970255851746,
"learning_rate": 6.791863992492526e-06,
"loss": 4.8051,
"step": 14700
},
{
"epoch": 0.32,
"grad_norm": 0.8509910106658936,
"learning_rate": 6.770039938019686e-06,
"loss": 4.8034,
"step": 14800
},
{
"epoch": 0.33,
"grad_norm": 1.2667418718338013,
"learning_rate": 6.7482158835468455e-06,
"loss": 4.8035,
"step": 14900
},
{
"epoch": 0.33,
"grad_norm": 0.723858118057251,
"learning_rate": 6.726391829074006e-06,
"loss": 4.8048,
"step": 15000
},
{
"epoch": 0.33,
"grad_norm": 0.5858640670776367,
"learning_rate": 6.704567774601166e-06,
"loss": 4.8043,
"step": 15100
},
{
"epoch": 0.33,
"grad_norm": 1.3455156087875366,
"learning_rate": 6.682743720128326e-06,
"loss": 4.8048,
"step": 15200
},
{
"epoch": 0.33,
"grad_norm": 0.7291358709335327,
"learning_rate": 6.660919665655486e-06,
"loss": 4.8034,
"step": 15300
},
{
"epoch": 0.34,
"grad_norm": 0.9461612105369568,
"learning_rate": 6.6390956111826465e-06,
"loss": 4.8009,
"step": 15400
},
{
"epoch": 0.34,
"grad_norm": 1.1789884567260742,
"learning_rate": 6.617271556709806e-06,
"loss": 4.8041,
"step": 15500
},
{
"epoch": 0.34,
"grad_norm": 0.9464890360832214,
"learning_rate": 6.595447502236966e-06,
"loss": 4.8045,
"step": 15600
},
{
"epoch": 0.34,
"grad_norm": 0.7488701343536377,
"learning_rate": 6.5736234477641254e-06,
"loss": 4.8018,
"step": 15700
},
{
"epoch": 0.34,
"grad_norm": 0.8682188391685486,
"learning_rate": 6.551799393291286e-06,
"loss": 4.8011,
"step": 15800
},
{
"epoch": 0.35,
"grad_norm": 0.4264489710330963,
"learning_rate": 6.529975338818446e-06,
"loss": 4.8023,
"step": 15900
},
{
"epoch": 0.35,
"grad_norm": 0.49021920561790466,
"learning_rate": 6.508151284345606e-06,
"loss": 4.8028,
"step": 16000
},
{
"epoch": 0.35,
"grad_norm": 0.5845347046852112,
"learning_rate": 6.486327229872766e-06,
"loss": 4.8036,
"step": 16100
},
{
"epoch": 0.35,
"grad_norm": 0.5416790246963501,
"learning_rate": 6.4645031753999265e-06,
"loss": 4.7982,
"step": 16200
},
{
"epoch": 0.36,
"grad_norm": 0.6061177253723145,
"learning_rate": 6.442679120927087e-06,
"loss": 4.7992,
"step": 16300
},
{
"epoch": 0.36,
"grad_norm": 1.0177648067474365,
"learning_rate": 6.420855066454247e-06,
"loss": 4.8009,
"step": 16400
},
{
"epoch": 0.36,
"grad_norm": 1.1131385564804077,
"learning_rate": 6.399031011981407e-06,
"loss": 4.8017,
"step": 16500
},
{
"epoch": 0.36,
"grad_norm": 0.519646167755127,
"learning_rate": 6.377206957508566e-06,
"loss": 4.7977,
"step": 16600
},
{
"epoch": 0.36,
"grad_norm": 0.7085474729537964,
"learning_rate": 6.355382903035726e-06,
"loss": 4.8033,
"step": 16700
},
{
"epoch": 0.37,
"grad_norm": 2.178063154220581,
"learning_rate": 6.333558848562886e-06,
"loss": 4.8005,
"step": 16800
},
{
"epoch": 0.37,
"grad_norm": 0.6010052561759949,
"learning_rate": 6.311734794090046e-06,
"loss": 4.7981,
"step": 16900
},
{
"epoch": 0.37,
"grad_norm": 1.1030943393707275,
"learning_rate": 6.2899107396172064e-06,
"loss": 4.8012,
"step": 17000
},
{
"epoch": 0.37,
"grad_norm": 1.7612066268920898,
"learning_rate": 6.268086685144367e-06,
"loss": 4.7985,
"step": 17100
},
{
"epoch": 0.38,
"grad_norm": 1.4463589191436768,
"learning_rate": 6.246262630671527e-06,
"loss": 4.7989,
"step": 17200
},
{
"epoch": 0.38,
"grad_norm": 0.5492839813232422,
"learning_rate": 6.224438576198687e-06,
"loss": 4.7981,
"step": 17300
},
{
"epoch": 0.38,
"grad_norm": 1.7029542922973633,
"learning_rate": 6.202614521725847e-06,
"loss": 4.7995,
"step": 17400
},
{
"epoch": 0.38,
"grad_norm": 0.5425244569778442,
"learning_rate": 6.1807904672530075e-06,
"loss": 4.8013,
"step": 17500
},
{
"epoch": 0.38,
"grad_norm": 2.1396992206573486,
"learning_rate": 6.158966412780166e-06,
"loss": 4.7976,
"step": 17600
},
{
"epoch": 0.39,
"grad_norm": 0.6928815841674805,
"learning_rate": 6.137142358307326e-06,
"loss": 4.8004,
"step": 17700
},
{
"epoch": 0.39,
"grad_norm": 0.7560544013977051,
"learning_rate": 6.115318303834486e-06,
"loss": 4.7991,
"step": 17800
},
{
"epoch": 0.39,
"grad_norm": 1.5612252950668335,
"learning_rate": 6.093494249361647e-06,
"loss": 4.7969,
"step": 17900
},
{
"epoch": 0.39,
"grad_norm": 0.8462795615196228,
"learning_rate": 6.071670194888807e-06,
"loss": 4.7983,
"step": 18000
},
{
"epoch": 0.4,
"grad_norm": 1.2593392133712769,
"learning_rate": 6.049846140415967e-06,
"loss": 4.7961,
"step": 18100
},
{
"epoch": 0.4,
"grad_norm": 1.180788278579712,
"learning_rate": 6.028022085943127e-06,
"loss": 4.7965,
"step": 18200
},
{
"epoch": 0.4,
"grad_norm": 0.8912612795829773,
"learning_rate": 6.0061980314702874e-06,
"loss": 4.796,
"step": 18300
},
{
"epoch": 0.4,
"grad_norm": 1.3480154275894165,
"learning_rate": 5.984373976997448e-06,
"loss": 4.7985,
"step": 18400
},
{
"epoch": 0.4,
"grad_norm": 2.097245931625366,
"learning_rate": 5.962549922524608e-06,
"loss": 4.7963,
"step": 18500
},
{
"epoch": 0.41,
"grad_norm": 0.4926351308822632,
"learning_rate": 5.940725868051768e-06,
"loss": 4.7993,
"step": 18600
},
{
"epoch": 0.41,
"grad_norm": 0.5842808485031128,
"learning_rate": 5.918901813578927e-06,
"loss": 4.7971,
"step": 18700
},
{
"epoch": 0.41,
"grad_norm": 0.43305909633636475,
"learning_rate": 5.897077759106087e-06,
"loss": 4.7946,
"step": 18800
},
{
"epoch": 0.41,
"grad_norm": 0.8926731944084167,
"learning_rate": 5.875253704633247e-06,
"loss": 4.7965,
"step": 18900
},
{
"epoch": 0.41,
"grad_norm": 0.48742425441741943,
"learning_rate": 5.853429650160407e-06,
"loss": 4.7976,
"step": 19000
},
{
"epoch": 0.42,
"grad_norm": 0.4740324020385742,
"learning_rate": 5.831605595687567e-06,
"loss": 4.7967,
"step": 19100
},
{
"epoch": 0.42,
"grad_norm": 0.622294008731842,
"learning_rate": 5.809781541214728e-06,
"loss": 4.7979,
"step": 19200
},
{
"epoch": 0.42,
"grad_norm": 1.5809967517852783,
"learning_rate": 5.787957486741888e-06,
"loss": 4.7955,
"step": 19300
},
{
"epoch": 0.42,
"grad_norm": 1.089276909828186,
"learning_rate": 5.766133432269048e-06,
"loss": 4.7952,
"step": 19400
},
{
"epoch": 0.43,
"grad_norm": 0.7803537845611572,
"learning_rate": 5.744309377796207e-06,
"loss": 4.7945,
"step": 19500
},
{
"epoch": 0.43,
"grad_norm": 0.7891755104064941,
"learning_rate": 5.722485323323368e-06,
"loss": 4.7986,
"step": 19600
},
{
"epoch": 0.43,
"grad_norm": 1.4096457958221436,
"learning_rate": 5.700661268850528e-06,
"loss": 4.798,
"step": 19700
},
{
"epoch": 0.43,
"grad_norm": 0.7508895397186279,
"learning_rate": 5.678837214377687e-06,
"loss": 4.7939,
"step": 19800
},
{
"epoch": 0.43,
"grad_norm": 1.578983187675476,
"learning_rate": 5.657013159904847e-06,
"loss": 4.7985,
"step": 19900
},
{
"epoch": 0.44,
"grad_norm": 0.5898668169975281,
"learning_rate": 5.635189105432008e-06,
"loss": 4.7991,
"step": 20000
},
{
"epoch": 0.44,
"grad_norm": 0.5105552673339844,
"learning_rate": 5.613365050959168e-06,
"loss": 4.798,
"step": 20100
},
{
"epoch": 0.44,
"grad_norm": 0.5170775651931763,
"learning_rate": 5.591540996486328e-06,
"loss": 4.7946,
"step": 20200
},
{
"epoch": 0.44,
"grad_norm": 1.588223934173584,
"learning_rate": 5.569716942013487e-06,
"loss": 4.797,
"step": 20300
},
{
"epoch": 0.45,
"grad_norm": 0.7536285519599915,
"learning_rate": 5.5478928875406476e-06,
"loss": 4.7934,
"step": 20400
},
{
"epoch": 0.45,
"grad_norm": 0.5207072496414185,
"learning_rate": 5.526068833067808e-06,
"loss": 4.7975,
"step": 20500
},
{
"epoch": 0.45,
"grad_norm": 1.6971161365509033,
"learning_rate": 5.504244778594968e-06,
"loss": 4.797,
"step": 20600
},
{
"epoch": 0.45,
"grad_norm": 0.7362131476402283,
"learning_rate": 5.482420724122128e-06,
"loss": 4.7926,
"step": 20700
},
{
"epoch": 0.45,
"grad_norm": 0.8522694706916809,
"learning_rate": 5.4605966696492876e-06,
"loss": 4.7967,
"step": 20800
},
{
"epoch": 0.46,
"grad_norm": 0.5707879662513733,
"learning_rate": 5.438772615176448e-06,
"loss": 4.7946,
"step": 20900
},
{
"epoch": 0.46,
"grad_norm": 1.427687406539917,
"learning_rate": 5.416948560703608e-06,
"loss": 4.7958,
"step": 21000
},
{
"epoch": 0.46,
"grad_norm": 1.4829745292663574,
"learning_rate": 5.395124506230768e-06,
"loss": 4.7942,
"step": 21100
},
{
"epoch": 0.46,
"grad_norm": 2.6114389896392822,
"learning_rate": 5.3733004517579275e-06,
"loss": 4.7956,
"step": 21200
},
{
"epoch": 0.46,
"grad_norm": 1.3793102502822876,
"learning_rate": 5.351476397285088e-06,
"loss": 4.7932,
"step": 21300
},
{
"epoch": 0.47,
"grad_norm": 2.681830406188965,
"learning_rate": 5.329652342812248e-06,
"loss": 4.7945,
"step": 21400
},
{
"epoch": 0.47,
"grad_norm": 0.9123592972755432,
"learning_rate": 5.307828288339408e-06,
"loss": 4.7951,
"step": 21500
},
{
"epoch": 0.47,
"grad_norm": 0.771221935749054,
"learning_rate": 5.286004233866568e-06,
"loss": 4.7949,
"step": 21600
},
{
"epoch": 0.47,
"grad_norm": 1.176202654838562,
"learning_rate": 5.2641801793937286e-06,
"loss": 4.7941,
"step": 21700
},
{
"epoch": 0.48,
"grad_norm": 1.7281347513198853,
"learning_rate": 5.242356124920889e-06,
"loss": 4.7922,
"step": 21800
},
{
"epoch": 0.48,
"grad_norm": 0.7822020053863525,
"learning_rate": 5.220532070448048e-06,
"loss": 4.7945,
"step": 21900
},
{
"epoch": 0.48,
"grad_norm": 0.47084948420524597,
"learning_rate": 5.1987080159752075e-06,
"loss": 4.7954,
"step": 22000
},
{
"epoch": 0.48,
"grad_norm": 0.6029684543609619,
"learning_rate": 5.176883961502368e-06,
"loss": 4.7949,
"step": 22100
},
{
"epoch": 0.48,
"grad_norm": 1.7270153760910034,
"learning_rate": 5.155059907029528e-06,
"loss": 4.7945,
"step": 22200
},
{
"epoch": 0.49,
"grad_norm": 0.7985190153121948,
"learning_rate": 5.133235852556688e-06,
"loss": 4.7963,
"step": 22300
},
{
"epoch": 0.49,
"grad_norm": 0.5503430962562561,
"learning_rate": 5.111411798083848e-06,
"loss": 4.7929,
"step": 22400
},
{
"epoch": 0.49,
"grad_norm": 0.8454219102859497,
"learning_rate": 5.0895877436110085e-06,
"loss": 4.7934,
"step": 22500
},
{
"epoch": 0.49,
"grad_norm": 1.7326210737228394,
"learning_rate": 5.067763689138169e-06,
"loss": 4.7923,
"step": 22600
},
{
"epoch": 0.5,
"grad_norm": 1.3302743434906006,
"learning_rate": 5.045939634665329e-06,
"loss": 4.7975,
"step": 22700
},
{
"epoch": 0.5,
"grad_norm": 1.2851781845092773,
"learning_rate": 5.024115580192489e-06,
"loss": 4.7941,
"step": 22800
},
{
"epoch": 0.5,
"grad_norm": 0.6426960825920105,
"learning_rate": 5.002291525719649e-06,
"loss": 4.7956,
"step": 22900
},
{
"epoch": 0.5,
"grad_norm": 0.518356204032898,
"learning_rate": 4.980467471246809e-06,
"loss": 4.7952,
"step": 23000
},
{
"epoch": 0.5,
"grad_norm": 1.0310689210891724,
"learning_rate": 4.958643416773969e-06,
"loss": 4.7948,
"step": 23100
},
{
"epoch": 0.51,
"grad_norm": 0.9940834045410156,
"learning_rate": 4.936819362301128e-06,
"loss": 4.7944,
"step": 23200
},
{
"epoch": 0.51,
"grad_norm": 0.5921940803527832,
"learning_rate": 4.9149953078282885e-06,
"loss": 4.7917,
"step": 23300
},
{
"epoch": 0.51,
"grad_norm": 0.5630788207054138,
"learning_rate": 4.893171253355449e-06,
"loss": 4.7923,
"step": 23400
},
{
"epoch": 0.51,
"grad_norm": 0.5146437287330627,
"learning_rate": 4.871347198882609e-06,
"loss": 4.7913,
"step": 23500
},
{
"epoch": 0.52,
"grad_norm": 0.8808703422546387,
"learning_rate": 4.849523144409769e-06,
"loss": 4.7921,
"step": 23600
},
{
"epoch": 0.52,
"grad_norm": 2.0775110721588135,
"learning_rate": 4.827699089936929e-06,
"loss": 4.7917,
"step": 23700
},
{
"epoch": 0.52,
"grad_norm": 0.8530829548835754,
"learning_rate": 4.805875035464089e-06,
"loss": 4.792,
"step": 23800
},
{
"epoch": 0.52,
"grad_norm": 0.6423510909080505,
"learning_rate": 4.784050980991249e-06,
"loss": 4.7912,
"step": 23900
},
{
"epoch": 0.52,
"grad_norm": 0.6276280879974365,
"learning_rate": 4.762226926518409e-06,
"loss": 4.7916,
"step": 24000
},
{
"epoch": 0.53,
"grad_norm": 1.0554680824279785,
"learning_rate": 4.740402872045569e-06,
"loss": 4.7918,
"step": 24100
},
{
"epoch": 0.53,
"grad_norm": 0.6865511536598206,
"learning_rate": 4.7185788175727295e-06,
"loss": 4.7945,
"step": 24200
},
{
"epoch": 0.53,
"grad_norm": 0.5930079817771912,
"learning_rate": 4.696754763099889e-06,
"loss": 4.7912,
"step": 24300
},
{
"epoch": 0.53,
"grad_norm": 1.2960429191589355,
"learning_rate": 4.674930708627049e-06,
"loss": 4.79,
"step": 24400
},
{
"epoch": 0.53,
"grad_norm": 0.62696772813797,
"learning_rate": 4.653106654154209e-06,
"loss": 4.7904,
"step": 24500
},
{
"epoch": 0.54,
"grad_norm": 1.9733163118362427,
"learning_rate": 4.6312825996813695e-06,
"loss": 4.7891,
"step": 24600
},
{
"epoch": 0.54,
"grad_norm": 1.0360530614852905,
"learning_rate": 4.60945854520853e-06,
"loss": 4.7878,
"step": 24700
},
{
"epoch": 0.54,
"grad_norm": 1.112536072731018,
"learning_rate": 4.587634490735689e-06,
"loss": 4.7911,
"step": 24800
},
{
"epoch": 0.54,
"grad_norm": 0.8656798601150513,
"learning_rate": 4.565810436262849e-06,
"loss": 4.7914,
"step": 24900
},
{
"epoch": 0.55,
"grad_norm": 0.5879213809967041,
"learning_rate": 4.5439863817900095e-06,
"loss": 4.7909,
"step": 25000
},
{
"epoch": 0.55,
"grad_norm": 0.8013112545013428,
"learning_rate": 4.52216232731717e-06,
"loss": 4.7876,
"step": 25100
},
{
"epoch": 0.55,
"grad_norm": 0.8607842326164246,
"learning_rate": 4.50033827284433e-06,
"loss": 4.7896,
"step": 25200
},
{
"epoch": 0.55,
"grad_norm": 0.590959370136261,
"learning_rate": 4.47851421837149e-06,
"loss": 4.7895,
"step": 25300
},
{
"epoch": 0.55,
"grad_norm": 1.595662236213684,
"learning_rate": 4.4566901638986495e-06,
"loss": 4.7904,
"step": 25400
},
{
"epoch": 0.56,
"grad_norm": 0.5102413892745972,
"learning_rate": 4.43486610942581e-06,
"loss": 4.7907,
"step": 25500
},
{
"epoch": 0.56,
"grad_norm": 1.1544299125671387,
"learning_rate": 4.41304205495297e-06,
"loss": 4.791,
"step": 25600
},
{
"epoch": 0.56,
"grad_norm": 1.2537280321121216,
"learning_rate": 4.39121800048013e-06,
"loss": 4.7938,
"step": 25700
},
{
"epoch": 0.56,
"grad_norm": 0.6835722327232361,
"learning_rate": 4.3693939460072895e-06,
"loss": 4.7924,
"step": 25800
},
{
"epoch": 0.57,
"grad_norm": 1.0808320045471191,
"learning_rate": 4.34756989153445e-06,
"loss": 4.7919,
"step": 25900
},
{
"epoch": 0.57,
"grad_norm": 1.7416857481002808,
"learning_rate": 4.32574583706161e-06,
"loss": 4.7903,
"step": 26000
},
{
"epoch": 0.57,
"grad_norm": 0.857487142086029,
"learning_rate": 4.30392178258877e-06,
"loss": 4.789,
"step": 26100
},
{
"epoch": 0.57,
"grad_norm": 0.9709984064102173,
"learning_rate": 4.2820977281159294e-06,
"loss": 4.7862,
"step": 26200
},
{
"epoch": 0.57,
"grad_norm": 1.2289056777954102,
"learning_rate": 4.26027367364309e-06,
"loss": 4.7914,
"step": 26300
},
{
"epoch": 0.58,
"grad_norm": 0.7405953407287598,
"learning_rate": 4.23844961917025e-06,
"loss": 4.7943,
"step": 26400
},
{
"epoch": 0.58,
"grad_norm": 1.3948872089385986,
"learning_rate": 4.21662556469741e-06,
"loss": 4.7898,
"step": 26500
},
{
"epoch": 0.58,
"grad_norm": 0.6792095303535461,
"learning_rate": 4.1948015102245694e-06,
"loss": 4.7887,
"step": 26600
},
{
"epoch": 0.58,
"grad_norm": 0.7049844264984131,
"learning_rate": 4.17297745575173e-06,
"loss": 4.793,
"step": 26700
},
{
"epoch": 0.58,
"grad_norm": 0.5475296974182129,
"learning_rate": 4.15115340127889e-06,
"loss": 4.7881,
"step": 26800
},
{
"epoch": 0.59,
"grad_norm": 0.6123815774917603,
"learning_rate": 4.12932934680605e-06,
"loss": 4.7901,
"step": 26900
},
{
"epoch": 0.59,
"grad_norm": 1.7768149375915527,
"learning_rate": 4.107505292333209e-06,
"loss": 4.7918,
"step": 27000
},
{
"epoch": 0.59,
"grad_norm": 0.5179715156555176,
"learning_rate": 4.08568123786037e-06,
"loss": 4.7884,
"step": 27100
},
{
"epoch": 0.59,
"grad_norm": 0.607803225517273,
"learning_rate": 4.06385718338753e-06,
"loss": 4.79,
"step": 27200
},
{
"epoch": 0.6,
"grad_norm": 1.2372949123382568,
"learning_rate": 4.04203312891469e-06,
"loss": 4.7889,
"step": 27300
},
{
"epoch": 0.6,
"grad_norm": 1.3154016733169556,
"learning_rate": 4.02020907444185e-06,
"loss": 4.7906,
"step": 27400
},
{
"epoch": 0.6,
"grad_norm": 0.5069667100906372,
"learning_rate": 3.99838501996901e-06,
"loss": 4.7871,
"step": 27500
},
{
"epoch": 0.6,
"grad_norm": 0.9286430478096008,
"learning_rate": 3.97656096549617e-06,
"loss": 4.793,
"step": 27600
},
{
"epoch": 0.6,
"grad_norm": 1.3074681758880615,
"learning_rate": 3.95473691102333e-06,
"loss": 4.7886,
"step": 27700
},
{
"epoch": 0.61,
"grad_norm": 0.9673444628715515,
"learning_rate": 3.93291285655049e-06,
"loss": 4.7916,
"step": 27800
},
{
"epoch": 0.61,
"grad_norm": 2.3527958393096924,
"learning_rate": 3.9110888020776504e-06,
"loss": 4.7917,
"step": 27900
},
{
"epoch": 0.61,
"grad_norm": 0.6838983297348022,
"learning_rate": 3.88926474760481e-06,
"loss": 4.792,
"step": 28000
},
{
"epoch": 0.61,
"grad_norm": 1.2325012683868408,
"learning_rate": 3.86744069313197e-06,
"loss": 4.79,
"step": 28100
},
{
"epoch": 0.62,
"grad_norm": 0.6116911172866821,
"learning_rate": 3.84561663865913e-06,
"loss": 4.7873,
"step": 28200
},
{
"epoch": 0.62,
"grad_norm": 1.2169787883758545,
"learning_rate": 3.82379258418629e-06,
"loss": 4.7877,
"step": 28300
},
{
"epoch": 0.62,
"grad_norm": 1.7613152265548706,
"learning_rate": 3.8019685297134506e-06,
"loss": 4.7869,
"step": 28400
},
{
"epoch": 0.62,
"grad_norm": 0.9709558486938477,
"learning_rate": 3.7801444752406104e-06,
"loss": 4.7882,
"step": 28500
},
{
"epoch": 0.62,
"grad_norm": 0.7928886413574219,
"learning_rate": 3.75832042076777e-06,
"loss": 4.7891,
"step": 28600
},
{
"epoch": 0.63,
"grad_norm": 0.5544800162315369,
"learning_rate": 3.7364963662949304e-06,
"loss": 4.7922,
"step": 28700
},
{
"epoch": 0.63,
"grad_norm": 1.1702040433883667,
"learning_rate": 3.7146723118220906e-06,
"loss": 4.788,
"step": 28800
},
{
"epoch": 0.63,
"grad_norm": 1.300114631652832,
"learning_rate": 3.692848257349251e-06,
"loss": 4.7907,
"step": 28900
},
{
"epoch": 0.63,
"grad_norm": 0.8479206562042236,
"learning_rate": 3.671024202876411e-06,
"loss": 4.7869,
"step": 29000
},
{
"epoch": 0.64,
"grad_norm": 0.6059646606445312,
"learning_rate": 3.6492001484035704e-06,
"loss": 4.7891,
"step": 29100
},
{
"epoch": 0.64,
"grad_norm": 1.6603209972381592,
"learning_rate": 3.6273760939307306e-06,
"loss": 4.7899,
"step": 29200
},
{
"epoch": 0.64,
"grad_norm": 0.5265193581581116,
"learning_rate": 3.605552039457891e-06,
"loss": 4.7873,
"step": 29300
},
{
"epoch": 0.64,
"grad_norm": 1.8754724264144897,
"learning_rate": 3.583727984985051e-06,
"loss": 4.786,
"step": 29400
},
{
"epoch": 0.64,
"grad_norm": 0.6786229610443115,
"learning_rate": 3.561903930512211e-06,
"loss": 4.7885,
"step": 29500
},
{
"epoch": 0.65,
"grad_norm": 2.067661762237549,
"learning_rate": 3.5400798760393706e-06,
"loss": 4.7899,
"step": 29600
},
{
"epoch": 0.65,
"grad_norm": 0.8492066860198975,
"learning_rate": 3.5182558215665308e-06,
"loss": 4.7896,
"step": 29700
},
{
"epoch": 0.65,
"grad_norm": 0.989614725112915,
"learning_rate": 3.496431767093691e-06,
"loss": 4.7883,
"step": 29800
},
{
"epoch": 0.65,
"grad_norm": 1.0036453008651733,
"learning_rate": 3.474607712620851e-06,
"loss": 4.7879,
"step": 29900
},
{
"epoch": 0.65,
"grad_norm": 1.2347569465637207,
"learning_rate": 3.4527836581480114e-06,
"loss": 4.7883,
"step": 30000
},
{
"epoch": 0.66,
"grad_norm": 0.543158233165741,
"learning_rate": 3.4309596036751708e-06,
"loss": 4.7895,
"step": 30100
},
{
"epoch": 0.66,
"grad_norm": 1.209635615348816,
"learning_rate": 3.409135549202331e-06,
"loss": 4.7888,
"step": 30200
},
{
"epoch": 0.66,
"grad_norm": 0.4899618923664093,
"learning_rate": 3.387311494729491e-06,
"loss": 4.7848,
"step": 30300
},
{
"epoch": 0.66,
"grad_norm": 0.5836758017539978,
"learning_rate": 3.3654874402566514e-06,
"loss": 4.7886,
"step": 30400
},
{
"epoch": 0.67,
"grad_norm": 0.5948976874351501,
"learning_rate": 3.343663385783811e-06,
"loss": 4.7909,
"step": 30500
},
{
"epoch": 0.67,
"grad_norm": 1.0364673137664795,
"learning_rate": 3.3218393313109714e-06,
"loss": 4.7873,
"step": 30600
},
{
"epoch": 0.67,
"grad_norm": 0.48503726720809937,
"learning_rate": 3.300015276838131e-06,
"loss": 4.7859,
"step": 30700
},
{
"epoch": 0.67,
"grad_norm": 1.423990249633789,
"learning_rate": 3.2781912223652914e-06,
"loss": 4.7868,
"step": 30800
},
{
"epoch": 0.67,
"grad_norm": 0.7693653106689453,
"learning_rate": 3.256367167892451e-06,
"loss": 4.789,
"step": 30900
},
{
"epoch": 0.68,
"grad_norm": 0.8543452620506287,
"learning_rate": 3.2345431134196114e-06,
"loss": 4.79,
"step": 31000
},
{
"epoch": 0.68,
"grad_norm": 1.765480399131775,
"learning_rate": 3.2127190589467716e-06,
"loss": 4.7901,
"step": 31100
},
{
"epoch": 0.68,
"grad_norm": 1.3010811805725098,
"learning_rate": 3.1908950044739313e-06,
"loss": 4.7885,
"step": 31200
},
{
"epoch": 0.68,
"grad_norm": 0.4873560965061188,
"learning_rate": 3.169070950001091e-06,
"loss": 4.7865,
"step": 31300
},
{
"epoch": 0.69,
"grad_norm": 1.61781907081604,
"learning_rate": 3.1472468955282513e-06,
"loss": 4.7873,
"step": 31400
},
{
"epoch": 0.69,
"grad_norm": 0.9210663437843323,
"learning_rate": 3.1254228410554115e-06,
"loss": 4.7878,
"step": 31500
},
{
"epoch": 0.69,
"grad_norm": 0.7144986391067505,
"learning_rate": 3.1035987865825718e-06,
"loss": 4.7865,
"step": 31600
},
{
"epoch": 0.69,
"grad_norm": 0.6649860143661499,
"learning_rate": 3.0817747321097315e-06,
"loss": 4.7898,
"step": 31700
},
{
"epoch": 0.69,
"grad_norm": 0.7339694499969482,
"learning_rate": 3.0599506776368913e-06,
"loss": 4.786,
"step": 31800
},
{
"epoch": 0.7,
"grad_norm": 1.963561773300171,
"learning_rate": 3.0381266231640515e-06,
"loss": 4.7882,
"step": 31900
},
{
"epoch": 0.7,
"grad_norm": 1.8179004192352295,
"learning_rate": 3.0163025686912117e-06,
"loss": 4.7844,
"step": 32000
},
{
"epoch": 0.7,
"grad_norm": 1.7514827251434326,
"learning_rate": 2.994478514218372e-06,
"loss": 4.7871,
"step": 32100
},
{
"epoch": 0.7,
"grad_norm": 1.3072575330734253,
"learning_rate": 2.972654459745532e-06,
"loss": 4.7897,
"step": 32200
},
{
"epoch": 0.7,
"grad_norm": 0.9841882586479187,
"learning_rate": 2.9508304052726915e-06,
"loss": 4.787,
"step": 32300
},
{
"epoch": 0.71,
"grad_norm": 0.8150787353515625,
"learning_rate": 2.9290063507998517e-06,
"loss": 4.787,
"step": 32400
},
{
"epoch": 0.71,
"grad_norm": 0.6560021638870239,
"learning_rate": 2.907182296327012e-06,
"loss": 4.7853,
"step": 32500
},
{
"epoch": 0.71,
"grad_norm": 1.4383647441864014,
"learning_rate": 2.885358241854172e-06,
"loss": 4.7858,
"step": 32600
},
{
"epoch": 0.71,
"grad_norm": 0.6023637056350708,
"learning_rate": 2.8635341873813323e-06,
"loss": 4.786,
"step": 32700
},
{
"epoch": 0.72,
"grad_norm": 0.7451430559158325,
"learning_rate": 2.8417101329084917e-06,
"loss": 4.7917,
"step": 32800
},
{
"epoch": 0.72,
"grad_norm": 0.5302537679672241,
"learning_rate": 2.819886078435652e-06,
"loss": 4.7863,
"step": 32900
},
{
"epoch": 0.72,
"grad_norm": 0.9489607214927673,
"learning_rate": 2.798062023962812e-06,
"loss": 4.7873,
"step": 33000
},
{
"epoch": 0.72,
"grad_norm": 1.1517263650894165,
"learning_rate": 2.7762379694899723e-06,
"loss": 4.788,
"step": 33100
},
{
"epoch": 0.72,
"grad_norm": 0.5979319214820862,
"learning_rate": 2.754413915017132e-06,
"loss": 4.7898,
"step": 33200
},
{
"epoch": 0.73,
"grad_norm": 0.8156241178512573,
"learning_rate": 2.732589860544292e-06,
"loss": 4.7883,
"step": 33300
},
{
"epoch": 0.73,
"grad_norm": 1.709427833557129,
"learning_rate": 2.710765806071452e-06,
"loss": 4.7867,
"step": 33400
},
{
"epoch": 0.73,
"grad_norm": 0.5464685559272766,
"learning_rate": 2.6889417515986123e-06,
"loss": 4.7889,
"step": 33500
},
{
"epoch": 0.73,
"grad_norm": 1.050419569015503,
"learning_rate": 2.667117697125772e-06,
"loss": 4.7883,
"step": 33600
},
{
"epoch": 0.74,
"grad_norm": 0.6171194314956665,
"learning_rate": 2.6452936426529323e-06,
"loss": 4.7893,
"step": 33700
},
{
"epoch": 0.74,
"grad_norm": 0.7161327600479126,
"learning_rate": 2.6234695881800925e-06,
"loss": 4.787,
"step": 33800
},
{
"epoch": 0.74,
"grad_norm": 0.5941922664642334,
"learning_rate": 2.6016455337072523e-06,
"loss": 4.7843,
"step": 33900
},
{
"epoch": 0.74,
"grad_norm": 1.1841336488723755,
"learning_rate": 2.579821479234412e-06,
"loss": 4.7856,
"step": 34000
},
{
"epoch": 0.74,
"grad_norm": 0.7025249004364014,
"learning_rate": 2.5579974247615723e-06,
"loss": 4.7843,
"step": 34100
},
{
"epoch": 0.75,
"grad_norm": 1.5568642616271973,
"learning_rate": 2.5361733702887325e-06,
"loss": 4.7851,
"step": 34200
},
{
"epoch": 0.75,
"grad_norm": 1.3719669580459595,
"learning_rate": 2.5143493158158927e-06,
"loss": 4.7846,
"step": 34300
},
{
"epoch": 0.75,
"grad_norm": 0.6786982417106628,
"learning_rate": 2.4925252613430525e-06,
"loss": 4.7845,
"step": 34400
},
{
"epoch": 0.75,
"grad_norm": 0.6673011183738708,
"learning_rate": 2.4707012068702123e-06,
"loss": 4.7876,
"step": 34500
},
{
"epoch": 0.76,
"grad_norm": 1.6152336597442627,
"learning_rate": 2.4488771523973725e-06,
"loss": 4.7864,
"step": 34600
},
{
"epoch": 0.76,
"grad_norm": 1.4687036275863647,
"learning_rate": 2.4270530979245327e-06,
"loss": 4.7851,
"step": 34700
},
{
"epoch": 0.76,
"grad_norm": 0.5648071765899658,
"learning_rate": 2.4052290434516925e-06,
"loss": 4.7877,
"step": 34800
},
{
"epoch": 0.76,
"grad_norm": 0.547589898109436,
"learning_rate": 2.3834049889788527e-06,
"loss": 4.7886,
"step": 34900
},
{
"epoch": 0.76,
"grad_norm": 0.9473028182983398,
"learning_rate": 2.361580934506013e-06,
"loss": 4.7869,
"step": 35000
},
{
"epoch": 0.77,
"grad_norm": 1.109892725944519,
"learning_rate": 2.3397568800331727e-06,
"loss": 4.7893,
"step": 35100
},
{
"epoch": 0.77,
"grad_norm": 1.2780309915542603,
"learning_rate": 2.317932825560333e-06,
"loss": 4.7856,
"step": 35200
},
{
"epoch": 0.77,
"grad_norm": 1.0184062719345093,
"learning_rate": 2.2961087710874927e-06,
"loss": 4.7851,
"step": 35300
},
{
"epoch": 0.77,
"grad_norm": 1.2379289865493774,
"learning_rate": 2.274284716614653e-06,
"loss": 4.7895,
"step": 35400
},
{
"epoch": 0.77,
"grad_norm": 1.372209072113037,
"learning_rate": 2.252460662141813e-06,
"loss": 4.7869,
"step": 35500
},
{
"epoch": 0.78,
"grad_norm": 0.6445749402046204,
"learning_rate": 2.230636607668973e-06,
"loss": 4.7833,
"step": 35600
},
{
"epoch": 0.78,
"grad_norm": 0.5948896408081055,
"learning_rate": 2.208812553196133e-06,
"loss": 4.7873,
"step": 35700
},
{
"epoch": 0.78,
"grad_norm": 0.6000692844390869,
"learning_rate": 2.1869884987232933e-06,
"loss": 4.785,
"step": 35800
},
{
"epoch": 0.78,
"grad_norm": 1.0963830947875977,
"learning_rate": 2.165164444250453e-06,
"loss": 4.7883,
"step": 35900
},
{
"epoch": 0.79,
"grad_norm": 1.0998800992965698,
"learning_rate": 2.1433403897776133e-06,
"loss": 4.7888,
"step": 36000
},
{
"epoch": 0.79,
"grad_norm": 1.1177624464035034,
"learning_rate": 2.121516335304773e-06,
"loss": 4.7847,
"step": 36100
},
{
"epoch": 0.79,
"grad_norm": 0.6763335466384888,
"learning_rate": 2.0996922808319333e-06,
"loss": 4.785,
"step": 36200
},
{
"epoch": 0.79,
"grad_norm": 0.9737639427185059,
"learning_rate": 2.077868226359093e-06,
"loss": 4.787,
"step": 36300
},
{
"epoch": 0.79,
"grad_norm": 0.756729245185852,
"learning_rate": 2.0560441718862532e-06,
"loss": 4.7865,
"step": 36400
},
{
"epoch": 0.8,
"grad_norm": 1.7666120529174805,
"learning_rate": 2.0342201174134134e-06,
"loss": 4.7866,
"step": 36500
},
{
"epoch": 0.8,
"grad_norm": 0.5760579705238342,
"learning_rate": 2.0123960629405732e-06,
"loss": 4.7849,
"step": 36600
},
{
"epoch": 0.8,
"grad_norm": 1.0564417839050293,
"learning_rate": 1.9905720084677334e-06,
"loss": 4.7863,
"step": 36700
},
{
"epoch": 0.8,
"grad_norm": 0.7988053560256958,
"learning_rate": 1.9687479539948932e-06,
"loss": 4.7849,
"step": 36800
},
{
"epoch": 0.81,
"grad_norm": 1.6380457878112793,
"learning_rate": 1.9469238995220534e-06,
"loss": 4.786,
"step": 36900
},
{
"epoch": 0.81,
"grad_norm": 0.6611093282699585,
"learning_rate": 1.9250998450492132e-06,
"loss": 4.7862,
"step": 37000
},
{
"epoch": 0.81,
"grad_norm": 0.7461991310119629,
"learning_rate": 1.9032757905763734e-06,
"loss": 4.7859,
"step": 37100
},
{
"epoch": 0.81,
"grad_norm": 0.7583649158477783,
"learning_rate": 1.8814517361035334e-06,
"loss": 4.7871,
"step": 37200
},
{
"epoch": 0.81,
"grad_norm": 2.0912837982177734,
"learning_rate": 1.8596276816306934e-06,
"loss": 4.7834,
"step": 37300
},
{
"epoch": 0.82,
"grad_norm": 0.5466910004615784,
"learning_rate": 1.8378036271578536e-06,
"loss": 4.7869,
"step": 37400
},
{
"epoch": 0.82,
"grad_norm": 0.6969348192214966,
"learning_rate": 1.8159795726850136e-06,
"loss": 4.7834,
"step": 37500
},
{
"epoch": 0.82,
"grad_norm": 1.0343713760375977,
"learning_rate": 1.7941555182121736e-06,
"loss": 4.7849,
"step": 37600
},
{
"epoch": 0.82,
"grad_norm": 1.1310784816741943,
"learning_rate": 1.7723314637393336e-06,
"loss": 4.7817,
"step": 37700
},
{
"epoch": 0.82,
"grad_norm": 0.7537195682525635,
"learning_rate": 1.7505074092664936e-06,
"loss": 4.7873,
"step": 37800
},
{
"epoch": 0.83,
"grad_norm": 0.5658436417579651,
"learning_rate": 1.7286833547936538e-06,
"loss": 4.7837,
"step": 37900
},
{
"epoch": 0.83,
"grad_norm": 0.7453294396400452,
"learning_rate": 1.7068593003208136e-06,
"loss": 4.7842,
"step": 38000
},
{
"epoch": 0.83,
"grad_norm": 0.6501612663269043,
"learning_rate": 1.6850352458479738e-06,
"loss": 4.7868,
"step": 38100
},
{
"epoch": 0.83,
"grad_norm": 0.5469274520874023,
"learning_rate": 1.663211191375134e-06,
"loss": 4.7874,
"step": 38200
},
{
"epoch": 0.84,
"grad_norm": 0.8729673624038696,
"learning_rate": 1.6413871369022938e-06,
"loss": 4.7844,
"step": 38300
},
{
"epoch": 0.84,
"grad_norm": 0.5102872252464294,
"learning_rate": 1.619563082429454e-06,
"loss": 4.7833,
"step": 38400
},
{
"epoch": 0.84,
"grad_norm": 0.5068778395652771,
"learning_rate": 1.5977390279566138e-06,
"loss": 4.7854,
"step": 38500
},
{
"epoch": 0.84,
"grad_norm": 1.865955114364624,
"learning_rate": 1.575914973483774e-06,
"loss": 4.7869,
"step": 38600
},
{
"epoch": 0.84,
"grad_norm": 0.6757729053497314,
"learning_rate": 1.554090919010934e-06,
"loss": 4.7853,
"step": 38700
},
{
"epoch": 0.85,
"grad_norm": 1.1830861568450928,
"learning_rate": 1.532266864538094e-06,
"loss": 4.7843,
"step": 38800
},
{
"epoch": 0.85,
"grad_norm": 0.9592554569244385,
"learning_rate": 1.510442810065254e-06,
"loss": 4.783,
"step": 38900
},
{
"epoch": 0.85,
"grad_norm": 0.7421624064445496,
"learning_rate": 1.4886187555924142e-06,
"loss": 4.7852,
"step": 39000
},
{
"epoch": 0.85,
"grad_norm": 1.3924355506896973,
"learning_rate": 1.466794701119574e-06,
"loss": 4.7847,
"step": 39100
},
{
"epoch": 0.86,
"grad_norm": 1.4272788763046265,
"learning_rate": 1.4449706466467342e-06,
"loss": 4.7829,
"step": 39200
},
{
"epoch": 0.86,
"grad_norm": 0.593848466873169,
"learning_rate": 1.423146592173894e-06,
"loss": 4.7882,
"step": 39300
},
{
"epoch": 0.86,
"grad_norm": 0.579724133014679,
"learning_rate": 1.4013225377010542e-06,
"loss": 4.788,
"step": 39400
},
{
"epoch": 0.86,
"grad_norm": 0.6681183576583862,
"learning_rate": 1.3794984832282144e-06,
"loss": 4.7828,
"step": 39500
},
{
"epoch": 0.86,
"grad_norm": 0.9919196963310242,
"learning_rate": 1.3576744287553742e-06,
"loss": 4.7849,
"step": 39600
},
{
"epoch": 0.87,
"grad_norm": 1.0009275674819946,
"learning_rate": 1.3358503742825344e-06,
"loss": 4.7848,
"step": 39700
},
{
"epoch": 0.87,
"grad_norm": 1.044150710105896,
"learning_rate": 1.3140263198096944e-06,
"loss": 4.7865,
"step": 39800
},
{
"epoch": 0.87,
"grad_norm": 0.9887941479682922,
"learning_rate": 1.2922022653368544e-06,
"loss": 4.7861,
"step": 39900
},
{
"epoch": 0.87,
"grad_norm": 1.3287606239318848,
"learning_rate": 1.2703782108640146e-06,
"loss": 4.7831,
"step": 40000
},
{
"epoch": 0.88,
"grad_norm": 1.6255909204483032,
"learning_rate": 1.2485541563911746e-06,
"loss": 4.784,
"step": 40100
},
{
"epoch": 0.88,
"grad_norm": 2.1618168354034424,
"learning_rate": 1.2267301019183346e-06,
"loss": 4.7854,
"step": 40200
},
{
"epoch": 0.88,
"grad_norm": 1.4494011402130127,
"learning_rate": 1.2049060474454946e-06,
"loss": 4.7858,
"step": 40300
},
{
"epoch": 0.88,
"grad_norm": 0.5680764317512512,
"learning_rate": 1.1830819929726546e-06,
"loss": 4.7863,
"step": 40400
},
{
"epoch": 0.88,
"grad_norm": 0.6083593964576721,
"learning_rate": 1.1612579384998145e-06,
"loss": 4.7858,
"step": 40500
},
{
"epoch": 0.89,
"grad_norm": 1.0342472791671753,
"learning_rate": 1.1394338840269745e-06,
"loss": 4.7845,
"step": 40600
},
{
"epoch": 0.89,
"grad_norm": 0.8548548817634583,
"learning_rate": 1.1176098295541345e-06,
"loss": 4.7844,
"step": 40700
},
{
"epoch": 0.89,
"grad_norm": 1.0878795385360718,
"learning_rate": 1.0957857750812947e-06,
"loss": 4.7827,
"step": 40800
},
{
"epoch": 0.89,
"grad_norm": 0.6437754034996033,
"learning_rate": 1.0739617206084547e-06,
"loss": 4.785,
"step": 40900
},
{
"epoch": 0.89,
"grad_norm": 0.8510251641273499,
"learning_rate": 1.0521376661356147e-06,
"loss": 4.7863,
"step": 41000
},
{
"epoch": 0.9,
"grad_norm": 0.6618091464042664,
"learning_rate": 1.0303136116627747e-06,
"loss": 4.7857,
"step": 41100
},
{
"epoch": 0.9,
"grad_norm": 0.6428667902946472,
"learning_rate": 1.008489557189935e-06,
"loss": 4.7836,
"step": 41200
},
{
"epoch": 0.9,
"grad_norm": 0.7751622200012207,
"learning_rate": 9.86665502717095e-07,
"loss": 4.7838,
"step": 41300
},
{
"epoch": 0.9,
"grad_norm": 0.6780493855476379,
"learning_rate": 9.64841448244255e-07,
"loss": 4.785,
"step": 41400
},
{
"epoch": 0.91,
"grad_norm": 0.6410045623779297,
"learning_rate": 9.430173937714149e-07,
"loss": 4.7867,
"step": 41500
},
{
"epoch": 0.91,
"grad_norm": 0.6422233581542969,
"learning_rate": 9.21193339298575e-07,
"loss": 4.7864,
"step": 41600
},
{
"epoch": 0.91,
"grad_norm": 0.48585817217826843,
"learning_rate": 8.99369284825735e-07,
"loss": 4.7862,
"step": 41700
},
{
"epoch": 0.91,
"grad_norm": 1.0709576606750488,
"learning_rate": 8.77545230352895e-07,
"loss": 4.7845,
"step": 41800
},
{
"epoch": 0.91,
"grad_norm": 0.5594016313552856,
"learning_rate": 8.55721175880055e-07,
"loss": 4.7852,
"step": 41900
},
{
"epoch": 0.92,
"grad_norm": 0.6192421913146973,
"learning_rate": 8.338971214072151e-07,
"loss": 4.7857,
"step": 42000
},
{
"epoch": 0.92,
"grad_norm": 1.3113442659378052,
"learning_rate": 8.120730669343751e-07,
"loss": 4.7837,
"step": 42100
},
{
"epoch": 0.92,
"grad_norm": 0.6697332859039307,
"learning_rate": 7.902490124615351e-07,
"loss": 4.7861,
"step": 42200
},
{
"epoch": 0.92,
"grad_norm": 0.584178626537323,
"learning_rate": 7.684249579886951e-07,
"loss": 4.7852,
"step": 42300
},
{
"epoch": 0.93,
"grad_norm": 0.5884829759597778,
"learning_rate": 7.466009035158553e-07,
"loss": 4.7852,
"step": 42400
},
{
"epoch": 0.93,
"grad_norm": 0.7332740426063538,
"learning_rate": 7.247768490430153e-07,
"loss": 4.7843,
"step": 42500
},
{
"epoch": 0.93,
"grad_norm": 1.1577807664871216,
"learning_rate": 7.029527945701753e-07,
"loss": 4.7838,
"step": 42600
},
{
"epoch": 0.93,
"grad_norm": 0.6798880100250244,
"learning_rate": 6.811287400973353e-07,
"loss": 4.7824,
"step": 42700
},
{
"epoch": 0.93,
"grad_norm": 1.1418567895889282,
"learning_rate": 6.593046856244954e-07,
"loss": 4.7832,
"step": 42800
},
{
"epoch": 0.94,
"grad_norm": 0.8666886687278748,
"learning_rate": 6.374806311516554e-07,
"loss": 4.7836,
"step": 42900
},
{
"epoch": 0.94,
"grad_norm": 0.611757218837738,
"learning_rate": 6.156565766788154e-07,
"loss": 4.7881,
"step": 43000
},
{
"epoch": 0.94,
"grad_norm": 0.6125512719154358,
"learning_rate": 5.938325222059755e-07,
"loss": 4.784,
"step": 43100
},
{
"epoch": 0.94,
"grad_norm": 0.8485561609268188,
"learning_rate": 5.720084677331355e-07,
"loss": 4.7855,
"step": 43200
},
{
"epoch": 0.94,
"grad_norm": 1.1883187294006348,
"learning_rate": 5.501844132602956e-07,
"loss": 4.7849,
"step": 43300
},
{
"epoch": 0.95,
"grad_norm": 0.5186755657196045,
"learning_rate": 5.283603587874556e-07,
"loss": 4.7815,
"step": 43400
},
{
"epoch": 0.95,
"grad_norm": 0.8067595958709717,
"learning_rate": 5.065363043146156e-07,
"loss": 4.7834,
"step": 43500
},
{
"epoch": 0.95,
"grad_norm": 0.5256661176681519,
"learning_rate": 4.847122498417756e-07,
"loss": 4.7846,
"step": 43600
},
{
"epoch": 0.95,
"grad_norm": 0.5952714085578918,
"learning_rate": 4.628881953689357e-07,
"loss": 4.7824,
"step": 43700
},
{
"epoch": 0.96,
"grad_norm": 0.6391135454177856,
"learning_rate": 4.410641408960957e-07,
"loss": 4.7865,
"step": 43800
},
{
"epoch": 0.96,
"grad_norm": 0.5354276299476624,
"learning_rate": 4.192400864232558e-07,
"loss": 4.7834,
"step": 43900
},
{
"epoch": 0.96,
"grad_norm": 0.8023102283477783,
"learning_rate": 3.974160319504158e-07,
"loss": 4.7826,
"step": 44000
},
{
"epoch": 0.96,
"grad_norm": 0.9348645210266113,
"learning_rate": 3.755919774775758e-07,
"loss": 4.7842,
"step": 44100
},
{
"epoch": 0.96,
"grad_norm": 0.9189215302467346,
"learning_rate": 3.537679230047358e-07,
"loss": 4.7827,
"step": 44200
},
{
"epoch": 0.97,
"grad_norm": 0.6931596994400024,
"learning_rate": 3.319438685318959e-07,
"loss": 4.7866,
"step": 44300
},
{
"epoch": 0.97,
"grad_norm": 1.1738442182540894,
"learning_rate": 3.101198140590559e-07,
"loss": 4.7836,
"step": 44400
},
{
"epoch": 0.97,
"grad_norm": 0.6504749059677124,
"learning_rate": 2.882957595862159e-07,
"loss": 4.788,
"step": 44500
},
{
"epoch": 0.97,
"grad_norm": 0.6548625230789185,
"learning_rate": 2.6647170511337596e-07,
"loss": 4.7857,
"step": 44600
},
{
"epoch": 0.98,
"grad_norm": 0.621839165687561,
"learning_rate": 2.44647650640536e-07,
"loss": 4.782,
"step": 44700
},
{
"epoch": 0.98,
"grad_norm": 0.5665038228034973,
"learning_rate": 2.2282359616769603e-07,
"loss": 4.7858,
"step": 44800
},
{
"epoch": 0.98,
"grad_norm": 0.6663780808448792,
"learning_rate": 2.0099954169485608e-07,
"loss": 4.784,
"step": 44900
},
{
"epoch": 0.98,
"grad_norm": 0.4924936294555664,
"learning_rate": 1.791754872220161e-07,
"loss": 4.7842,
"step": 45000
}
],
"logging_steps": 100,
"max_steps": 45821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 1.4750802130806374e+18,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}