diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7030 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3535169785169785, + "eval_steps": 500, + "global_step": 500000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.596311569213867, + "learning_rate": 6.249999999999999e-07, + "loss": 9.2114, + "step": 500 + }, + { + "epoch": 0.0, + "grad_norm": 5.16270637512207, + "learning_rate": 1.2499999999999999e-06, + "loss": 7.3081, + "step": 1000 + }, + { + "epoch": 0.0, + "grad_norm": 3.267263412475586, + "learning_rate": 1.875e-06, + "loss": 6.1634, + "step": 1500 + }, + { + "epoch": 0.01, + "grad_norm": 2.1591992378234863, + "learning_rate": 2.4999999999999998e-06, + "loss": 5.421, + "step": 2000 + }, + { + "epoch": 0.01, + "grad_norm": 2.160722494125366, + "learning_rate": 3.125e-06, + "loss": 4.856, + "step": 2500 + }, + { + "epoch": 0.01, + "grad_norm": 2.330343723297119, + "learning_rate": 3.75e-06, + "loss": 4.3826, + "step": 3000 + }, + { + "epoch": 0.01, + "grad_norm": 2.1275618076324463, + "learning_rate": 4.3750000000000005e-06, + "loss": 3.9848, + "step": 3500 + }, + { + "epoch": 0.01, + "grad_norm": 2.1402294635772705, + "learning_rate": 4.9999999999999996e-06, + "loss": 3.6491, + "step": 4000 + }, + { + "epoch": 0.01, + "grad_norm": 2.159619092941284, + "learning_rate": 5.625e-06, + "loss": 3.3861, + "step": 4500 + }, + { + "epoch": 0.01, + "grad_norm": 2.5018796920776367, + "learning_rate": 6.25e-06, + "loss": 3.1845, + "step": 5000 + }, + { + "epoch": 0.01, + "grad_norm": 1.9996334314346313, + "learning_rate": 6.875e-06, + "loss": 3.0335, + "step": 5500 + }, + { + "epoch": 0.02, + "grad_norm": 2.103320598602295, + "learning_rate": 7.5e-06, + "loss": 2.9096, + "step": 6000 + }, + { + "epoch": 0.02, + "grad_norm": 2.025847911834717, + "learning_rate": 8.125e-06, + "loss": 2.8088, + "step": 6500 + }, + { + "epoch": 0.02, + "grad_norm": 2.030522108078003, + "learning_rate": 8.750000000000001e-06, + "loss": 2.7156, + "step": 7000 + }, + { + "epoch": 0.02, + "grad_norm": 1.992558479309082, + "learning_rate": 9.375000000000001e-06, + "loss": 2.6262, + "step": 7500 + }, + { + "epoch": 0.02, + "grad_norm": 2.1512062549591064, + "learning_rate": 9.999999999999999e-06, + "loss": 2.5432, + "step": 8000 + }, + { + "epoch": 0.02, + "grad_norm": 2.0734474658966064, + "learning_rate": 1.0625e-05, + "loss": 2.4722, + "step": 8500 + }, + { + "epoch": 0.02, + "grad_norm": 1.9478808641433716, + "learning_rate": 1.125e-05, + "loss": 2.4111, + "step": 9000 + }, + { + "epoch": 0.03, + "grad_norm": 1.762665033340454, + "learning_rate": 1.1874999999999999e-05, + "loss": 2.3521, + "step": 9500 + }, + { + "epoch": 0.03, + "grad_norm": 1.8274019956588745, + "learning_rate": 1.25e-05, + "loss": 2.3099, + "step": 10000 + }, + { + "epoch": 0.03, + "grad_norm": 1.905918002128601, + "learning_rate": 1.3125e-05, + "loss": 2.2629, + "step": 10500 + }, + { + "epoch": 0.03, + "grad_norm": 1.8081414699554443, + "learning_rate": 1.375e-05, + "loss": 2.2239, + "step": 11000 + }, + { + "epoch": 0.03, + "grad_norm": 7.712226867675781, + "learning_rate": 1.4375e-05, + "loss": 2.1907, + "step": 11500 + }, + { + "epoch": 0.03, + "grad_norm": 1.6963427066802979, + "learning_rate": 1.5e-05, + "loss": 2.1602, + "step": 12000 + }, + { + "epoch": 0.03, + "grad_norm": 1.717537522315979, + "learning_rate": 1.5625e-05, + "loss": 2.1384, + "step": 12500 + }, + { + "epoch": 0.04, + "grad_norm": 1.745806336402893, + "learning_rate": 1.625e-05, + "loss": 2.1061, + "step": 13000 + }, + { + "epoch": 0.04, + "grad_norm": 1.7633601427078247, + "learning_rate": 1.6875e-05, + "loss": 2.0838, + "step": 13500 + }, + { + "epoch": 0.04, + "grad_norm": 1.7061880826950073, + "learning_rate": 1.7500000000000002e-05, + "loss": 2.0648, + "step": 14000 + }, + { + "epoch": 0.04, + "grad_norm": 1.7471063137054443, + "learning_rate": 1.8125e-05, + "loss": 2.0462, + "step": 14500 + }, + { + "epoch": 0.04, + "grad_norm": 1.705340027809143, + "learning_rate": 1.8750000000000002e-05, + "loss": 2.0281, + "step": 15000 + }, + { + "epoch": 0.04, + "grad_norm": 41.675968170166016, + "learning_rate": 1.9375e-05, + "loss": 2.003, + "step": 15500 + }, + { + "epoch": 0.04, + "grad_norm": 1.737722396850586, + "learning_rate": 1.9999999999999998e-05, + "loss": 1.9914, + "step": 16000 + }, + { + "epoch": 0.04, + "grad_norm": 1.8232406377792358, + "learning_rate": 2.0625e-05, + "loss": 1.9724, + "step": 16500 + }, + { + "epoch": 0.05, + "grad_norm": 1.8312487602233887, + "learning_rate": 2.125e-05, + "loss": 1.9577, + "step": 17000 + }, + { + "epoch": 0.05, + "grad_norm": 2.025630235671997, + "learning_rate": 2.1875e-05, + "loss": 1.9411, + "step": 17500 + }, + { + "epoch": 0.05, + "grad_norm": 1.9454607963562012, + "learning_rate": 2.25e-05, + "loss": 1.9263, + "step": 18000 + }, + { + "epoch": 0.05, + "grad_norm": 1.637341856956482, + "learning_rate": 2.3125000000000003e-05, + "loss": 1.9221, + "step": 18500 + }, + { + "epoch": 0.05, + "grad_norm": 1.846366286277771, + "learning_rate": 2.3749999999999998e-05, + "loss": 1.9086, + "step": 19000 + }, + { + "epoch": 0.05, + "grad_norm": 1.802040457725525, + "learning_rate": 2.4375e-05, + "loss": 1.8961, + "step": 19500 + }, + { + "epoch": 0.05, + "grad_norm": 1.7378031015396118, + "learning_rate": 2.5e-05, + "loss": 1.8893, + "step": 20000 + }, + { + "epoch": 0.06, + "grad_norm": 1.6410856246948242, + "learning_rate": 2.5625e-05, + "loss": 1.8752, + "step": 20500 + }, + { + "epoch": 0.06, + "grad_norm": 1.7153388261795044, + "learning_rate": 2.625e-05, + "loss": 1.862, + "step": 21000 + }, + { + "epoch": 0.06, + "grad_norm": 1.6210004091262817, + "learning_rate": 2.6875000000000003e-05, + "loss": 1.855, + "step": 21500 + }, + { + "epoch": 0.06, + "grad_norm": 1.6593818664550781, + "learning_rate": 2.75e-05, + "loss": 1.8478, + "step": 22000 + }, + { + "epoch": 0.06, + "grad_norm": 1.659287691116333, + "learning_rate": 2.8125e-05, + "loss": 1.8353, + "step": 22500 + }, + { + "epoch": 0.06, + "grad_norm": 1.703875184059143, + "learning_rate": 2.875e-05, + "loss": 1.8288, + "step": 23000 + }, + { + "epoch": 0.06, + "grad_norm": 1.7122712135314941, + "learning_rate": 2.9375e-05, + "loss": 1.8289, + "step": 23500 + }, + { + "epoch": 0.06, + "grad_norm": 1.6744304895401, + "learning_rate": 3e-05, + "loss": 1.8219, + "step": 24000 + }, + { + "epoch": 0.07, + "grad_norm": 1.7783963680267334, + "learning_rate": 2.9968487394957983e-05, + "loss": 1.8141, + "step": 24500 + }, + { + "epoch": 0.07, + "grad_norm": 1.7388477325439453, + "learning_rate": 2.9936974789915968e-05, + "loss": 1.805, + "step": 25000 + }, + { + "epoch": 0.07, + "grad_norm": 1.6574689149856567, + "learning_rate": 2.990546218487395e-05, + "loss": 1.8005, + "step": 25500 + }, + { + "epoch": 0.07, + "grad_norm": 1.6803966760635376, + "learning_rate": 2.9873949579831935e-05, + "loss": 1.7902, + "step": 26000 + }, + { + "epoch": 0.07, + "grad_norm": 1.6314315795898438, + "learning_rate": 2.9842436974789916e-05, + "loss": 1.7832, + "step": 26500 + }, + { + "epoch": 0.07, + "grad_norm": 1.6180912256240845, + "learning_rate": 2.98109243697479e-05, + "loss": 1.7774, + "step": 27000 + }, + { + "epoch": 0.07, + "grad_norm": 1.6669533252716064, + "learning_rate": 2.9779411764705883e-05, + "loss": 1.774, + "step": 27500 + }, + { + "epoch": 0.08, + "grad_norm": 1.5653916597366333, + "learning_rate": 2.9747899159663868e-05, + "loss": 1.7673, + "step": 28000 + }, + { + "epoch": 0.08, + "grad_norm": 1.6632215976715088, + "learning_rate": 2.971638655462185e-05, + "loss": 1.7639, + "step": 28500 + }, + { + "epoch": 0.08, + "grad_norm": 1.6262154579162598, + "learning_rate": 2.9684873949579835e-05, + "loss": 1.757, + "step": 29000 + }, + { + "epoch": 0.08, + "grad_norm": 4.847783088684082, + "learning_rate": 2.9653361344537817e-05, + "loss": 1.9286, + "step": 29500 + }, + { + "epoch": 0.08, + "grad_norm": 2.6416807174682617, + "learning_rate": 2.9621848739495802e-05, + "loss": 3.7773, + "step": 30000 + }, + { + "epoch": 0.08, + "grad_norm": 3.4526023864746094, + "learning_rate": 2.9590336134453784e-05, + "loss": 4.3611, + "step": 30500 + }, + { + "epoch": 0.08, + "grad_norm": 65.76104736328125, + "learning_rate": 2.9558823529411766e-05, + "loss": 4.5628, + "step": 31000 + }, + { + "epoch": 0.09, + "grad_norm": 6.145516395568848, + "learning_rate": 2.9527310924369747e-05, + "loss": 4.388, + "step": 31500 + }, + { + "epoch": 0.09, + "grad_norm": 4.991481781005859, + "learning_rate": 2.949579831932773e-05, + "loss": 4.1991, + "step": 32000 + }, + { + "epoch": 0.09, + "grad_norm": 2.632403612136841, + "learning_rate": 2.9464285714285714e-05, + "loss": 3.7935, + "step": 32500 + }, + { + "epoch": 0.09, + "grad_norm": 3.691666841506958, + "learning_rate": 2.9432773109243696e-05, + "loss": 3.4704, + "step": 33000 + }, + { + "epoch": 0.09, + "grad_norm": 14.81291675567627, + "learning_rate": 2.940126050420168e-05, + "loss": 2.6663, + "step": 33500 + }, + { + "epoch": 0.09, + "grad_norm": 2.4295215606689453, + "learning_rate": 2.9369747899159663e-05, + "loss": 2.4661, + "step": 34000 + }, + { + "epoch": 0.09, + "grad_norm": 52.97163391113281, + "learning_rate": 2.9338235294117648e-05, + "loss": 2.1129, + "step": 34500 + }, + { + "epoch": 0.09, + "grad_norm": 2.337153196334839, + "learning_rate": 2.930672268907563e-05, + "loss": 1.7961, + "step": 35000 + }, + { + "epoch": 0.1, + "grad_norm": 6.669353008270264, + "learning_rate": 2.9275210084033615e-05, + "loss": 1.7907, + "step": 35500 + }, + { + "epoch": 0.1, + "grad_norm": 1.5874249935150146, + "learning_rate": 2.9243697478991596e-05, + "loss": 1.7663, + "step": 36000 + }, + { + "epoch": 0.1, + "grad_norm": 1.7114965915679932, + "learning_rate": 2.921218487394958e-05, + "loss": 1.7439, + "step": 36500 + }, + { + "epoch": 0.1, + "grad_norm": 1.8134816884994507, + "learning_rate": 2.9180672268907563e-05, + "loss": 1.7361, + "step": 37000 + }, + { + "epoch": 0.1, + "grad_norm": 1.505012035369873, + "learning_rate": 2.9149159663865545e-05, + "loss": 1.7323, + "step": 37500 + }, + { + "epoch": 0.1, + "grad_norm": 1.6047751903533936, + "learning_rate": 2.911764705882353e-05, + "loss": 1.7212, + "step": 38000 + }, + { + "epoch": 0.1, + "grad_norm": 1.5497486591339111, + "learning_rate": 2.9086134453781512e-05, + "loss": 1.7215, + "step": 38500 + }, + { + "epoch": 0.11, + "grad_norm": 1.5367647409439087, + "learning_rate": 2.9054621848739497e-05, + "loss": 1.7027, + "step": 39000 + }, + { + "epoch": 0.11, + "grad_norm": 4.223250865936279, + "learning_rate": 2.902310924369748e-05, + "loss": 1.6914, + "step": 39500 + }, + { + "epoch": 0.11, + "grad_norm": 1.5872981548309326, + "learning_rate": 2.8991596638655464e-05, + "loss": 1.6878, + "step": 40000 + }, + { + "epoch": 0.11, + "grad_norm": 1.5480022430419922, + "learning_rate": 2.8960084033613446e-05, + "loss": 1.6816, + "step": 40500 + }, + { + "epoch": 0.11, + "grad_norm": 1.5464568138122559, + "learning_rate": 2.892857142857143e-05, + "loss": 1.6796, + "step": 41000 + }, + { + "epoch": 0.11, + "grad_norm": 1.557543158531189, + "learning_rate": 2.8897058823529413e-05, + "loss": 1.6709, + "step": 41500 + }, + { + "epoch": 0.11, + "grad_norm": 1.5462812185287476, + "learning_rate": 2.8865546218487398e-05, + "loss": 1.6728, + "step": 42000 + }, + { + "epoch": 0.12, + "grad_norm": 1.5833927392959595, + "learning_rate": 2.883403361344538e-05, + "loss": 1.6676, + "step": 42500 + }, + { + "epoch": 0.12, + "grad_norm": 1.63410222530365, + "learning_rate": 2.8802521008403365e-05, + "loss": 1.6696, + "step": 43000 + }, + { + "epoch": 0.12, + "grad_norm": 1.4682618379592896, + "learning_rate": 2.8771008403361346e-05, + "loss": 1.6693, + "step": 43500 + }, + { + "epoch": 0.12, + "grad_norm": 1.5386840105056763, + "learning_rate": 2.8739495798319328e-05, + "loss": 1.6602, + "step": 44000 + }, + { + "epoch": 0.12, + "grad_norm": 1.5572445392608643, + "learning_rate": 2.8707983193277313e-05, + "loss": 1.6581, + "step": 44500 + }, + { + "epoch": 0.12, + "grad_norm": 1.5247888565063477, + "learning_rate": 2.8676470588235295e-05, + "loss": 1.6546, + "step": 45000 + }, + { + "epoch": 0.12, + "grad_norm": 1.5297437906265259, + "learning_rate": 2.864495798319328e-05, + "loss": 1.6467, + "step": 45500 + }, + { + "epoch": 0.12, + "grad_norm": 1.5252556800842285, + "learning_rate": 2.8613445378151262e-05, + "loss": 1.6504, + "step": 46000 + }, + { + "epoch": 0.13, + "grad_norm": 1.4626063108444214, + "learning_rate": 2.8581932773109244e-05, + "loss": 1.6441, + "step": 46500 + }, + { + "epoch": 0.13, + "grad_norm": 1.511093020439148, + "learning_rate": 2.8550420168067225e-05, + "loss": 1.6433, + "step": 47000 + }, + { + "epoch": 0.13, + "grad_norm": 1.572654366493225, + "learning_rate": 2.851890756302521e-05, + "loss": 1.6527, + "step": 47500 + }, + { + "epoch": 0.13, + "grad_norm": 1.5643205642700195, + "learning_rate": 2.8487394957983192e-05, + "loss": 1.6376, + "step": 48000 + }, + { + "epoch": 0.13, + "grad_norm": 1.497128963470459, + "learning_rate": 2.8455882352941177e-05, + "loss": 1.6397, + "step": 48500 + }, + { + "epoch": 0.13, + "grad_norm": 1.464203953742981, + "learning_rate": 2.842436974789916e-05, + "loss": 1.6358, + "step": 49000 + }, + { + "epoch": 0.13, + "grad_norm": 1.8414405584335327, + "learning_rate": 2.8392857142857144e-05, + "loss": 1.6366, + "step": 49500 + }, + { + "epoch": 0.14, + "grad_norm": 1.7834322452545166, + "learning_rate": 2.8361344537815126e-05, + "loss": 1.642, + "step": 50000 + }, + { + "epoch": 0.14, + "grad_norm": 1.477858304977417, + "learning_rate": 2.8329831932773108e-05, + "loss": 1.6342, + "step": 50500 + }, + { + "epoch": 0.14, + "grad_norm": 1.5328236818313599, + "learning_rate": 2.8298319327731093e-05, + "loss": 1.6333, + "step": 51000 + }, + { + "epoch": 0.14, + "grad_norm": 1.540300965309143, + "learning_rate": 2.8266806722689075e-05, + "loss": 1.6352, + "step": 51500 + }, + { + "epoch": 0.14, + "grad_norm": 1.8767386674880981, + "learning_rate": 2.823529411764706e-05, + "loss": 1.6328, + "step": 52000 + }, + { + "epoch": 0.14, + "grad_norm": 1.5387629270553589, + "learning_rate": 2.820378151260504e-05, + "loss": 1.632, + "step": 52500 + }, + { + "epoch": 0.14, + "grad_norm": 1.6315770149230957, + "learning_rate": 2.8172268907563027e-05, + "loss": 1.627, + "step": 53000 + }, + { + "epoch": 0.14, + "grad_norm": 5.726038455963135, + "learning_rate": 2.814075630252101e-05, + "loss": 1.6293, + "step": 53500 + }, + { + "epoch": 0.15, + "grad_norm": 1.5697258710861206, + "learning_rate": 2.8109243697478993e-05, + "loss": 1.6211, + "step": 54000 + }, + { + "epoch": 0.15, + "grad_norm": 1.5938401222229004, + "learning_rate": 2.8077731092436975e-05, + "loss": 1.6196, + "step": 54500 + }, + { + "epoch": 0.15, + "grad_norm": 1.5256606340408325, + "learning_rate": 2.804621848739496e-05, + "loss": 1.6177, + "step": 55000 + }, + { + "epoch": 0.15, + "grad_norm": 2.223390817642212, + "learning_rate": 2.8014705882352942e-05, + "loss": 1.6246, + "step": 55500 + }, + { + "epoch": 0.15, + "grad_norm": 1.4948030710220337, + "learning_rate": 2.7983193277310927e-05, + "loss": 1.6239, + "step": 56000 + }, + { + "epoch": 0.15, + "grad_norm": 1.5147298574447632, + "learning_rate": 2.795168067226891e-05, + "loss": 1.6164, + "step": 56500 + }, + { + "epoch": 0.15, + "grad_norm": 1.5068755149841309, + "learning_rate": 2.792016806722689e-05, + "loss": 1.612, + "step": 57000 + }, + { + "epoch": 0.16, + "grad_norm": 1.5074622631072998, + "learning_rate": 2.7888655462184876e-05, + "loss": 1.6113, + "step": 57500 + }, + { + "epoch": 0.16, + "grad_norm": 1.4880355596542358, + "learning_rate": 2.7857142857142858e-05, + "loss": 1.6102, + "step": 58000 + }, + { + "epoch": 0.16, + "grad_norm": 1.6379941701889038, + "learning_rate": 2.7825630252100843e-05, + "loss": 1.6084, + "step": 58500 + }, + { + "epoch": 0.16, + "grad_norm": 1.4973347187042236, + "learning_rate": 2.7794117647058824e-05, + "loss": 1.6007, + "step": 59000 + }, + { + "epoch": 0.16, + "grad_norm": 1.5474885702133179, + "learning_rate": 2.776260504201681e-05, + "loss": 1.6042, + "step": 59500 + }, + { + "epoch": 0.16, + "grad_norm": 1.602220058441162, + "learning_rate": 2.773109243697479e-05, + "loss": 1.6106, + "step": 60000 + }, + { + "epoch": 0.16, + "grad_norm": 1.6185747385025024, + "learning_rate": 2.7699579831932776e-05, + "loss": 1.6058, + "step": 60500 + }, + { + "epoch": 0.17, + "grad_norm": 1.56905996799469, + "learning_rate": 2.7668067226890758e-05, + "loss": 1.6013, + "step": 61000 + }, + { + "epoch": 0.17, + "grad_norm": 1.5619949102401733, + "learning_rate": 2.763655462184874e-05, + "loss": 1.6034, + "step": 61500 + }, + { + "epoch": 0.17, + "grad_norm": 1.504239559173584, + "learning_rate": 2.7605042016806722e-05, + "loss": 1.6057, + "step": 62000 + }, + { + "epoch": 0.17, + "grad_norm": 1.4879348278045654, + "learning_rate": 2.7573529411764707e-05, + "loss": 1.6021, + "step": 62500 + }, + { + "epoch": 0.17, + "grad_norm": 1.5099623203277588, + "learning_rate": 2.754201680672269e-05, + "loss": 1.6026, + "step": 63000 + }, + { + "epoch": 0.17, + "grad_norm": 1.4979091882705688, + "learning_rate": 2.751050420168067e-05, + "loss": 1.5986, + "step": 63500 + }, + { + "epoch": 0.17, + "grad_norm": 1.4825040102005005, + "learning_rate": 2.7478991596638655e-05, + "loss": 1.5957, + "step": 64000 + }, + { + "epoch": 0.17, + "grad_norm": 1.493453860282898, + "learning_rate": 2.7447478991596637e-05, + "loss": 1.5989, + "step": 64500 + }, + { + "epoch": 0.18, + "grad_norm": 1.530388593673706, + "learning_rate": 2.7415966386554622e-05, + "loss": 1.5953, + "step": 65000 + }, + { + "epoch": 0.18, + "grad_norm": 1.5459638833999634, + "learning_rate": 2.7384453781512604e-05, + "loss": 1.5957, + "step": 65500 + }, + { + "epoch": 0.18, + "grad_norm": 2.0421242713928223, + "learning_rate": 2.735294117647059e-05, + "loss": 1.5984, + "step": 66000 + }, + { + "epoch": 0.18, + "grad_norm": 1.4634993076324463, + "learning_rate": 2.732142857142857e-05, + "loss": 1.5897, + "step": 66500 + }, + { + "epoch": 0.18, + "grad_norm": 1.530594825744629, + "learning_rate": 2.7289915966386556e-05, + "loss": 1.5902, + "step": 67000 + }, + { + "epoch": 0.18, + "grad_norm": 1.5332798957824707, + "learning_rate": 2.7258403361344538e-05, + "loss": 1.5874, + "step": 67500 + }, + { + "epoch": 0.18, + "grad_norm": 1.753754734992981, + "learning_rate": 2.7226890756302523e-05, + "loss": 1.59, + "step": 68000 + }, + { + "epoch": 0.19, + "grad_norm": 1.5545145273208618, + "learning_rate": 2.7195378151260505e-05, + "loss": 1.5949, + "step": 68500 + }, + { + "epoch": 0.19, + "grad_norm": 1.5194141864776611, + "learning_rate": 2.716386554621849e-05, + "loss": 1.588, + "step": 69000 + }, + { + "epoch": 0.19, + "grad_norm": 1.532632827758789, + "learning_rate": 2.713235294117647e-05, + "loss": 1.5918, + "step": 69500 + }, + { + "epoch": 0.19, + "grad_norm": 1.4970754384994507, + "learning_rate": 2.7100840336134453e-05, + "loss": 1.5851, + "step": 70000 + }, + { + "epoch": 0.19, + "grad_norm": 1.4157612323760986, + "learning_rate": 2.706932773109244e-05, + "loss": 1.5823, + "step": 70500 + }, + { + "epoch": 0.19, + "grad_norm": 1.5014020204544067, + "learning_rate": 2.703781512605042e-05, + "loss": 1.5847, + "step": 71000 + }, + { + "epoch": 0.19, + "grad_norm": 1.4652481079101562, + "learning_rate": 2.7006302521008405e-05, + "loss": 1.5886, + "step": 71500 + }, + { + "epoch": 0.19, + "grad_norm": 1.5810528993606567, + "learning_rate": 2.6974789915966387e-05, + "loss": 1.5805, + "step": 72000 + }, + { + "epoch": 0.2, + "grad_norm": 1.4908738136291504, + "learning_rate": 2.6943277310924372e-05, + "loss": 1.5812, + "step": 72500 + }, + { + "epoch": 0.2, + "grad_norm": 1.4520491361618042, + "learning_rate": 2.6911764705882354e-05, + "loss": 1.5837, + "step": 73000 + }, + { + "epoch": 0.2, + "grad_norm": 1.46824049949646, + "learning_rate": 2.688025210084034e-05, + "loss": 1.5778, + "step": 73500 + }, + { + "epoch": 0.2, + "grad_norm": 1.5032325983047485, + "learning_rate": 2.684873949579832e-05, + "loss": 1.5777, + "step": 74000 + }, + { + "epoch": 0.2, + "grad_norm": 1.5338232517242432, + "learning_rate": 2.6817226890756306e-05, + "loss": 1.5768, + "step": 74500 + }, + { + "epoch": 0.2, + "grad_norm": 1.5439281463623047, + "learning_rate": 2.6785714285714288e-05, + "loss": 1.5782, + "step": 75000 + }, + { + "epoch": 0.2, + "grad_norm": 1.536665439605713, + "learning_rate": 2.675420168067227e-05, + "loss": 1.5758, + "step": 75500 + }, + { + "epoch": 0.21, + "grad_norm": 1.4520212411880493, + "learning_rate": 2.6722689075630255e-05, + "loss": 1.5732, + "step": 76000 + }, + { + "epoch": 0.21, + "grad_norm": 1.5352224111557007, + "learning_rate": 2.6691176470588233e-05, + "loss": 1.5745, + "step": 76500 + }, + { + "epoch": 0.21, + "grad_norm": 1.4939314126968384, + "learning_rate": 2.6659663865546218e-05, + "loss": 1.5724, + "step": 77000 + }, + { + "epoch": 0.21, + "grad_norm": 1.4967976808547974, + "learning_rate": 2.66281512605042e-05, + "loss": 1.5693, + "step": 77500 + }, + { + "epoch": 0.21, + "grad_norm": 1.4980648756027222, + "learning_rate": 2.6596638655462185e-05, + "loss": 1.5721, + "step": 78000 + }, + { + "epoch": 0.21, + "grad_norm": 1.5700784921646118, + "learning_rate": 2.6565126050420167e-05, + "loss": 1.5713, + "step": 78500 + }, + { + "epoch": 0.21, + "grad_norm": 1.5124626159667969, + "learning_rate": 2.6533613445378152e-05, + "loss": 1.5709, + "step": 79000 + }, + { + "epoch": 0.22, + "grad_norm": 1.465012788772583, + "learning_rate": 2.6502100840336134e-05, + "loss": 1.5702, + "step": 79500 + }, + { + "epoch": 0.22, + "grad_norm": 1.4589452743530273, + "learning_rate": 2.647058823529412e-05, + "loss": 1.5675, + "step": 80000 + }, + { + "epoch": 0.22, + "grad_norm": 1.547255516052246, + "learning_rate": 2.64390756302521e-05, + "loss": 1.567, + "step": 80500 + }, + { + "epoch": 0.22, + "grad_norm": 1.5208017826080322, + "learning_rate": 2.6407563025210086e-05, + "loss": 1.5654, + "step": 81000 + }, + { + "epoch": 0.22, + "grad_norm": 1.563560128211975, + "learning_rate": 2.6376050420168067e-05, + "loss": 1.5651, + "step": 81500 + }, + { + "epoch": 0.22, + "grad_norm": 1.4551901817321777, + "learning_rate": 2.634453781512605e-05, + "loss": 1.5692, + "step": 82000 + }, + { + "epoch": 0.22, + "grad_norm": 3.783536672592163, + "learning_rate": 2.6313025210084034e-05, + "loss": 1.5698, + "step": 82500 + }, + { + "epoch": 0.22, + "grad_norm": 1.5397638082504272, + "learning_rate": 2.6281512605042016e-05, + "loss": 1.5614, + "step": 83000 + }, + { + "epoch": 0.23, + "grad_norm": 1.5307060480117798, + "learning_rate": 2.625e-05, + "loss": 1.5596, + "step": 83500 + }, + { + "epoch": 0.23, + "grad_norm": 1.5148283243179321, + "learning_rate": 2.6218487394957983e-05, + "loss": 1.5612, + "step": 84000 + }, + { + "epoch": 0.23, + "grad_norm": 1.531973958015442, + "learning_rate": 2.6186974789915968e-05, + "loss": 1.559, + "step": 84500 + }, + { + "epoch": 0.23, + "grad_norm": 1.5402531623840332, + "learning_rate": 2.615546218487395e-05, + "loss": 1.5624, + "step": 85000 + }, + { + "epoch": 0.23, + "grad_norm": 1.486365795135498, + "learning_rate": 2.6123949579831935e-05, + "loss": 1.5601, + "step": 85500 + }, + { + "epoch": 0.23, + "grad_norm": 1.513438105583191, + "learning_rate": 2.6092436974789917e-05, + "loss": 1.5567, + "step": 86000 + }, + { + "epoch": 0.23, + "grad_norm": 1.5112252235412598, + "learning_rate": 2.6060924369747902e-05, + "loss": 1.5574, + "step": 86500 + }, + { + "epoch": 0.24, + "grad_norm": 1.4394776821136475, + "learning_rate": 2.6029411764705883e-05, + "loss": 1.5562, + "step": 87000 + }, + { + "epoch": 0.24, + "grad_norm": 1.6592140197753906, + "learning_rate": 2.599789915966387e-05, + "loss": 1.5551, + "step": 87500 + }, + { + "epoch": 0.24, + "grad_norm": 1.4790719747543335, + "learning_rate": 2.596638655462185e-05, + "loss": 1.5544, + "step": 88000 + }, + { + "epoch": 0.24, + "grad_norm": 1.4369221925735474, + "learning_rate": 2.5934873949579832e-05, + "loss": 1.5538, + "step": 88500 + }, + { + "epoch": 0.24, + "grad_norm": 1.5175668001174927, + "learning_rate": 2.5903361344537817e-05, + "loss": 1.5556, + "step": 89000 + }, + { + "epoch": 0.24, + "grad_norm": 1.4514554738998413, + "learning_rate": 2.58718487394958e-05, + "loss": 1.5539, + "step": 89500 + }, + { + "epoch": 0.24, + "grad_norm": 1.4288485050201416, + "learning_rate": 2.5840336134453784e-05, + "loss": 1.5525, + "step": 90000 + }, + { + "epoch": 0.24, + "grad_norm": 1.546531081199646, + "learning_rate": 2.5808823529411766e-05, + "loss": 1.5527, + "step": 90500 + }, + { + "epoch": 0.25, + "grad_norm": 1.567368507385254, + "learning_rate": 2.5777310924369748e-05, + "loss": 1.5491, + "step": 91000 + }, + { + "epoch": 0.25, + "grad_norm": 1.5126845836639404, + "learning_rate": 2.574579831932773e-05, + "loss": 1.5504, + "step": 91500 + }, + { + "epoch": 0.25, + "grad_norm": 1.5570114850997925, + "learning_rate": 2.5714285714285714e-05, + "loss": 1.5469, + "step": 92000 + }, + { + "epoch": 0.25, + "grad_norm": 1.4678915739059448, + "learning_rate": 2.5682773109243696e-05, + "loss": 1.5493, + "step": 92500 + }, + { + "epoch": 0.25, + "grad_norm": 1.4618594646453857, + "learning_rate": 2.565126050420168e-05, + "loss": 1.555, + "step": 93000 + }, + { + "epoch": 0.25, + "grad_norm": 1.5945430994033813, + "learning_rate": 2.5619747899159663e-05, + "loss": 1.547, + "step": 93500 + }, + { + "epoch": 0.25, + "grad_norm": 1.4740761518478394, + "learning_rate": 2.5588235294117648e-05, + "loss": 1.5463, + "step": 94000 + }, + { + "epoch": 0.26, + "grad_norm": 1.4022290706634521, + "learning_rate": 2.555672268907563e-05, + "loss": 1.5449, + "step": 94500 + }, + { + "epoch": 0.26, + "grad_norm": 2.622828722000122, + "learning_rate": 2.552521008403361e-05, + "loss": 1.55, + "step": 95000 + }, + { + "epoch": 0.26, + "grad_norm": 1.409568428993225, + "learning_rate": 2.5493697478991597e-05, + "loss": 1.5436, + "step": 95500 + }, + { + "epoch": 0.26, + "grad_norm": 1.4889922142028809, + "learning_rate": 2.546218487394958e-05, + "loss": 1.5441, + "step": 96000 + }, + { + "epoch": 0.26, + "grad_norm": 1.4589875936508179, + "learning_rate": 2.5430672268907564e-05, + "loss": 1.5468, + "step": 96500 + }, + { + "epoch": 0.26, + "grad_norm": 1.4680520296096802, + "learning_rate": 2.5399159663865545e-05, + "loss": 1.5429, + "step": 97000 + }, + { + "epoch": 0.26, + "grad_norm": 1.4456883668899536, + "learning_rate": 2.536764705882353e-05, + "loss": 1.5458, + "step": 97500 + }, + { + "epoch": 0.27, + "grad_norm": 1.4655406475067139, + "learning_rate": 2.5336134453781512e-05, + "loss": 1.5399, + "step": 98000 + }, + { + "epoch": 0.27, + "grad_norm": 7.581863880157471, + "learning_rate": 2.5304621848739497e-05, + "loss": 1.5423, + "step": 98500 + }, + { + "epoch": 0.27, + "grad_norm": 1.5289582014083862, + "learning_rate": 2.527310924369748e-05, + "loss": 1.5434, + "step": 99000 + }, + { + "epoch": 0.27, + "grad_norm": 1.475637674331665, + "learning_rate": 2.5241596638655464e-05, + "loss": 1.5415, + "step": 99500 + }, + { + "epoch": 0.27, + "grad_norm": 1.45746910572052, + "learning_rate": 2.5210084033613446e-05, + "loss": 1.5401, + "step": 100000 + }, + { + "epoch": 0.27, + "grad_norm": 1.4924384355545044, + "learning_rate": 2.517857142857143e-05, + "loss": 1.5382, + "step": 100500 + }, + { + "epoch": 0.27, + "grad_norm": 1.4440650939941406, + "learning_rate": 2.5147058823529413e-05, + "loss": 1.539, + "step": 101000 + }, + { + "epoch": 0.27, + "grad_norm": 1.5022001266479492, + "learning_rate": 2.5115546218487395e-05, + "loss": 1.5375, + "step": 101500 + }, + { + "epoch": 0.28, + "grad_norm": 1.4573357105255127, + "learning_rate": 2.508403361344538e-05, + "loss": 1.5423, + "step": 102000 + }, + { + "epoch": 0.28, + "grad_norm": 1.4948347806930542, + "learning_rate": 2.505252100840336e-05, + "loss": 1.538, + "step": 102500 + }, + { + "epoch": 0.28, + "grad_norm": 1.5028940439224243, + "learning_rate": 2.5021008403361347e-05, + "loss": 1.5368, + "step": 103000 + }, + { + "epoch": 0.28, + "grad_norm": 1.510446310043335, + "learning_rate": 2.498949579831933e-05, + "loss": 1.534, + "step": 103500 + }, + { + "epoch": 0.28, + "grad_norm": 1.516194462776184, + "learning_rate": 2.4957983193277314e-05, + "loss": 1.5404, + "step": 104000 + }, + { + "epoch": 0.28, + "grad_norm": 1.452358365058899, + "learning_rate": 2.4926470588235295e-05, + "loss": 1.5349, + "step": 104500 + }, + { + "epoch": 0.28, + "grad_norm": 1.4550226926803589, + "learning_rate": 2.489495798319328e-05, + "loss": 1.5373, + "step": 105000 + }, + { + "epoch": 0.29, + "grad_norm": 1.4559545516967773, + "learning_rate": 2.4863445378151262e-05, + "loss": 1.5341, + "step": 105500 + }, + { + "epoch": 0.29, + "grad_norm": 1.4436681270599365, + "learning_rate": 2.4831932773109244e-05, + "loss": 1.5344, + "step": 106000 + }, + { + "epoch": 0.29, + "grad_norm": 1.4642813205718994, + "learning_rate": 2.4800420168067226e-05, + "loss": 1.5333, + "step": 106500 + }, + { + "epoch": 0.29, + "grad_norm": 1.4824906587600708, + "learning_rate": 2.476890756302521e-05, + "loss": 1.5291, + "step": 107000 + }, + { + "epoch": 0.29, + "grad_norm": 1.515098214149475, + "learning_rate": 2.4737394957983193e-05, + "loss": 1.5285, + "step": 107500 + }, + { + "epoch": 0.29, + "grad_norm": 2.073720693588257, + "learning_rate": 2.4705882352941174e-05, + "loss": 1.5348, + "step": 108000 + }, + { + "epoch": 0.29, + "grad_norm": 1.884777545928955, + "learning_rate": 2.467436974789916e-05, + "loss": 1.5321, + "step": 108500 + }, + { + "epoch": 0.3, + "grad_norm": 1.4791995286941528, + "learning_rate": 2.464285714285714e-05, + "loss": 1.5305, + "step": 109000 + }, + { + "epoch": 0.3, + "grad_norm": 1.4546101093292236, + "learning_rate": 2.4611344537815126e-05, + "loss": 1.5308, + "step": 109500 + }, + { + "epoch": 0.3, + "grad_norm": 1.421767234802246, + "learning_rate": 2.4579831932773108e-05, + "loss": 1.532, + "step": 110000 + }, + { + "epoch": 0.3, + "grad_norm": 1.476372480392456, + "learning_rate": 2.4548319327731093e-05, + "loss": 1.5303, + "step": 110500 + }, + { + "epoch": 0.3, + "grad_norm": 1.4746720790863037, + "learning_rate": 2.4516806722689075e-05, + "loss": 1.531, + "step": 111000 + }, + { + "epoch": 0.3, + "grad_norm": 1.486217975616455, + "learning_rate": 2.448529411764706e-05, + "loss": 1.5277, + "step": 111500 + }, + { + "epoch": 0.3, + "grad_norm": 1.4249714612960815, + "learning_rate": 2.4453781512605042e-05, + "loss": 1.525, + "step": 112000 + }, + { + "epoch": 0.3, + "grad_norm": 1.4237457513809204, + "learning_rate": 2.4422268907563027e-05, + "loss": 1.5263, + "step": 112500 + }, + { + "epoch": 0.31, + "grad_norm": 1.4878206253051758, + "learning_rate": 2.439075630252101e-05, + "loss": 1.5239, + "step": 113000 + }, + { + "epoch": 0.31, + "grad_norm": 1.4781346321105957, + "learning_rate": 2.4359243697478994e-05, + "loss": 1.528, + "step": 113500 + }, + { + "epoch": 0.31, + "grad_norm": 1.4943785667419434, + "learning_rate": 2.4327731092436976e-05, + "loss": 1.5231, + "step": 114000 + }, + { + "epoch": 0.31, + "grad_norm": 1.466009497642517, + "learning_rate": 2.4296218487394957e-05, + "loss": 1.5233, + "step": 114500 + }, + { + "epoch": 0.31, + "grad_norm": 2.4329051971435547, + "learning_rate": 2.4264705882352942e-05, + "loss": 1.5266, + "step": 115000 + }, + { + "epoch": 0.31, + "grad_norm": 1.477039098739624, + "learning_rate": 2.4233193277310924e-05, + "loss": 1.5278, + "step": 115500 + }, + { + "epoch": 0.31, + "grad_norm": 1.5693820714950562, + "learning_rate": 2.420168067226891e-05, + "loss": 1.5254, + "step": 116000 + }, + { + "epoch": 0.32, + "grad_norm": 1.4393528699874878, + "learning_rate": 2.417016806722689e-05, + "loss": 1.5236, + "step": 116500 + }, + { + "epoch": 0.32, + "grad_norm": 1.4845529794692993, + "learning_rate": 2.4138655462184876e-05, + "loss": 1.5206, + "step": 117000 + }, + { + "epoch": 0.32, + "grad_norm": 1.476683259010315, + "learning_rate": 2.4107142857142858e-05, + "loss": 1.5208, + "step": 117500 + }, + { + "epoch": 0.32, + "grad_norm": 1.428836703300476, + "learning_rate": 2.4075630252100843e-05, + "loss": 1.5234, + "step": 118000 + }, + { + "epoch": 0.32, + "grad_norm": 1.449540138244629, + "learning_rate": 2.4044117647058825e-05, + "loss": 1.5234, + "step": 118500 + }, + { + "epoch": 0.32, + "grad_norm": 1.4410090446472168, + "learning_rate": 2.401260504201681e-05, + "loss": 1.5203, + "step": 119000 + }, + { + "epoch": 0.32, + "grad_norm": 1.4714431762695312, + "learning_rate": 2.398109243697479e-05, + "loss": 1.5208, + "step": 119500 + }, + { + "epoch": 0.32, + "grad_norm": 1.469762921333313, + "learning_rate": 2.3949579831932777e-05, + "loss": 1.524, + "step": 120000 + }, + { + "epoch": 0.33, + "grad_norm": 1.5507971048355103, + "learning_rate": 2.391806722689076e-05, + "loss": 1.5224, + "step": 120500 + }, + { + "epoch": 0.33, + "grad_norm": 1.5093679428100586, + "learning_rate": 2.3886554621848737e-05, + "loss": 1.5235, + "step": 121000 + }, + { + "epoch": 0.33, + "grad_norm": 1.492244839668274, + "learning_rate": 2.3855042016806722e-05, + "loss": 1.5196, + "step": 121500 + }, + { + "epoch": 0.33, + "grad_norm": 1.4522676467895508, + "learning_rate": 2.3823529411764704e-05, + "loss": 1.5209, + "step": 122000 + }, + { + "epoch": 0.33, + "grad_norm": 1.527627944946289, + "learning_rate": 2.379201680672269e-05, + "loss": 1.5198, + "step": 122500 + }, + { + "epoch": 0.33, + "grad_norm": 1.488146424293518, + "learning_rate": 2.376050420168067e-05, + "loss": 1.5165, + "step": 123000 + }, + { + "epoch": 0.33, + "grad_norm": 1.4484755992889404, + "learning_rate": 2.3728991596638656e-05, + "loss": 1.5123, + "step": 123500 + }, + { + "epoch": 0.34, + "grad_norm": 1.5184931755065918, + "learning_rate": 2.3697478991596638e-05, + "loss": 1.5177, + "step": 124000 + }, + { + "epoch": 0.34, + "grad_norm": 1.4979966878890991, + "learning_rate": 2.3665966386554623e-05, + "loss": 1.5193, + "step": 124500 + }, + { + "epoch": 0.34, + "grad_norm": 1.4858919382095337, + "learning_rate": 2.3634453781512604e-05, + "loss": 1.5129, + "step": 125000 + }, + { + "epoch": 0.34, + "grad_norm": 1.6100457906723022, + "learning_rate": 2.360294117647059e-05, + "loss": 1.5153, + "step": 125500 + }, + { + "epoch": 0.34, + "grad_norm": 1.4573218822479248, + "learning_rate": 2.357142857142857e-05, + "loss": 1.5173, + "step": 126000 + }, + { + "epoch": 0.34, + "grad_norm": 1.4780622720718384, + "learning_rate": 2.3539915966386556e-05, + "loss": 1.5142, + "step": 126500 + }, + { + "epoch": 0.34, + "grad_norm": 1.4847768545150757, + "learning_rate": 2.3508403361344538e-05, + "loss": 1.5123, + "step": 127000 + }, + { + "epoch": 0.35, + "grad_norm": 1.789902925491333, + "learning_rate": 2.347689075630252e-05, + "loss": 1.5128, + "step": 127500 + }, + { + "epoch": 0.35, + "grad_norm": 1.4414323568344116, + "learning_rate": 2.3445378151260505e-05, + "loss": 1.5112, + "step": 128000 + }, + { + "epoch": 0.35, + "grad_norm": 1.542536735534668, + "learning_rate": 2.3413865546218487e-05, + "loss": 1.5132, + "step": 128500 + }, + { + "epoch": 0.35, + "grad_norm": 1.479336142539978, + "learning_rate": 2.3382352941176472e-05, + "loss": 1.5091, + "step": 129000 + }, + { + "epoch": 0.35, + "grad_norm": 1.5068061351776123, + "learning_rate": 2.3350840336134454e-05, + "loss": 1.5157, + "step": 129500 + }, + { + "epoch": 0.35, + "grad_norm": 1.5134038925170898, + "learning_rate": 2.331932773109244e-05, + "loss": 1.5145, + "step": 130000 + }, + { + "epoch": 0.35, + "grad_norm": 2.804521083831787, + "learning_rate": 2.328781512605042e-05, + "loss": 1.71, + "step": 130500 + }, + { + "epoch": 0.35, + "grad_norm": 9.153915405273438, + "learning_rate": 2.3256302521008406e-05, + "loss": 1.5874, + "step": 131000 + }, + { + "epoch": 0.36, + "grad_norm": 3.567737579345703, + "learning_rate": 2.3224789915966387e-05, + "loss": 1.5532, + "step": 131500 + }, + { + "epoch": 0.36, + "grad_norm": 1.5058925151824951, + "learning_rate": 2.3193277310924373e-05, + "loss": 1.5241, + "step": 132000 + }, + { + "epoch": 0.36, + "grad_norm": 1.48910653591156, + "learning_rate": 2.3161764705882354e-05, + "loss": 1.5197, + "step": 132500 + }, + { + "epoch": 0.36, + "grad_norm": 1.477921962738037, + "learning_rate": 2.313025210084034e-05, + "loss": 1.5191, + "step": 133000 + }, + { + "epoch": 0.36, + "grad_norm": 1.503013014793396, + "learning_rate": 2.309873949579832e-05, + "loss": 1.5112, + "step": 133500 + }, + { + "epoch": 0.36, + "grad_norm": 1.457146406173706, + "learning_rate": 2.3067226890756303e-05, + "loss": 1.5158, + "step": 134000 + }, + { + "epoch": 0.36, + "grad_norm": 1.8954756259918213, + "learning_rate": 2.3035714285714288e-05, + "loss": 1.5138, + "step": 134500 + }, + { + "epoch": 0.37, + "grad_norm": 1.5171183347702026, + "learning_rate": 2.300420168067227e-05, + "loss": 1.5201, + "step": 135000 + }, + { + "epoch": 0.37, + "grad_norm": 1.454849362373352, + "learning_rate": 2.2972689075630255e-05, + "loss": 1.5113, + "step": 135500 + }, + { + "epoch": 0.37, + "grad_norm": 2.3639023303985596, + "learning_rate": 2.2941176470588233e-05, + "loss": 1.5089, + "step": 136000 + }, + { + "epoch": 0.37, + "grad_norm": 1.4599758386611938, + "learning_rate": 2.290966386554622e-05, + "loss": 1.5099, + "step": 136500 + }, + { + "epoch": 0.37, + "grad_norm": 1.5151523351669312, + "learning_rate": 2.28781512605042e-05, + "loss": 1.5077, + "step": 137000 + }, + { + "epoch": 0.37, + "grad_norm": 1.518723726272583, + "learning_rate": 2.2846638655462185e-05, + "loss": 1.5097, + "step": 137500 + }, + { + "epoch": 0.37, + "grad_norm": 1.5430985689163208, + "learning_rate": 2.2815126050420167e-05, + "loss": 1.5089, + "step": 138000 + }, + { + "epoch": 0.37, + "grad_norm": 1.468233585357666, + "learning_rate": 2.2783613445378152e-05, + "loss": 1.5075, + "step": 138500 + }, + { + "epoch": 0.38, + "grad_norm": 1.540824294090271, + "learning_rate": 2.2752100840336134e-05, + "loss": 1.5095, + "step": 139000 + }, + { + "epoch": 0.38, + "grad_norm": 1.4792211055755615, + "learning_rate": 2.272058823529412e-05, + "loss": 1.5123, + "step": 139500 + }, + { + "epoch": 0.38, + "grad_norm": 1.4582479000091553, + "learning_rate": 2.26890756302521e-05, + "loss": 1.5041, + "step": 140000 + }, + { + "epoch": 0.38, + "grad_norm": 1.4484353065490723, + "learning_rate": 2.2657563025210083e-05, + "loss": 1.5098, + "step": 140500 + }, + { + "epoch": 0.38, + "grad_norm": 2.090087413787842, + "learning_rate": 2.2626050420168068e-05, + "loss": 1.504, + "step": 141000 + }, + { + "epoch": 0.38, + "grad_norm": 1.5165677070617676, + "learning_rate": 2.259453781512605e-05, + "loss": 1.5037, + "step": 141500 + }, + { + "epoch": 0.38, + "grad_norm": 1.4467180967330933, + "learning_rate": 2.2563025210084035e-05, + "loss": 1.5037, + "step": 142000 + }, + { + "epoch": 0.39, + "grad_norm": 1.53107750415802, + "learning_rate": 2.2531512605042016e-05, + "loss": 1.5048, + "step": 142500 + }, + { + "epoch": 0.39, + "grad_norm": 1.685832142829895, + "learning_rate": 2.25e-05, + "loss": 1.5051, + "step": 143000 + }, + { + "epoch": 0.39, + "grad_norm": 1.722901701927185, + "learning_rate": 2.2468487394957983e-05, + "loss": 1.5038, + "step": 143500 + }, + { + "epoch": 0.39, + "grad_norm": 1.5191560983657837, + "learning_rate": 2.2436974789915968e-05, + "loss": 1.5021, + "step": 144000 + }, + { + "epoch": 0.39, + "grad_norm": 1.6680717468261719, + "learning_rate": 2.240546218487395e-05, + "loss": 1.5019, + "step": 144500 + }, + { + "epoch": 0.39, + "grad_norm": 1.5664371252059937, + "learning_rate": 2.2373949579831935e-05, + "loss": 1.5028, + "step": 145000 + }, + { + "epoch": 0.39, + "grad_norm": 1.484131932258606, + "learning_rate": 2.2342436974789917e-05, + "loss": 1.5028, + "step": 145500 + }, + { + "epoch": 0.4, + "grad_norm": 1.4882657527923584, + "learning_rate": 2.2310924369747902e-05, + "loss": 1.4993, + "step": 146000 + }, + { + "epoch": 0.4, + "grad_norm": 1.4583569765090942, + "learning_rate": 2.2279411764705884e-05, + "loss": 1.5037, + "step": 146500 + }, + { + "epoch": 0.4, + "grad_norm": 1.559399127960205, + "learning_rate": 2.2247899159663866e-05, + "loss": 1.4994, + "step": 147000 + }, + { + "epoch": 0.4, + "grad_norm": 1.537287950515747, + "learning_rate": 2.221638655462185e-05, + "loss": 1.5008, + "step": 147500 + }, + { + "epoch": 0.4, + "grad_norm": 1.4840517044067383, + "learning_rate": 2.2184873949579832e-05, + "loss": 1.5003, + "step": 148000 + }, + { + "epoch": 0.4, + "grad_norm": 1.6292195320129395, + "learning_rate": 2.2153361344537818e-05, + "loss": 1.4975, + "step": 148500 + }, + { + "epoch": 0.4, + "grad_norm": 1.4870771169662476, + "learning_rate": 2.21218487394958e-05, + "loss": 1.4962, + "step": 149000 + }, + { + "epoch": 0.4, + "grad_norm": 1.4792907238006592, + "learning_rate": 2.2090336134453784e-05, + "loss": 1.4978, + "step": 149500 + }, + { + "epoch": 0.41, + "grad_norm": 1.4179558753967285, + "learning_rate": 2.2058823529411766e-05, + "loss": 1.5012, + "step": 150000 + }, + { + "epoch": 0.41, + "grad_norm": 1.4594039916992188, + "learning_rate": 2.2027310924369748e-05, + "loss": 1.4987, + "step": 150500 + }, + { + "epoch": 0.41, + "grad_norm": 1.5356736183166504, + "learning_rate": 2.199579831932773e-05, + "loss": 1.4975, + "step": 151000 + }, + { + "epoch": 0.41, + "grad_norm": 1.4961708784103394, + "learning_rate": 2.1964285714285715e-05, + "loss": 1.4966, + "step": 151500 + }, + { + "epoch": 0.41, + "grad_norm": 1.5061964988708496, + "learning_rate": 2.1932773109243697e-05, + "loss": 1.4952, + "step": 152000 + }, + { + "epoch": 0.41, + "grad_norm": 1.4668192863464355, + "learning_rate": 2.190126050420168e-05, + "loss": 1.4955, + "step": 152500 + }, + { + "epoch": 0.41, + "grad_norm": 1.520202398300171, + "learning_rate": 2.1869747899159663e-05, + "loss": 1.4987, + "step": 153000 + }, + { + "epoch": 0.42, + "grad_norm": 1.5048165321350098, + "learning_rate": 2.1838235294117645e-05, + "loss": 1.4943, + "step": 153500 + }, + { + "epoch": 0.42, + "grad_norm": 1.4194804430007935, + "learning_rate": 2.180672268907563e-05, + "loss": 1.4962, + "step": 154000 + }, + { + "epoch": 0.42, + "grad_norm": 1.4963053464889526, + "learning_rate": 2.1775210084033612e-05, + "loss": 1.4939, + "step": 154500 + }, + { + "epoch": 0.42, + "grad_norm": 1.5189534425735474, + "learning_rate": 2.1743697478991597e-05, + "loss": 1.4955, + "step": 155000 + }, + { + "epoch": 0.42, + "grad_norm": 1.844502329826355, + "learning_rate": 2.171218487394958e-05, + "loss": 1.4932, + "step": 155500 + }, + { + "epoch": 0.42, + "grad_norm": 1.6127697229385376, + "learning_rate": 2.1680672268907564e-05, + "loss": 1.4972, + "step": 156000 + }, + { + "epoch": 0.42, + "grad_norm": 2.39309024810791, + "learning_rate": 2.1649159663865546e-05, + "loss": 1.4961, + "step": 156500 + }, + { + "epoch": 0.43, + "grad_norm": 1.7886457443237305, + "learning_rate": 2.161764705882353e-05, + "loss": 1.4981, + "step": 157000 + }, + { + "epoch": 0.43, + "grad_norm": 1.5055351257324219, + "learning_rate": 2.1586134453781513e-05, + "loss": 1.4937, + "step": 157500 + }, + { + "epoch": 0.43, + "grad_norm": 2.2209436893463135, + "learning_rate": 2.1554621848739498e-05, + "loss": 1.4958, + "step": 158000 + }, + { + "epoch": 0.43, + "grad_norm": 1.4863665103912354, + "learning_rate": 2.152310924369748e-05, + "loss": 1.4937, + "step": 158500 + }, + { + "epoch": 0.43, + "grad_norm": 1.6290695667266846, + "learning_rate": 2.1491596638655465e-05, + "loss": 1.4934, + "step": 159000 + }, + { + "epoch": 0.43, + "grad_norm": 1.5069892406463623, + "learning_rate": 2.1460084033613446e-05, + "loss": 1.4966, + "step": 159500 + }, + { + "epoch": 0.43, + "grad_norm": 1.4480432271957397, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.4928, + "step": 160000 + }, + { + "epoch": 0.43, + "grad_norm": 1.4599815607070923, + "learning_rate": 2.1397058823529413e-05, + "loss": 1.4907, + "step": 160500 + }, + { + "epoch": 0.44, + "grad_norm": 1.5667592287063599, + "learning_rate": 2.1365546218487395e-05, + "loss": 1.4946, + "step": 161000 + }, + { + "epoch": 0.44, + "grad_norm": 1.591620683670044, + "learning_rate": 2.133403361344538e-05, + "loss": 1.4932, + "step": 161500 + }, + { + "epoch": 0.44, + "grad_norm": 1.4108275175094604, + "learning_rate": 2.1302521008403362e-05, + "loss": 1.4918, + "step": 162000 + }, + { + "epoch": 0.44, + "grad_norm": 1.3984153270721436, + "learning_rate": 2.1271008403361347e-05, + "loss": 1.4912, + "step": 162500 + }, + { + "epoch": 0.44, + "grad_norm": 1.5187551975250244, + "learning_rate": 2.123949579831933e-05, + "loss": 1.4896, + "step": 163000 + }, + { + "epoch": 0.44, + "grad_norm": 1.4671634435653687, + "learning_rate": 2.1207983193277314e-05, + "loss": 1.4909, + "step": 163500 + }, + { + "epoch": 0.44, + "grad_norm": 1.5398577451705933, + "learning_rate": 2.1176470588235296e-05, + "loss": 1.4898, + "step": 164000 + }, + { + "epoch": 0.45, + "grad_norm": 1.4390913248062134, + "learning_rate": 2.114495798319328e-05, + "loss": 1.4917, + "step": 164500 + }, + { + "epoch": 0.45, + "grad_norm": 1.466871976852417, + "learning_rate": 2.1113445378151263e-05, + "loss": 1.486, + "step": 165000 + }, + { + "epoch": 0.45, + "grad_norm": 1.4268947839736938, + "learning_rate": 2.1081932773109244e-05, + "loss": 1.486, + "step": 165500 + }, + { + "epoch": 0.45, + "grad_norm": 1.473212718963623, + "learning_rate": 2.1050420168067226e-05, + "loss": 1.4906, + "step": 166000 + }, + { + "epoch": 0.45, + "grad_norm": 1.4817694425582886, + "learning_rate": 2.1018907563025208e-05, + "loss": 1.4876, + "step": 166500 + }, + { + "epoch": 0.45, + "grad_norm": 1.4899072647094727, + "learning_rate": 2.0987394957983193e-05, + "loss": 1.4853, + "step": 167000 + }, + { + "epoch": 0.45, + "grad_norm": 1.472068428993225, + "learning_rate": 2.0955882352941175e-05, + "loss": 1.4859, + "step": 167500 + }, + { + "epoch": 0.45, + "grad_norm": 1.4609180688858032, + "learning_rate": 2.092436974789916e-05, + "loss": 1.4867, + "step": 168000 + }, + { + "epoch": 0.46, + "grad_norm": 1.3884390592575073, + "learning_rate": 2.089285714285714e-05, + "loss": 1.4845, + "step": 168500 + }, + { + "epoch": 0.46, + "grad_norm": 1.4505021572113037, + "learning_rate": 2.0861344537815127e-05, + "loss": 1.4804, + "step": 169000 + }, + { + "epoch": 0.46, + "grad_norm": 1.4579660892486572, + "learning_rate": 2.082983193277311e-05, + "loss": 1.4828, + "step": 169500 + }, + { + "epoch": 0.46, + "grad_norm": 1.4193936586380005, + "learning_rate": 2.0798319327731094e-05, + "loss": 1.4846, + "step": 170000 + }, + { + "epoch": 0.46, + "grad_norm": 1.8833608627319336, + "learning_rate": 2.0766806722689075e-05, + "loss": 1.4832, + "step": 170500 + }, + { + "epoch": 0.46, + "grad_norm": 1.394463062286377, + "learning_rate": 2.073529411764706e-05, + "loss": 1.4858, + "step": 171000 + }, + { + "epoch": 0.46, + "grad_norm": 1.4402869939804077, + "learning_rate": 2.0703781512605042e-05, + "loss": 1.4853, + "step": 171500 + }, + { + "epoch": 0.47, + "grad_norm": 1.5677118301391602, + "learning_rate": 2.0672268907563024e-05, + "loss": 1.4828, + "step": 172000 + }, + { + "epoch": 0.47, + "grad_norm": 1.412744402885437, + "learning_rate": 2.064075630252101e-05, + "loss": 1.4861, + "step": 172500 + }, + { + "epoch": 0.47, + "grad_norm": 1.578121542930603, + "learning_rate": 2.060924369747899e-05, + "loss": 1.4825, + "step": 173000 + }, + { + "epoch": 0.47, + "grad_norm": 1.4429398775100708, + "learning_rate": 2.0577731092436976e-05, + "loss": 1.4806, + "step": 173500 + }, + { + "epoch": 0.47, + "grad_norm": 1.5229464769363403, + "learning_rate": 2.0546218487394958e-05, + "loss": 1.4822, + "step": 174000 + }, + { + "epoch": 0.47, + "grad_norm": 1.533868670463562, + "learning_rate": 2.0514705882352943e-05, + "loss": 1.4788, + "step": 174500 + }, + { + "epoch": 0.47, + "grad_norm": 1.4442238807678223, + "learning_rate": 2.0483193277310925e-05, + "loss": 1.4845, + "step": 175000 + }, + { + "epoch": 0.48, + "grad_norm": 1.8768386840820312, + "learning_rate": 2.045168067226891e-05, + "loss": 1.481, + "step": 175500 + }, + { + "epoch": 0.48, + "grad_norm": 1.5719354152679443, + "learning_rate": 2.042016806722689e-05, + "loss": 1.4815, + "step": 176000 + }, + { + "epoch": 0.48, + "grad_norm": 1.6776522397994995, + "learning_rate": 2.0388655462184877e-05, + "loss": 1.4834, + "step": 176500 + }, + { + "epoch": 0.48, + "grad_norm": 1.462403416633606, + "learning_rate": 2.0357142857142858e-05, + "loss": 1.4829, + "step": 177000 + }, + { + "epoch": 0.48, + "grad_norm": 1.441434621810913, + "learning_rate": 2.0325630252100843e-05, + "loss": 1.4817, + "step": 177500 + }, + { + "epoch": 0.48, + "grad_norm": 1.7203949689865112, + "learning_rate": 2.0294117647058825e-05, + "loss": 1.4819, + "step": 178000 + }, + { + "epoch": 0.48, + "grad_norm": 1.6117925643920898, + "learning_rate": 2.0262605042016807e-05, + "loss": 1.48, + "step": 178500 + }, + { + "epoch": 0.48, + "grad_norm": 1.4840322732925415, + "learning_rate": 2.0231092436974792e-05, + "loss": 1.4804, + "step": 179000 + }, + { + "epoch": 0.49, + "grad_norm": 1.4823276996612549, + "learning_rate": 2.0199579831932774e-05, + "loss": 1.4783, + "step": 179500 + }, + { + "epoch": 0.49, + "grad_norm": 1.467035174369812, + "learning_rate": 2.016806722689076e-05, + "loss": 1.4826, + "step": 180000 + }, + { + "epoch": 0.49, + "grad_norm": 1.4519331455230713, + "learning_rate": 2.0136554621848737e-05, + "loss": 1.4793, + "step": 180500 + }, + { + "epoch": 0.49, + "grad_norm": 1.4830392599105835, + "learning_rate": 2.0105042016806722e-05, + "loss": 1.478, + "step": 181000 + }, + { + "epoch": 0.49, + "grad_norm": 1.4889652729034424, + "learning_rate": 2.0073529411764704e-05, + "loss": 1.4825, + "step": 181500 + }, + { + "epoch": 0.49, + "grad_norm": 1.4417020082473755, + "learning_rate": 2.004201680672269e-05, + "loss": 1.4781, + "step": 182000 + }, + { + "epoch": 0.49, + "grad_norm": 1.5612033605575562, + "learning_rate": 2.001050420168067e-05, + "loss": 1.4749, + "step": 182500 + }, + { + "epoch": 0.5, + "grad_norm": 1.923521637916565, + "learning_rate": 1.9978991596638656e-05, + "loss": 1.4742, + "step": 183000 + }, + { + "epoch": 0.5, + "grad_norm": 1.4759869575500488, + "learning_rate": 1.9947478991596638e-05, + "loss": 1.4772, + "step": 183500 + }, + { + "epoch": 0.5, + "grad_norm": 1.4529997110366821, + "learning_rate": 1.9915966386554623e-05, + "loss": 1.4758, + "step": 184000 + }, + { + "epoch": 0.5, + "grad_norm": 1.4907563924789429, + "learning_rate": 1.9884453781512605e-05, + "loss": 1.477, + "step": 184500 + }, + { + "epoch": 0.5, + "grad_norm": 1.4529681205749512, + "learning_rate": 1.9852941176470586e-05, + "loss": 1.4754, + "step": 185000 + }, + { + "epoch": 0.5, + "grad_norm": 1.4950664043426514, + "learning_rate": 1.982142857142857e-05, + "loss": 1.477, + "step": 185500 + }, + { + "epoch": 0.5, + "grad_norm": 1.5445144176483154, + "learning_rate": 1.9789915966386553e-05, + "loss": 1.4763, + "step": 186000 + }, + { + "epoch": 0.5, + "grad_norm": 2.2947561740875244, + "learning_rate": 1.975840336134454e-05, + "loss": 1.4771, + "step": 186500 + }, + { + "epoch": 0.51, + "grad_norm": 1.4762338399887085, + "learning_rate": 1.972689075630252e-05, + "loss": 1.4748, + "step": 187000 + }, + { + "epoch": 0.51, + "grad_norm": 1.5006557703018188, + "learning_rate": 1.9695378151260505e-05, + "loss": 1.474, + "step": 187500 + }, + { + "epoch": 0.51, + "grad_norm": 1.5126187801361084, + "learning_rate": 1.9663865546218487e-05, + "loss": 1.4769, + "step": 188000 + }, + { + "epoch": 0.51, + "grad_norm": 3.9213035106658936, + "learning_rate": 1.9632352941176472e-05, + "loss": 1.4724, + "step": 188500 + }, + { + "epoch": 0.51, + "grad_norm": 1.3832660913467407, + "learning_rate": 1.9600840336134454e-05, + "loss": 1.4743, + "step": 189000 + }, + { + "epoch": 0.51, + "grad_norm": 1.438021183013916, + "learning_rate": 1.956932773109244e-05, + "loss": 1.4732, + "step": 189500 + }, + { + "epoch": 0.51, + "grad_norm": 1.552357792854309, + "learning_rate": 1.953781512605042e-05, + "loss": 1.4693, + "step": 190000 + }, + { + "epoch": 0.52, + "grad_norm": 1.4992841482162476, + "learning_rate": 1.9506302521008406e-05, + "loss": 1.4741, + "step": 190500 + }, + { + "epoch": 0.52, + "grad_norm": 1.4546705484390259, + "learning_rate": 1.9474789915966388e-05, + "loss": 1.4709, + "step": 191000 + }, + { + "epoch": 0.52, + "grad_norm": 1.5536097288131714, + "learning_rate": 1.944327731092437e-05, + "loss": 1.4715, + "step": 191500 + }, + { + "epoch": 0.52, + "grad_norm": 1.4430129528045654, + "learning_rate": 1.9411764705882355e-05, + "loss": 1.4694, + "step": 192000 + }, + { + "epoch": 0.52, + "grad_norm": 1.4931637048721313, + "learning_rate": 1.9380252100840336e-05, + "loss": 1.4704, + "step": 192500 + }, + { + "epoch": 0.52, + "grad_norm": 1.4820243120193481, + "learning_rate": 1.934873949579832e-05, + "loss": 1.4707, + "step": 193000 + }, + { + "epoch": 0.52, + "grad_norm": 1.5232768058776855, + "learning_rate": 1.9317226890756303e-05, + "loss": 1.4692, + "step": 193500 + }, + { + "epoch": 0.53, + "grad_norm": 1.517333745956421, + "learning_rate": 1.928571428571429e-05, + "loss": 1.4731, + "step": 194000 + }, + { + "epoch": 0.53, + "grad_norm": 1.4523952007293701, + "learning_rate": 1.925420168067227e-05, + "loss": 1.4698, + "step": 194500 + }, + { + "epoch": 0.53, + "grad_norm": 1.4807761907577515, + "learning_rate": 1.9222689075630255e-05, + "loss": 1.4719, + "step": 195000 + }, + { + "epoch": 0.53, + "grad_norm": 1.4389820098876953, + "learning_rate": 1.9191176470588234e-05, + "loss": 1.4709, + "step": 195500 + }, + { + "epoch": 0.53, + "grad_norm": 3.7379424571990967, + "learning_rate": 1.915966386554622e-05, + "loss": 1.4663, + "step": 196000 + }, + { + "epoch": 0.53, + "grad_norm": 1.4896109104156494, + "learning_rate": 1.91281512605042e-05, + "loss": 1.4709, + "step": 196500 + }, + { + "epoch": 0.53, + "grad_norm": 5.979303359985352, + "learning_rate": 1.9096638655462186e-05, + "loss": 1.4743, + "step": 197000 + }, + { + "epoch": 0.53, + "grad_norm": 1.4648813009262085, + "learning_rate": 1.9065126050420167e-05, + "loss": 1.4687, + "step": 197500 + }, + { + "epoch": 0.54, + "grad_norm": 1.739353895187378, + "learning_rate": 1.903361344537815e-05, + "loss": 1.4702, + "step": 198000 + }, + { + "epoch": 0.54, + "grad_norm": 1.4263814687728882, + "learning_rate": 1.9002100840336134e-05, + "loss": 1.4695, + "step": 198500 + }, + { + "epoch": 0.54, + "grad_norm": 1.5090336799621582, + "learning_rate": 1.8970588235294116e-05, + "loss": 1.4667, + "step": 199000 + }, + { + "epoch": 0.54, + "grad_norm": 1.4606796503067017, + "learning_rate": 1.89390756302521e-05, + "loss": 1.4665, + "step": 199500 + }, + { + "epoch": 0.54, + "grad_norm": 1.4979524612426758, + "learning_rate": 1.8907563025210083e-05, + "loss": 1.4645, + "step": 200000 + }, + { + "epoch": 0.54, + "grad_norm": 1.5032795667648315, + "learning_rate": 1.8876050420168068e-05, + "loss": 1.4697, + "step": 200500 + }, + { + "epoch": 0.54, + "grad_norm": 1.4917629957199097, + "learning_rate": 1.884453781512605e-05, + "loss": 1.4654, + "step": 201000 + }, + { + "epoch": 0.55, + "grad_norm": 1.5047801733016968, + "learning_rate": 1.8813025210084035e-05, + "loss": 1.4665, + "step": 201500 + }, + { + "epoch": 0.55, + "grad_norm": 1.5550223588943481, + "learning_rate": 1.8781512605042017e-05, + "loss": 1.4669, + "step": 202000 + }, + { + "epoch": 0.55, + "grad_norm": 1.4432892799377441, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.4652, + "step": 202500 + }, + { + "epoch": 0.55, + "grad_norm": 1.4227643013000488, + "learning_rate": 1.8718487394957983e-05, + "loss": 1.465, + "step": 203000 + }, + { + "epoch": 0.55, + "grad_norm": 1.5878413915634155, + "learning_rate": 1.868697478991597e-05, + "loss": 1.4675, + "step": 203500 + }, + { + "epoch": 0.55, + "grad_norm": 1.5786782503128052, + "learning_rate": 1.865546218487395e-05, + "loss": 1.4596, + "step": 204000 + }, + { + "epoch": 0.55, + "grad_norm": 1.4224051237106323, + "learning_rate": 1.8623949579831932e-05, + "loss": 1.462, + "step": 204500 + }, + { + "epoch": 0.55, + "grad_norm": 1.7678115367889404, + "learning_rate": 1.8592436974789917e-05, + "loss": 1.4614, + "step": 205000 + }, + { + "epoch": 0.56, + "grad_norm": 1.4170020818710327, + "learning_rate": 1.85609243697479e-05, + "loss": 1.4649, + "step": 205500 + }, + { + "epoch": 0.56, + "grad_norm": 1.5474693775177002, + "learning_rate": 1.8529411764705884e-05, + "loss": 1.464, + "step": 206000 + }, + { + "epoch": 0.56, + "grad_norm": 1.4655749797821045, + "learning_rate": 1.8497899159663866e-05, + "loss": 1.4654, + "step": 206500 + }, + { + "epoch": 0.56, + "grad_norm": 1.6294610500335693, + "learning_rate": 1.846638655462185e-05, + "loss": 1.4616, + "step": 207000 + }, + { + "epoch": 0.56, + "grad_norm": 1.4760308265686035, + "learning_rate": 1.8434873949579833e-05, + "loss": 1.4643, + "step": 207500 + }, + { + "epoch": 0.56, + "grad_norm": 1.4796357154846191, + "learning_rate": 1.8403361344537818e-05, + "loss": 1.4659, + "step": 208000 + }, + { + "epoch": 0.56, + "grad_norm": 1.9592546224594116, + "learning_rate": 1.83718487394958e-05, + "loss": 1.4611, + "step": 208500 + }, + { + "epoch": 0.57, + "grad_norm": 1.493324637413025, + "learning_rate": 1.8340336134453785e-05, + "loss": 1.4626, + "step": 209000 + }, + { + "epoch": 0.57, + "grad_norm": 1.453369379043579, + "learning_rate": 1.8308823529411766e-05, + "loss": 1.4603, + "step": 209500 + }, + { + "epoch": 0.57, + "grad_norm": 1.5146046876907349, + "learning_rate": 1.8277310924369748e-05, + "loss": 1.4594, + "step": 210000 + }, + { + "epoch": 0.57, + "grad_norm": 1.424707293510437, + "learning_rate": 1.824579831932773e-05, + "loss": 1.4631, + "step": 210500 + }, + { + "epoch": 0.57, + "grad_norm": 1.464998722076416, + "learning_rate": 1.8214285714285712e-05, + "loss": 1.4617, + "step": 211000 + }, + { + "epoch": 0.57, + "grad_norm": 1.4314439296722412, + "learning_rate": 1.8182773109243697e-05, + "loss": 1.4611, + "step": 211500 + }, + { + "epoch": 0.57, + "grad_norm": 1.4533342123031616, + "learning_rate": 1.815126050420168e-05, + "loss": 1.4591, + "step": 212000 + }, + { + "epoch": 0.58, + "grad_norm": 1.5328502655029297, + "learning_rate": 1.8119747899159664e-05, + "loss": 1.4606, + "step": 212500 + }, + { + "epoch": 0.58, + "grad_norm": 1.4684851169586182, + "learning_rate": 1.8088235294117645e-05, + "loss": 1.463, + "step": 213000 + }, + { + "epoch": 0.58, + "grad_norm": 1.512421727180481, + "learning_rate": 1.805672268907563e-05, + "loss": 1.4585, + "step": 213500 + }, + { + "epoch": 0.58, + "grad_norm": 1.5069866180419922, + "learning_rate": 1.8025210084033612e-05, + "loss": 1.4565, + "step": 214000 + }, + { + "epoch": 0.58, + "grad_norm": 1.4224152565002441, + "learning_rate": 1.7993697478991597e-05, + "loss": 1.4575, + "step": 214500 + }, + { + "epoch": 0.58, + "grad_norm": 1.6329984664916992, + "learning_rate": 1.796218487394958e-05, + "loss": 1.4541, + "step": 215000 + }, + { + "epoch": 0.58, + "grad_norm": 1.587007761001587, + "learning_rate": 1.7930672268907564e-05, + "loss": 1.4572, + "step": 215500 + }, + { + "epoch": 0.58, + "grad_norm": 1.4805065393447876, + "learning_rate": 1.7899159663865546e-05, + "loss": 1.4618, + "step": 216000 + }, + { + "epoch": 0.59, + "grad_norm": 1.517993450164795, + "learning_rate": 1.786764705882353e-05, + "loss": 1.4538, + "step": 216500 + }, + { + "epoch": 0.59, + "grad_norm": 1.4399406909942627, + "learning_rate": 1.7836134453781513e-05, + "loss": 1.4576, + "step": 217000 + }, + { + "epoch": 0.59, + "grad_norm": 1.4458235502243042, + "learning_rate": 1.7804621848739495e-05, + "loss": 1.4558, + "step": 217500 + }, + { + "epoch": 0.59, + "grad_norm": 1.5840320587158203, + "learning_rate": 1.777310924369748e-05, + "loss": 1.4562, + "step": 218000 + }, + { + "epoch": 0.59, + "grad_norm": 1.4832299947738647, + "learning_rate": 1.774159663865546e-05, + "loss": 1.456, + "step": 218500 + }, + { + "epoch": 0.59, + "grad_norm": 1.4003788232803345, + "learning_rate": 1.7710084033613447e-05, + "loss": 1.4555, + "step": 219000 + }, + { + "epoch": 0.59, + "grad_norm": 1.5091036558151245, + "learning_rate": 1.767857142857143e-05, + "loss": 1.4596, + "step": 219500 + }, + { + "epoch": 0.6, + "grad_norm": 1.4758837223052979, + "learning_rate": 1.7647058823529414e-05, + "loss": 1.4566, + "step": 220000 + }, + { + "epoch": 0.6, + "grad_norm": 1.4372687339782715, + "learning_rate": 1.7615546218487395e-05, + "loss": 1.4524, + "step": 220500 + }, + { + "epoch": 0.6, + "grad_norm": 1.4391896724700928, + "learning_rate": 1.758403361344538e-05, + "loss": 1.4565, + "step": 221000 + }, + { + "epoch": 0.6, + "grad_norm": 1.4493831396102905, + "learning_rate": 1.7552521008403362e-05, + "loss": 1.4543, + "step": 221500 + }, + { + "epoch": 0.6, + "grad_norm": 2.0319833755493164, + "learning_rate": 1.7521008403361347e-05, + "loss": 1.4536, + "step": 222000 + }, + { + "epoch": 0.6, + "grad_norm": 1.4861342906951904, + "learning_rate": 1.748949579831933e-05, + "loss": 1.454, + "step": 222500 + }, + { + "epoch": 0.6, + "grad_norm": 1.4432348012924194, + "learning_rate": 1.7457983193277314e-05, + "loss": 1.4546, + "step": 223000 + }, + { + "epoch": 0.61, + "grad_norm": 1.4457755088806152, + "learning_rate": 1.7426470588235296e-05, + "loss": 1.4542, + "step": 223500 + }, + { + "epoch": 0.61, + "grad_norm": 1.4785292148590088, + "learning_rate": 1.7394957983193278e-05, + "loss": 1.4539, + "step": 224000 + }, + { + "epoch": 0.61, + "grad_norm": 1.4646965265274048, + "learning_rate": 1.7363445378151263e-05, + "loss": 1.4557, + "step": 224500 + }, + { + "epoch": 0.61, + "grad_norm": 1.3340420722961426, + "learning_rate": 1.733193277310924e-05, + "loss": 1.4512, + "step": 225000 + }, + { + "epoch": 0.61, + "grad_norm": 1.4864197969436646, + "learning_rate": 1.7300420168067226e-05, + "loss": 1.4514, + "step": 225500 + }, + { + "epoch": 0.61, + "grad_norm": 1.441954493522644, + "learning_rate": 1.7268907563025208e-05, + "loss": 1.4565, + "step": 226000 + }, + { + "epoch": 0.61, + "grad_norm": 1.4796494245529175, + "learning_rate": 1.7237394957983193e-05, + "loss": 1.4549, + "step": 226500 + }, + { + "epoch": 0.61, + "grad_norm": 1.5095195770263672, + "learning_rate": 1.7205882352941175e-05, + "loss": 1.4538, + "step": 227000 + }, + { + "epoch": 0.62, + "grad_norm": 1.6988993883132935, + "learning_rate": 1.717436974789916e-05, + "loss": 1.4552, + "step": 227500 + }, + { + "epoch": 0.62, + "grad_norm": 1.4422426223754883, + "learning_rate": 1.7142857142857142e-05, + "loss": 1.4514, + "step": 228000 + }, + { + "epoch": 0.62, + "grad_norm": 1.4488030672073364, + "learning_rate": 1.7111344537815127e-05, + "loss": 1.4545, + "step": 228500 + }, + { + "epoch": 0.62, + "grad_norm": 1.4784460067749023, + "learning_rate": 1.707983193277311e-05, + "loss": 1.4527, + "step": 229000 + }, + { + "epoch": 0.62, + "grad_norm": 1.4642586708068848, + "learning_rate": 1.7048319327731094e-05, + "loss": 1.4483, + "step": 229500 + }, + { + "epoch": 0.62, + "grad_norm": 1.509343147277832, + "learning_rate": 1.7016806722689076e-05, + "loss": 1.4543, + "step": 230000 + }, + { + "epoch": 0.62, + "grad_norm": 1.3862849473953247, + "learning_rate": 1.6985294117647057e-05, + "loss": 1.4531, + "step": 230500 + }, + { + "epoch": 0.63, + "grad_norm": 1.4223895072937012, + "learning_rate": 1.6953781512605042e-05, + "loss": 1.451, + "step": 231000 + }, + { + "epoch": 0.63, + "grad_norm": 1.4616318941116333, + "learning_rate": 1.6922268907563024e-05, + "loss": 1.4511, + "step": 231500 + }, + { + "epoch": 0.63, + "grad_norm": 1.4746378660202026, + "learning_rate": 1.689075630252101e-05, + "loss": 1.4497, + "step": 232000 + }, + { + "epoch": 0.63, + "grad_norm": 1.461519479751587, + "learning_rate": 1.685924369747899e-05, + "loss": 1.4516, + "step": 232500 + }, + { + "epoch": 0.63, + "grad_norm": 1.3925315141677856, + "learning_rate": 1.6827731092436976e-05, + "loss": 1.4507, + "step": 233000 + }, + { + "epoch": 0.63, + "grad_norm": 1.4032963514328003, + "learning_rate": 1.6796218487394958e-05, + "loss": 1.4497, + "step": 233500 + }, + { + "epoch": 0.63, + "grad_norm": 1.4162888526916504, + "learning_rate": 1.6764705882352943e-05, + "loss": 1.4482, + "step": 234000 + }, + { + "epoch": 0.63, + "grad_norm": 1.3672780990600586, + "learning_rate": 1.6733193277310925e-05, + "loss": 1.4518, + "step": 234500 + }, + { + "epoch": 0.64, + "grad_norm": 1.522310733795166, + "learning_rate": 1.670168067226891e-05, + "loss": 1.4516, + "step": 235000 + }, + { + "epoch": 0.64, + "grad_norm": 1.3994154930114746, + "learning_rate": 1.6670168067226892e-05, + "loss": 1.4468, + "step": 235500 + }, + { + "epoch": 0.64, + "grad_norm": 1.4941591024398804, + "learning_rate": 1.6638655462184877e-05, + "loss": 1.4491, + "step": 236000 + }, + { + "epoch": 0.64, + "grad_norm": 1.4521230459213257, + "learning_rate": 1.660714285714286e-05, + "loss": 1.4475, + "step": 236500 + }, + { + "epoch": 0.64, + "grad_norm": 1.528152585029602, + "learning_rate": 1.657563025210084e-05, + "loss": 1.4473, + "step": 237000 + }, + { + "epoch": 0.64, + "grad_norm": 1.4769060611724854, + "learning_rate": 1.6544117647058825e-05, + "loss": 1.4463, + "step": 237500 + }, + { + "epoch": 0.64, + "grad_norm": 1.4506659507751465, + "learning_rate": 1.6512605042016807e-05, + "loss": 1.4458, + "step": 238000 + }, + { + "epoch": 0.65, + "grad_norm": 1.491810917854309, + "learning_rate": 1.6481092436974792e-05, + "loss": 1.4498, + "step": 238500 + }, + { + "epoch": 0.65, + "grad_norm": 1.4600553512573242, + "learning_rate": 1.6449579831932774e-05, + "loss": 1.4444, + "step": 239000 + }, + { + "epoch": 0.65, + "grad_norm": 1.4451686143875122, + "learning_rate": 1.641806722689076e-05, + "loss": 1.4441, + "step": 239500 + }, + { + "epoch": 0.65, + "grad_norm": 1.4227120876312256, + "learning_rate": 1.6386554621848738e-05, + "loss": 1.4448, + "step": 240000 + }, + { + "epoch": 0.65, + "grad_norm": 1.5668320655822754, + "learning_rate": 1.6355042016806723e-05, + "loss": 1.4456, + "step": 240500 + }, + { + "epoch": 0.65, + "grad_norm": 1.3923659324645996, + "learning_rate": 1.6323529411764704e-05, + "loss": 1.4477, + "step": 241000 + }, + { + "epoch": 0.65, + "grad_norm": 1.4962598085403442, + "learning_rate": 1.629201680672269e-05, + "loss": 1.4454, + "step": 241500 + }, + { + "epoch": 0.66, + "grad_norm": 1.4878734350204468, + "learning_rate": 1.626050420168067e-05, + "loss": 1.4461, + "step": 242000 + }, + { + "epoch": 0.66, + "grad_norm": 1.4973180294036865, + "learning_rate": 1.6228991596638656e-05, + "loss": 1.4464, + "step": 242500 + }, + { + "epoch": 0.66, + "grad_norm": 1.4737753868103027, + "learning_rate": 1.6197478991596638e-05, + "loss": 1.444, + "step": 243000 + }, + { + "epoch": 0.66, + "grad_norm": 1.4609256982803345, + "learning_rate": 1.616596638655462e-05, + "loss": 1.4479, + "step": 243500 + }, + { + "epoch": 0.66, + "grad_norm": 1.4048258066177368, + "learning_rate": 1.6134453781512605e-05, + "loss": 1.4428, + "step": 244000 + }, + { + "epoch": 0.66, + "grad_norm": 1.399703025817871, + "learning_rate": 1.6102941176470587e-05, + "loss": 1.4433, + "step": 244500 + }, + { + "epoch": 0.66, + "grad_norm": 1.5445500612258911, + "learning_rate": 1.6071428571428572e-05, + "loss": 1.4455, + "step": 245000 + }, + { + "epoch": 0.66, + "grad_norm": 1.4742292165756226, + "learning_rate": 1.6039915966386554e-05, + "loss": 1.4428, + "step": 245500 + }, + { + "epoch": 0.67, + "grad_norm": 1.4535382986068726, + "learning_rate": 1.600840336134454e-05, + "loss": 1.4453, + "step": 246000 + }, + { + "epoch": 0.67, + "grad_norm": 1.467373013496399, + "learning_rate": 1.597689075630252e-05, + "loss": 1.4459, + "step": 246500 + }, + { + "epoch": 0.67, + "grad_norm": 1.4863603115081787, + "learning_rate": 1.5945378151260506e-05, + "loss": 1.4444, + "step": 247000 + }, + { + "epoch": 0.67, + "grad_norm": 1.5373426675796509, + "learning_rate": 1.5913865546218487e-05, + "loss": 1.4418, + "step": 247500 + }, + { + "epoch": 0.67, + "grad_norm": 1.4747397899627686, + "learning_rate": 1.5882352941176473e-05, + "loss": 1.4423, + "step": 248000 + }, + { + "epoch": 0.67, + "grad_norm": 1.5024008750915527, + "learning_rate": 1.5850840336134454e-05, + "loss": 1.4466, + "step": 248500 + }, + { + "epoch": 0.67, + "grad_norm": 1.481330394744873, + "learning_rate": 1.581932773109244e-05, + "loss": 1.4395, + "step": 249000 + }, + { + "epoch": 0.68, + "grad_norm": 1.419636607170105, + "learning_rate": 1.578781512605042e-05, + "loss": 1.4416, + "step": 249500 + }, + { + "epoch": 0.68, + "grad_norm": 1.4620583057403564, + "learning_rate": 1.5756302521008403e-05, + "loss": 1.447, + "step": 250000 + }, + { + "epoch": 0.68, + "grad_norm": 1.4666600227355957, + "learning_rate": 1.5724789915966388e-05, + "loss": 1.4378, + "step": 250500 + }, + { + "epoch": 0.68, + "grad_norm": 1.4554154872894287, + "learning_rate": 1.569327731092437e-05, + "loss": 1.4439, + "step": 251000 + }, + { + "epoch": 0.68, + "grad_norm": 1.4908123016357422, + "learning_rate": 1.5661764705882355e-05, + "loss": 1.4427, + "step": 251500 + }, + { + "epoch": 0.68, + "grad_norm": 1.471479892730713, + "learning_rate": 1.5630252100840337e-05, + "loss": 1.4433, + "step": 252000 + }, + { + "epoch": 0.68, + "grad_norm": 1.4541757106781006, + "learning_rate": 1.5598739495798322e-05, + "loss": 1.4438, + "step": 252500 + }, + { + "epoch": 0.68, + "grad_norm": 1.7064818143844604, + "learning_rate": 1.5567226890756304e-05, + "loss": 1.4409, + "step": 253000 + }, + { + "epoch": 0.69, + "grad_norm": 1.5056750774383545, + "learning_rate": 1.553571428571429e-05, + "loss": 1.4405, + "step": 253500 + }, + { + "epoch": 0.69, + "grad_norm": 1.4601994752883911, + "learning_rate": 1.550420168067227e-05, + "loss": 1.4407, + "step": 254000 + }, + { + "epoch": 0.69, + "grad_norm": 1.4508180618286133, + "learning_rate": 1.5472689075630256e-05, + "loss": 1.4471, + "step": 254500 + }, + { + "epoch": 0.69, + "grad_norm": 1.476529598236084, + "learning_rate": 1.5441176470588234e-05, + "loss": 1.4416, + "step": 255000 + }, + { + "epoch": 0.69, + "grad_norm": 1.5242764949798584, + "learning_rate": 1.540966386554622e-05, + "loss": 1.4406, + "step": 255500 + }, + { + "epoch": 0.69, + "grad_norm": 1.405678153038025, + "learning_rate": 1.53781512605042e-05, + "loss": 1.4399, + "step": 256000 + }, + { + "epoch": 0.69, + "grad_norm": 1.4689253568649292, + "learning_rate": 1.5346638655462183e-05, + "loss": 1.4409, + "step": 256500 + }, + { + "epoch": 0.7, + "grad_norm": 1.5302820205688477, + "learning_rate": 1.5315126050420168e-05, + "loss": 1.4435, + "step": 257000 + }, + { + "epoch": 0.7, + "grad_norm": 1.4745590686798096, + "learning_rate": 1.528361344537815e-05, + "loss": 1.4411, + "step": 257500 + }, + { + "epoch": 0.7, + "grad_norm": 1.5703048706054688, + "learning_rate": 1.5252100840336135e-05, + "loss": 1.4372, + "step": 258000 + }, + { + "epoch": 0.7, + "grad_norm": 1.4982346296310425, + "learning_rate": 1.5220588235294118e-05, + "loss": 1.4342, + "step": 258500 + }, + { + "epoch": 0.7, + "grad_norm": 1.4562139511108398, + "learning_rate": 1.51890756302521e-05, + "loss": 1.4403, + "step": 259000 + }, + { + "epoch": 0.7, + "grad_norm": 1.5004678964614868, + "learning_rate": 1.5157563025210083e-05, + "loss": 1.4405, + "step": 259500 + }, + { + "epoch": 0.7, + "grad_norm": 1.4451349973678589, + "learning_rate": 1.5126050420168067e-05, + "loss": 1.436, + "step": 260000 + }, + { + "epoch": 0.71, + "grad_norm": 1.420857548713684, + "learning_rate": 1.509453781512605e-05, + "loss": 1.4402, + "step": 260500 + }, + { + "epoch": 0.71, + "grad_norm": 1.4772206544876099, + "learning_rate": 1.5063025210084034e-05, + "loss": 1.4373, + "step": 261000 + }, + { + "epoch": 0.71, + "grad_norm": 1.4933620691299438, + "learning_rate": 1.5031512605042017e-05, + "loss": 1.4392, + "step": 261500 + }, + { + "epoch": 0.71, + "grad_norm": 1.5023765563964844, + "learning_rate": 1.5e-05, + "loss": 1.438, + "step": 262000 + }, + { + "epoch": 0.71, + "grad_norm": 1.4560567140579224, + "learning_rate": 1.4968487394957984e-05, + "loss": 1.439, + "step": 262500 + }, + { + "epoch": 0.71, + "grad_norm": 1.5497692823410034, + "learning_rate": 1.4936974789915967e-05, + "loss": 1.4347, + "step": 263000 + }, + { + "epoch": 0.71, + "grad_norm": 1.5201669931411743, + "learning_rate": 1.490546218487395e-05, + "loss": 1.4365, + "step": 263500 + }, + { + "epoch": 0.71, + "grad_norm": 1.4907211065292358, + "learning_rate": 1.4873949579831934e-05, + "loss": 1.4334, + "step": 264000 + }, + { + "epoch": 0.72, + "grad_norm": 1.4821357727050781, + "learning_rate": 1.4842436974789918e-05, + "loss": 1.4361, + "step": 264500 + }, + { + "epoch": 0.72, + "grad_norm": 1.4968074560165405, + "learning_rate": 1.4810924369747901e-05, + "loss": 1.4352, + "step": 265000 + }, + { + "epoch": 0.72, + "grad_norm": 1.475728154182434, + "learning_rate": 1.4779411764705883e-05, + "loss": 1.4365, + "step": 265500 + }, + { + "epoch": 0.72, + "grad_norm": 1.560935378074646, + "learning_rate": 1.4747899159663864e-05, + "loss": 1.4381, + "step": 266000 + }, + { + "epoch": 0.72, + "grad_norm": 1.4216580390930176, + "learning_rate": 1.4716386554621848e-05, + "loss": 1.4322, + "step": 266500 + }, + { + "epoch": 0.72, + "grad_norm": 1.499648094177246, + "learning_rate": 1.4684873949579831e-05, + "loss": 1.4378, + "step": 267000 + }, + { + "epoch": 0.72, + "grad_norm": 1.4971799850463867, + "learning_rate": 1.4653361344537815e-05, + "loss": 1.4334, + "step": 267500 + }, + { + "epoch": 0.73, + "grad_norm": 1.5106513500213623, + "learning_rate": 1.4621848739495798e-05, + "loss": 1.4347, + "step": 268000 + }, + { + "epoch": 0.73, + "grad_norm": 1.488006353378296, + "learning_rate": 1.4590336134453782e-05, + "loss": 1.4361, + "step": 268500 + }, + { + "epoch": 0.73, + "grad_norm": 1.484994888305664, + "learning_rate": 1.4558823529411765e-05, + "loss": 1.4389, + "step": 269000 + }, + { + "epoch": 0.73, + "grad_norm": 1.4334303140640259, + "learning_rate": 1.4527310924369749e-05, + "loss": 1.4366, + "step": 269500 + }, + { + "epoch": 0.73, + "grad_norm": 1.4980212450027466, + "learning_rate": 1.4495798319327732e-05, + "loss": 1.4335, + "step": 270000 + }, + { + "epoch": 0.73, + "grad_norm": 1.4758628606796265, + "learning_rate": 1.4464285714285715e-05, + "loss": 1.4367, + "step": 270500 + }, + { + "epoch": 0.73, + "grad_norm": 1.4914411306381226, + "learning_rate": 1.4432773109243699e-05, + "loss": 1.4373, + "step": 271000 + }, + { + "epoch": 0.73, + "grad_norm": 1.5274006128311157, + "learning_rate": 1.4401260504201682e-05, + "loss": 1.4364, + "step": 271500 + }, + { + "epoch": 0.74, + "grad_norm": 1.4571418762207031, + "learning_rate": 1.4369747899159664e-05, + "loss": 1.4354, + "step": 272000 + }, + { + "epoch": 0.74, + "grad_norm": 1.5726255178451538, + "learning_rate": 1.4338235294117647e-05, + "loss": 1.4338, + "step": 272500 + }, + { + "epoch": 0.74, + "grad_norm": 1.5626286268234253, + "learning_rate": 1.4306722689075631e-05, + "loss": 1.4345, + "step": 273000 + }, + { + "epoch": 0.74, + "grad_norm": 1.4581658840179443, + "learning_rate": 1.4275210084033613e-05, + "loss": 1.4339, + "step": 273500 + }, + { + "epoch": 0.74, + "grad_norm": 1.4836556911468506, + "learning_rate": 1.4243697478991596e-05, + "loss": 1.4331, + "step": 274000 + }, + { + "epoch": 0.74, + "grad_norm": 1.4955805540084839, + "learning_rate": 1.421218487394958e-05, + "loss": 1.434, + "step": 274500 + }, + { + "epoch": 0.74, + "grad_norm": 1.5095798969268799, + "learning_rate": 1.4180672268907563e-05, + "loss": 1.4335, + "step": 275000 + }, + { + "epoch": 0.75, + "grad_norm": 1.517565131187439, + "learning_rate": 1.4149159663865546e-05, + "loss": 1.4339, + "step": 275500 + }, + { + "epoch": 0.75, + "grad_norm": 1.5089333057403564, + "learning_rate": 1.411764705882353e-05, + "loss": 1.4303, + "step": 276000 + }, + { + "epoch": 0.75, + "grad_norm": 1.490110993385315, + "learning_rate": 1.4086134453781513e-05, + "loss": 1.4378, + "step": 276500 + }, + { + "epoch": 0.75, + "grad_norm": 1.4934676885604858, + "learning_rate": 1.4054621848739497e-05, + "loss": 1.4309, + "step": 277000 + }, + { + "epoch": 0.75, + "grad_norm": 1.453904628753662, + "learning_rate": 1.402310924369748e-05, + "loss": 1.4345, + "step": 277500 + }, + { + "epoch": 0.75, + "grad_norm": 1.4364333152770996, + "learning_rate": 1.3991596638655464e-05, + "loss": 1.4347, + "step": 278000 + }, + { + "epoch": 0.75, + "grad_norm": 1.5105829238891602, + "learning_rate": 1.3960084033613445e-05, + "loss": 1.4373, + "step": 278500 + }, + { + "epoch": 0.76, + "grad_norm": 1.5879383087158203, + "learning_rate": 1.3928571428571429e-05, + "loss": 1.4337, + "step": 279000 + }, + { + "epoch": 0.76, + "grad_norm": 1.4907859563827515, + "learning_rate": 1.3897058823529412e-05, + "loss": 1.4378, + "step": 279500 + }, + { + "epoch": 0.76, + "grad_norm": 1.4965413808822632, + "learning_rate": 1.3865546218487396e-05, + "loss": 1.4332, + "step": 280000 + }, + { + "epoch": 0.76, + "grad_norm": 1.4512360095977783, + "learning_rate": 1.3834033613445379e-05, + "loss": 1.4293, + "step": 280500 + }, + { + "epoch": 0.76, + "grad_norm": 1.5323312282562256, + "learning_rate": 1.3802521008403361e-05, + "loss": 1.4348, + "step": 281000 + }, + { + "epoch": 0.76, + "grad_norm": 1.515937089920044, + "learning_rate": 1.3771008403361344e-05, + "loss": 1.435, + "step": 281500 + }, + { + "epoch": 0.76, + "grad_norm": 1.5589243173599243, + "learning_rate": 1.3739495798319328e-05, + "loss": 1.4276, + "step": 282000 + }, + { + "epoch": 0.76, + "grad_norm": 1.4904866218566895, + "learning_rate": 1.3707983193277311e-05, + "loss": 1.4317, + "step": 282500 + }, + { + "epoch": 0.77, + "grad_norm": 1.4851187467575073, + "learning_rate": 1.3676470588235295e-05, + "loss": 1.4297, + "step": 283000 + }, + { + "epoch": 0.77, + "grad_norm": 1.3728834390640259, + "learning_rate": 1.3644957983193278e-05, + "loss": 1.4322, + "step": 283500 + }, + { + "epoch": 0.77, + "grad_norm": 1.738533854484558, + "learning_rate": 1.3613445378151261e-05, + "loss": 1.4293, + "step": 284000 + }, + { + "epoch": 0.77, + "grad_norm": 1.5092045068740845, + "learning_rate": 1.3581932773109245e-05, + "loss": 1.4292, + "step": 284500 + }, + { + "epoch": 0.77, + "grad_norm": 1.5049362182617188, + "learning_rate": 1.3550420168067227e-05, + "loss": 1.4286, + "step": 285000 + }, + { + "epoch": 0.77, + "grad_norm": 1.4427067041397095, + "learning_rate": 1.351890756302521e-05, + "loss": 1.4279, + "step": 285500 + }, + { + "epoch": 0.77, + "grad_norm": 1.4460445642471313, + "learning_rate": 1.3487394957983194e-05, + "loss": 1.4301, + "step": 286000 + }, + { + "epoch": 0.78, + "grad_norm": 1.5012342929840088, + "learning_rate": 1.3455882352941177e-05, + "loss": 1.4287, + "step": 286500 + }, + { + "epoch": 0.78, + "grad_norm": 1.4399917125701904, + "learning_rate": 1.342436974789916e-05, + "loss": 1.4308, + "step": 287000 + }, + { + "epoch": 0.78, + "grad_norm": 1.4089640378952026, + "learning_rate": 1.3392857142857144e-05, + "loss": 1.4264, + "step": 287500 + }, + { + "epoch": 0.78, + "grad_norm": 1.5012991428375244, + "learning_rate": 1.3361344537815127e-05, + "loss": 1.4296, + "step": 288000 + }, + { + "epoch": 0.78, + "grad_norm": 1.4144240617752075, + "learning_rate": 1.3329831932773109e-05, + "loss": 1.4259, + "step": 288500 + }, + { + "epoch": 0.78, + "grad_norm": 1.4895191192626953, + "learning_rate": 1.3298319327731092e-05, + "loss": 1.4312, + "step": 289000 + }, + { + "epoch": 0.78, + "grad_norm": 1.5855236053466797, + "learning_rate": 1.3266806722689076e-05, + "loss": 1.4275, + "step": 289500 + }, + { + "epoch": 0.79, + "grad_norm": 1.4119740724563599, + "learning_rate": 1.323529411764706e-05, + "loss": 1.428, + "step": 290000 + }, + { + "epoch": 0.79, + "grad_norm": 1.5101768970489502, + "learning_rate": 1.3203781512605043e-05, + "loss": 1.4289, + "step": 290500 + }, + { + "epoch": 0.79, + "grad_norm": 1.4803494215011597, + "learning_rate": 1.3172268907563025e-05, + "loss": 1.4273, + "step": 291000 + }, + { + "epoch": 0.79, + "grad_norm": 1.5688806772232056, + "learning_rate": 1.3140756302521008e-05, + "loss": 1.4276, + "step": 291500 + }, + { + "epoch": 0.79, + "grad_norm": 2.2357559204101562, + "learning_rate": 1.3109243697478991e-05, + "loss": 1.4294, + "step": 292000 + }, + { + "epoch": 0.79, + "grad_norm": 1.4668666124343872, + "learning_rate": 1.3077731092436975e-05, + "loss": 1.4293, + "step": 292500 + }, + { + "epoch": 0.79, + "grad_norm": 1.46941339969635, + "learning_rate": 1.3046218487394958e-05, + "loss": 1.4321, + "step": 293000 + }, + { + "epoch": 0.79, + "grad_norm": 1.633657455444336, + "learning_rate": 1.3014705882352942e-05, + "loss": 1.4272, + "step": 293500 + }, + { + "epoch": 0.8, + "grad_norm": 1.6233292818069458, + "learning_rate": 1.2983193277310925e-05, + "loss": 1.4268, + "step": 294000 + }, + { + "epoch": 0.8, + "grad_norm": 1.4441863298416138, + "learning_rate": 1.2951680672268909e-05, + "loss": 1.4262, + "step": 294500 + }, + { + "epoch": 0.8, + "grad_norm": 1.5020571947097778, + "learning_rate": 1.2920168067226892e-05, + "loss": 1.4247, + "step": 295000 + }, + { + "epoch": 0.8, + "grad_norm": 1.476090669631958, + "learning_rate": 1.2888655462184874e-05, + "loss": 1.426, + "step": 295500 + }, + { + "epoch": 0.8, + "grad_norm": 1.4784507751464844, + "learning_rate": 1.2857142857142857e-05, + "loss": 1.4262, + "step": 296000 + }, + { + "epoch": 0.8, + "grad_norm": 1.4484635591506958, + "learning_rate": 1.282563025210084e-05, + "loss": 1.426, + "step": 296500 + }, + { + "epoch": 0.8, + "grad_norm": 1.5106843709945679, + "learning_rate": 1.2794117647058824e-05, + "loss": 1.4282, + "step": 297000 + }, + { + "epoch": 0.81, + "grad_norm": 1.401078701019287, + "learning_rate": 1.2762605042016806e-05, + "loss": 1.4229, + "step": 297500 + }, + { + "epoch": 0.81, + "grad_norm": 1.4721170663833618, + "learning_rate": 1.273109243697479e-05, + "loss": 1.4281, + "step": 298000 + }, + { + "epoch": 0.81, + "grad_norm": 1.5121667385101318, + "learning_rate": 1.2699579831932773e-05, + "loss": 1.4272, + "step": 298500 + }, + { + "epoch": 0.81, + "grad_norm": 1.4307163953781128, + "learning_rate": 1.2668067226890756e-05, + "loss": 1.4269, + "step": 299000 + }, + { + "epoch": 0.81, + "grad_norm": 1.520992398262024, + "learning_rate": 1.263655462184874e-05, + "loss": 1.426, + "step": 299500 + }, + { + "epoch": 0.81, + "grad_norm": 1.4671803712844849, + "learning_rate": 1.2605042016806723e-05, + "loss": 1.4207, + "step": 300000 + }, + { + "epoch": 0.81, + "grad_norm": 1.4773739576339722, + "learning_rate": 1.2573529411764706e-05, + "loss": 1.4248, + "step": 300500 + }, + { + "epoch": 0.81, + "grad_norm": 1.4782676696777344, + "learning_rate": 1.254201680672269e-05, + "loss": 1.4265, + "step": 301000 + }, + { + "epoch": 0.82, + "grad_norm": 1.5411614179611206, + "learning_rate": 1.2510504201680673e-05, + "loss": 1.4223, + "step": 301500 + }, + { + "epoch": 0.82, + "grad_norm": 1.4932873249053955, + "learning_rate": 1.2478991596638657e-05, + "loss": 1.4252, + "step": 302000 + }, + { + "epoch": 0.82, + "grad_norm": 1.451866626739502, + "learning_rate": 1.244747899159664e-05, + "loss": 1.4234, + "step": 302500 + }, + { + "epoch": 0.82, + "grad_norm": 1.4181545972824097, + "learning_rate": 1.2415966386554622e-05, + "loss": 1.4249, + "step": 303000 + }, + { + "epoch": 0.82, + "grad_norm": 1.460598349571228, + "learning_rate": 1.2384453781512605e-05, + "loss": 1.4237, + "step": 303500 + }, + { + "epoch": 0.82, + "grad_norm": 1.4560647010803223, + "learning_rate": 1.2352941176470587e-05, + "loss": 1.4199, + "step": 304000 + }, + { + "epoch": 0.82, + "grad_norm": 1.4535589218139648, + "learning_rate": 1.232142857142857e-05, + "loss": 1.4248, + "step": 304500 + }, + { + "epoch": 0.83, + "grad_norm": 1.4643712043762207, + "learning_rate": 1.2289915966386554e-05, + "loss": 1.4257, + "step": 305000 + }, + { + "epoch": 0.83, + "grad_norm": 1.5106630325317383, + "learning_rate": 1.2258403361344537e-05, + "loss": 1.4248, + "step": 305500 + }, + { + "epoch": 0.83, + "grad_norm": 1.489579439163208, + "learning_rate": 1.2226890756302521e-05, + "loss": 1.4215, + "step": 306000 + }, + { + "epoch": 0.83, + "grad_norm": 1.4746323823928833, + "learning_rate": 1.2195378151260504e-05, + "loss": 1.4202, + "step": 306500 + }, + { + "epoch": 0.83, + "grad_norm": 1.4702941179275513, + "learning_rate": 1.2163865546218488e-05, + "loss": 1.4214, + "step": 307000 + }, + { + "epoch": 0.83, + "grad_norm": 1.5852062702178955, + "learning_rate": 1.2132352941176471e-05, + "loss": 1.4229, + "step": 307500 + }, + { + "epoch": 0.83, + "grad_norm": 1.5045883655548096, + "learning_rate": 1.2100840336134455e-05, + "loss": 1.4245, + "step": 308000 + }, + { + "epoch": 0.84, + "grad_norm": 1.4635881185531616, + "learning_rate": 1.2069327731092438e-05, + "loss": 1.425, + "step": 308500 + }, + { + "epoch": 0.84, + "grad_norm": 1.4574062824249268, + "learning_rate": 1.2037815126050422e-05, + "loss": 1.4241, + "step": 309000 + }, + { + "epoch": 0.84, + "grad_norm": 1.4566025733947754, + "learning_rate": 1.2006302521008405e-05, + "loss": 1.4204, + "step": 309500 + }, + { + "epoch": 0.84, + "grad_norm": 1.525225281715393, + "learning_rate": 1.1974789915966388e-05, + "loss": 1.4218, + "step": 310000 + }, + { + "epoch": 0.84, + "grad_norm": 1.4726413488388062, + "learning_rate": 1.1943277310924368e-05, + "loss": 1.422, + "step": 310500 + }, + { + "epoch": 0.84, + "grad_norm": 1.4462370872497559, + "learning_rate": 1.1911764705882352e-05, + "loss": 1.4174, + "step": 311000 + }, + { + "epoch": 0.84, + "grad_norm": 1.4930446147918701, + "learning_rate": 1.1880252100840335e-05, + "loss": 1.4168, + "step": 311500 + }, + { + "epoch": 0.84, + "grad_norm": 2.050973892211914, + "learning_rate": 1.1848739495798319e-05, + "loss": 1.4205, + "step": 312000 + }, + { + "epoch": 0.85, + "grad_norm": 1.514642596244812, + "learning_rate": 1.1817226890756302e-05, + "loss": 1.42, + "step": 312500 + }, + { + "epoch": 0.85, + "grad_norm": 1.4417085647583008, + "learning_rate": 1.1785714285714286e-05, + "loss": 1.42, + "step": 313000 + }, + { + "epoch": 0.85, + "grad_norm": 1.473029375076294, + "learning_rate": 1.1754201680672269e-05, + "loss": 1.4228, + "step": 313500 + }, + { + "epoch": 0.85, + "grad_norm": 1.573533296585083, + "learning_rate": 1.1722689075630253e-05, + "loss": 1.4193, + "step": 314000 + }, + { + "epoch": 0.85, + "grad_norm": 1.5040185451507568, + "learning_rate": 1.1691176470588236e-05, + "loss": 1.4209, + "step": 314500 + }, + { + "epoch": 0.85, + "grad_norm": 1.472280740737915, + "learning_rate": 1.165966386554622e-05, + "loss": 1.4203, + "step": 315000 + }, + { + "epoch": 0.85, + "grad_norm": 1.4371939897537231, + "learning_rate": 1.1628151260504203e-05, + "loss": 1.4197, + "step": 315500 + }, + { + "epoch": 0.86, + "grad_norm": 1.74043607711792, + "learning_rate": 1.1596638655462186e-05, + "loss": 1.4189, + "step": 316000 + }, + { + "epoch": 0.86, + "grad_norm": 1.5340248346328735, + "learning_rate": 1.156512605042017e-05, + "loss": 1.4178, + "step": 316500 + }, + { + "epoch": 0.86, + "grad_norm": 1.4650968313217163, + "learning_rate": 1.1533613445378151e-05, + "loss": 1.4157, + "step": 317000 + }, + { + "epoch": 0.86, + "grad_norm": 1.6052621603012085, + "learning_rate": 1.1502100840336135e-05, + "loss": 1.4221, + "step": 317500 + }, + { + "epoch": 0.86, + "grad_norm": 1.4934183359146118, + "learning_rate": 1.1470588235294117e-05, + "loss": 1.4219, + "step": 318000 + }, + { + "epoch": 0.86, + "grad_norm": 1.6604057550430298, + "learning_rate": 1.14390756302521e-05, + "loss": 1.4165, + "step": 318500 + }, + { + "epoch": 0.86, + "grad_norm": 1.448686957359314, + "learning_rate": 1.1407563025210084e-05, + "loss": 1.4167, + "step": 319000 + }, + { + "epoch": 0.86, + "grad_norm": 1.4600298404693604, + "learning_rate": 1.1376050420168067e-05, + "loss": 1.4196, + "step": 319500 + }, + { + "epoch": 0.87, + "grad_norm": 1.4856675863265991, + "learning_rate": 1.134453781512605e-05, + "loss": 1.4188, + "step": 320000 + }, + { + "epoch": 0.87, + "grad_norm": 1.5987657308578491, + "learning_rate": 1.1313025210084034e-05, + "loss": 1.4176, + "step": 320500 + }, + { + "epoch": 0.87, + "grad_norm": 1.4707138538360596, + "learning_rate": 1.1281512605042017e-05, + "loss": 1.4177, + "step": 321000 + }, + { + "epoch": 0.87, + "grad_norm": 1.4592325687408447, + "learning_rate": 1.125e-05, + "loss": 1.419, + "step": 321500 + }, + { + "epoch": 0.87, + "grad_norm": 1.477171540260315, + "learning_rate": 1.1218487394957984e-05, + "loss": 1.4118, + "step": 322000 + }, + { + "epoch": 0.87, + "grad_norm": 1.5284925699234009, + "learning_rate": 1.1186974789915968e-05, + "loss": 1.418, + "step": 322500 + }, + { + "epoch": 0.87, + "grad_norm": 1.5696572065353394, + "learning_rate": 1.1155462184873951e-05, + "loss": 1.4175, + "step": 323000 + }, + { + "epoch": 0.88, + "grad_norm": 1.5421068668365479, + "learning_rate": 1.1123949579831933e-05, + "loss": 1.4134, + "step": 323500 + }, + { + "epoch": 0.88, + "grad_norm": 1.5944511890411377, + "learning_rate": 1.1092436974789916e-05, + "loss": 1.4139, + "step": 324000 + }, + { + "epoch": 0.88, + "grad_norm": 1.4496880769729614, + "learning_rate": 1.10609243697479e-05, + "loss": 1.4131, + "step": 324500 + }, + { + "epoch": 0.88, + "grad_norm": 1.5021952390670776, + "learning_rate": 1.1029411764705883e-05, + "loss": 1.4144, + "step": 325000 + }, + { + "epoch": 0.88, + "grad_norm": 1.5261799097061157, + "learning_rate": 1.0997899159663865e-05, + "loss": 1.4149, + "step": 325500 + }, + { + "epoch": 0.88, + "grad_norm": 1.396974802017212, + "learning_rate": 1.0966386554621848e-05, + "loss": 1.4149, + "step": 326000 + }, + { + "epoch": 0.88, + "grad_norm": 1.561023235321045, + "learning_rate": 1.0934873949579832e-05, + "loss": 1.4183, + "step": 326500 + }, + { + "epoch": 0.89, + "grad_norm": 1.509398102760315, + "learning_rate": 1.0903361344537815e-05, + "loss": 1.4158, + "step": 327000 + }, + { + "epoch": 0.89, + "grad_norm": 1.5046377182006836, + "learning_rate": 1.0871848739495799e-05, + "loss": 1.4137, + "step": 327500 + }, + { + "epoch": 0.89, + "grad_norm": 1.504531979560852, + "learning_rate": 1.0840336134453782e-05, + "loss": 1.4155, + "step": 328000 + }, + { + "epoch": 0.89, + "grad_norm": 1.6807337999343872, + "learning_rate": 1.0808823529411765e-05, + "loss": 1.4161, + "step": 328500 + }, + { + "epoch": 0.89, + "grad_norm": 1.4374127388000488, + "learning_rate": 1.0777310924369749e-05, + "loss": 1.4162, + "step": 329000 + }, + { + "epoch": 0.89, + "grad_norm": 1.4737296104431152, + "learning_rate": 1.0745798319327732e-05, + "loss": 1.4176, + "step": 329500 + }, + { + "epoch": 0.89, + "grad_norm": 1.5063775777816772, + "learning_rate": 1.0714285714285714e-05, + "loss": 1.4128, + "step": 330000 + }, + { + "epoch": 0.89, + "grad_norm": 1.506156325340271, + "learning_rate": 1.0682773109243698e-05, + "loss": 1.4176, + "step": 330500 + }, + { + "epoch": 0.9, + "grad_norm": 1.5394564867019653, + "learning_rate": 1.0651260504201681e-05, + "loss": 1.4119, + "step": 331000 + }, + { + "epoch": 0.9, + "grad_norm": 1.4483675956726074, + "learning_rate": 1.0619747899159664e-05, + "loss": 1.4138, + "step": 331500 + }, + { + "epoch": 0.9, + "grad_norm": 2.412644147872925, + "learning_rate": 1.0588235294117648e-05, + "loss": 1.4146, + "step": 332000 + }, + { + "epoch": 0.9, + "grad_norm": 1.9123421907424927, + "learning_rate": 1.0556722689075631e-05, + "loss": 1.4194, + "step": 332500 + }, + { + "epoch": 0.9, + "grad_norm": 1.4911080598831177, + "learning_rate": 1.0525210084033613e-05, + "loss": 1.418, + "step": 333000 + }, + { + "epoch": 0.9, + "grad_norm": 1.511194109916687, + "learning_rate": 1.0493697478991596e-05, + "loss": 1.4114, + "step": 333500 + }, + { + "epoch": 0.9, + "grad_norm": 1.4733537435531616, + "learning_rate": 1.046218487394958e-05, + "loss": 1.4149, + "step": 334000 + }, + { + "epoch": 0.91, + "grad_norm": 1.4742454290390015, + "learning_rate": 1.0430672268907563e-05, + "loss": 1.4163, + "step": 334500 + }, + { + "epoch": 0.91, + "grad_norm": 1.4842146635055542, + "learning_rate": 1.0399159663865547e-05, + "loss": 1.4118, + "step": 335000 + }, + { + "epoch": 0.91, + "grad_norm": 1.5346875190734863, + "learning_rate": 1.036764705882353e-05, + "loss": 1.4148, + "step": 335500 + }, + { + "epoch": 0.91, + "grad_norm": 1.6554747819900513, + "learning_rate": 1.0336134453781512e-05, + "loss": 1.416, + "step": 336000 + }, + { + "epoch": 0.91, + "grad_norm": 1.5015145540237427, + "learning_rate": 1.0304621848739495e-05, + "loss": 1.4146, + "step": 336500 + }, + { + "epoch": 0.91, + "grad_norm": 1.4634381532669067, + "learning_rate": 1.0273109243697479e-05, + "loss": 1.4199, + "step": 337000 + }, + { + "epoch": 0.91, + "grad_norm": 1.7802950143814087, + "learning_rate": 1.0241596638655462e-05, + "loss": 1.4127, + "step": 337500 + }, + { + "epoch": 0.91, + "grad_norm": 3.0422604084014893, + "learning_rate": 1.0210084033613446e-05, + "loss": 1.4121, + "step": 338000 + }, + { + "epoch": 0.92, + "grad_norm": 1.4957752227783203, + "learning_rate": 1.0178571428571429e-05, + "loss": 1.4151, + "step": 338500 + }, + { + "epoch": 0.92, + "grad_norm": 1.6368649005889893, + "learning_rate": 1.0147058823529413e-05, + "loss": 1.4211, + "step": 339000 + }, + { + "epoch": 0.92, + "grad_norm": 1.493455410003662, + "learning_rate": 1.0115546218487396e-05, + "loss": 1.4131, + "step": 339500 + }, + { + "epoch": 0.92, + "grad_norm": 1.5789108276367188, + "learning_rate": 1.008403361344538e-05, + "loss": 1.413, + "step": 340000 + }, + { + "epoch": 0.92, + "grad_norm": 1.4984022378921509, + "learning_rate": 1.0052521008403361e-05, + "loss": 1.4156, + "step": 340500 + }, + { + "epoch": 0.92, + "grad_norm": 1.443871021270752, + "learning_rate": 1.0021008403361345e-05, + "loss": 1.4123, + "step": 341000 + }, + { + "epoch": 0.92, + "grad_norm": 1.532205581665039, + "learning_rate": 9.989495798319328e-06, + "loss": 1.4145, + "step": 341500 + }, + { + "epoch": 0.93, + "grad_norm": 1.487888216972351, + "learning_rate": 9.957983193277312e-06, + "loss": 1.4132, + "step": 342000 + }, + { + "epoch": 0.93, + "grad_norm": 1.5009286403656006, + "learning_rate": 9.926470588235293e-06, + "loss": 1.4132, + "step": 342500 + }, + { + "epoch": 0.93, + "grad_norm": 1.53665292263031, + "learning_rate": 9.894957983193277e-06, + "loss": 1.4114, + "step": 343000 + }, + { + "epoch": 0.93, + "grad_norm": 1.4559004306793213, + "learning_rate": 9.86344537815126e-06, + "loss": 1.4128, + "step": 343500 + }, + { + "epoch": 0.93, + "grad_norm": 1.472882628440857, + "learning_rate": 9.831932773109244e-06, + "loss": 1.4106, + "step": 344000 + }, + { + "epoch": 0.93, + "grad_norm": 1.528029203414917, + "learning_rate": 9.800420168067227e-06, + "loss": 1.4133, + "step": 344500 + }, + { + "epoch": 0.93, + "grad_norm": 1.4509416818618774, + "learning_rate": 9.76890756302521e-06, + "loss": 1.4099, + "step": 345000 + }, + { + "epoch": 0.94, + "grad_norm": 1.644581913948059, + "learning_rate": 9.737394957983194e-06, + "loss": 1.4102, + "step": 345500 + }, + { + "epoch": 0.94, + "grad_norm": 1.5054335594177246, + "learning_rate": 9.705882352941177e-06, + "loss": 1.4119, + "step": 346000 + }, + { + "epoch": 0.94, + "grad_norm": 1.47361421585083, + "learning_rate": 9.67436974789916e-06, + "loss": 1.4094, + "step": 346500 + }, + { + "epoch": 0.94, + "grad_norm": 1.461796522140503, + "learning_rate": 9.642857142857144e-06, + "loss": 1.4108, + "step": 347000 + }, + { + "epoch": 0.94, + "grad_norm": 1.6115666627883911, + "learning_rate": 9.611344537815128e-06, + "loss": 1.4096, + "step": 347500 + }, + { + "epoch": 0.94, + "grad_norm": 1.526082992553711, + "learning_rate": 9.57983193277311e-06, + "loss": 1.4094, + "step": 348000 + }, + { + "epoch": 0.94, + "grad_norm": 1.4482905864715576, + "learning_rate": 9.548319327731093e-06, + "loss": 1.4082, + "step": 348500 + }, + { + "epoch": 0.94, + "grad_norm": 1.5066174268722534, + "learning_rate": 9.516806722689075e-06, + "loss": 1.4122, + "step": 349000 + }, + { + "epoch": 0.95, + "grad_norm": 1.5225650072097778, + "learning_rate": 9.485294117647058e-06, + "loss": 1.4069, + "step": 349500 + }, + { + "epoch": 0.95, + "grad_norm": 1.4794243574142456, + "learning_rate": 9.453781512605041e-06, + "loss": 1.4087, + "step": 350000 + }, + { + "epoch": 0.95, + "grad_norm": 1.4825611114501953, + "learning_rate": 9.422268907563025e-06, + "loss": 1.4098, + "step": 350500 + }, + { + "epoch": 0.95, + "grad_norm": 1.50911283493042, + "learning_rate": 9.390756302521008e-06, + "loss": 1.4066, + "step": 351000 + }, + { + "epoch": 0.95, + "grad_norm": 1.5070313215255737, + "learning_rate": 9.359243697478992e-06, + "loss": 1.4067, + "step": 351500 + }, + { + "epoch": 0.95, + "grad_norm": 1.4434587955474854, + "learning_rate": 9.327731092436975e-06, + "loss": 1.4074, + "step": 352000 + }, + { + "epoch": 0.95, + "grad_norm": 1.4484858512878418, + "learning_rate": 9.296218487394959e-06, + "loss": 1.4056, + "step": 352500 + }, + { + "epoch": 0.96, + "grad_norm": 1.6141736507415771, + "learning_rate": 9.264705882352942e-06, + "loss": 1.4084, + "step": 353000 + }, + { + "epoch": 0.96, + "grad_norm": 1.4847619533538818, + "learning_rate": 9.233193277310925e-06, + "loss": 1.4092, + "step": 353500 + }, + { + "epoch": 0.96, + "grad_norm": 1.4862167835235596, + "learning_rate": 9.201680672268909e-06, + "loss": 1.4086, + "step": 354000 + }, + { + "epoch": 0.96, + "grad_norm": 1.5454356670379639, + "learning_rate": 9.170168067226892e-06, + "loss": 1.4088, + "step": 354500 + }, + { + "epoch": 0.96, + "grad_norm": 1.4676494598388672, + "learning_rate": 9.138655462184874e-06, + "loss": 1.4094, + "step": 355000 + }, + { + "epoch": 0.96, + "grad_norm": 1.4859504699707031, + "learning_rate": 9.107142857142856e-06, + "loss": 1.4076, + "step": 355500 + }, + { + "epoch": 0.96, + "grad_norm": 1.499040961265564, + "learning_rate": 9.07563025210084e-06, + "loss": 1.4104, + "step": 356000 + }, + { + "epoch": 0.97, + "grad_norm": 1.4864604473114014, + "learning_rate": 9.044117647058823e-06, + "loss": 1.4061, + "step": 356500 + }, + { + "epoch": 0.97, + "grad_norm": 1.4507191181182861, + "learning_rate": 9.012605042016806e-06, + "loss": 1.4062, + "step": 357000 + }, + { + "epoch": 0.97, + "grad_norm": 1.468526840209961, + "learning_rate": 8.98109243697479e-06, + "loss": 1.4081, + "step": 357500 + }, + { + "epoch": 0.97, + "grad_norm": 1.6709305047988892, + "learning_rate": 8.949579831932773e-06, + "loss": 1.4126, + "step": 358000 + }, + { + "epoch": 0.97, + "grad_norm": 1.9611443281173706, + "learning_rate": 8.918067226890756e-06, + "loss": 1.4079, + "step": 358500 + }, + { + "epoch": 0.97, + "grad_norm": 1.6809275150299072, + "learning_rate": 8.88655462184874e-06, + "loss": 1.4114, + "step": 359000 + }, + { + "epoch": 0.97, + "grad_norm": 5.746359825134277, + "learning_rate": 8.855042016806723e-06, + "loss": 1.4084, + "step": 359500 + }, + { + "epoch": 0.97, + "grad_norm": 5.197726726531982, + "learning_rate": 8.823529411764707e-06, + "loss": 1.4066, + "step": 360000 + }, + { + "epoch": 0.98, + "grad_norm": 1.4346739053726196, + "learning_rate": 8.79201680672269e-06, + "loss": 1.4066, + "step": 360500 + }, + { + "epoch": 0.98, + "grad_norm": 1.571542739868164, + "learning_rate": 8.760504201680674e-06, + "loss": 1.4097, + "step": 361000 + }, + { + "epoch": 0.98, + "grad_norm": 1.5356281995773315, + "learning_rate": 8.728991596638657e-06, + "loss": 1.4045, + "step": 361500 + }, + { + "epoch": 0.98, + "grad_norm": 1.7401924133300781, + "learning_rate": 8.697478991596639e-06, + "loss": 1.4067, + "step": 362000 + }, + { + "epoch": 0.98, + "grad_norm": 1.5491187572479248, + "learning_rate": 8.66596638655462e-06, + "loss": 1.4042, + "step": 362500 + }, + { + "epoch": 0.98, + "grad_norm": 1.5863696336746216, + "learning_rate": 8.634453781512604e-06, + "loss": 1.4074, + "step": 363000 + }, + { + "epoch": 0.98, + "grad_norm": 1.450952410697937, + "learning_rate": 8.602941176470587e-06, + "loss": 1.4076, + "step": 363500 + }, + { + "epoch": 0.99, + "grad_norm": 1.5750932693481445, + "learning_rate": 8.571428571428571e-06, + "loss": 1.41, + "step": 364000 + }, + { + "epoch": 0.99, + "grad_norm": 1.4661774635314941, + "learning_rate": 8.539915966386554e-06, + "loss": 1.4091, + "step": 364500 + }, + { + "epoch": 0.99, + "grad_norm": 1.540864109992981, + "learning_rate": 8.508403361344538e-06, + "loss": 1.4052, + "step": 365000 + }, + { + "epoch": 0.99, + "grad_norm": 1.5120595693588257, + "learning_rate": 8.476890756302521e-06, + "loss": 1.4072, + "step": 365500 + }, + { + "epoch": 0.99, + "grad_norm": 1.5357037782669067, + "learning_rate": 8.445378151260505e-06, + "loss": 1.4097, + "step": 366000 + }, + { + "epoch": 0.99, + "grad_norm": 1.5010443925857544, + "learning_rate": 8.413865546218488e-06, + "loss": 1.4094, + "step": 366500 + }, + { + "epoch": 0.99, + "grad_norm": 1.4643309116363525, + "learning_rate": 8.382352941176472e-06, + "loss": 1.4077, + "step": 367000 + }, + { + "epoch": 0.99, + "grad_norm": 1.4524095058441162, + "learning_rate": 8.350840336134455e-06, + "loss": 1.4065, + "step": 367500 + }, + { + "epoch": 1.0, + "grad_norm": 1.5203324556350708, + "learning_rate": 8.319327731092438e-06, + "loss": 1.4035, + "step": 368000 + }, + { + "epoch": 1.0, + "grad_norm": 1.4688167572021484, + "learning_rate": 8.28781512605042e-06, + "loss": 1.4067, + "step": 368500 + }, + { + "epoch": 1.0, + "grad_norm": 1.5595752000808716, + "learning_rate": 8.256302521008404e-06, + "loss": 1.4059, + "step": 369000 + }, + { + "epoch": 1.0, + "grad_norm": 1.4404747486114502, + "learning_rate": 8.224789915966387e-06, + "loss": 1.4035, + "step": 369500 + }, + { + "epoch": 1.0, + "grad_norm": 1.6032897233963013, + "learning_rate": 8.193277310924369e-06, + "loss": 1.4001, + "step": 370000 + }, + { + "epoch": 1.0, + "grad_norm": 1.6836262941360474, + "learning_rate": 8.161764705882352e-06, + "loss": 1.3981, + "step": 370500 + }, + { + "epoch": 1.0, + "grad_norm": 1.5205241441726685, + "learning_rate": 8.130252100840336e-06, + "loss": 1.3994, + "step": 371000 + }, + { + "epoch": 1.01, + "grad_norm": 1.7194490432739258, + "learning_rate": 8.098739495798319e-06, + "loss": 1.4027, + "step": 371500 + }, + { + "epoch": 1.01, + "grad_norm": 1.4517977237701416, + "learning_rate": 8.067226890756303e-06, + "loss": 1.4022, + "step": 372000 + }, + { + "epoch": 1.01, + "grad_norm": 1.6818935871124268, + "learning_rate": 8.035714285714286e-06, + "loss": 1.4028, + "step": 372500 + }, + { + "epoch": 1.01, + "grad_norm": 1.5117074251174927, + "learning_rate": 8.00420168067227e-06, + "loss": 1.4021, + "step": 373000 + }, + { + "epoch": 1.01, + "grad_norm": 1.4689205884933472, + "learning_rate": 7.972689075630253e-06, + "loss": 1.4057, + "step": 373500 + }, + { + "epoch": 1.01, + "grad_norm": 1.525889277458191, + "learning_rate": 7.941176470588236e-06, + "loss": 1.4041, + "step": 374000 + }, + { + "epoch": 1.01, + "grad_norm": 1.4896938800811768, + "learning_rate": 7.90966386554622e-06, + "loss": 1.4027, + "step": 374500 + }, + { + "epoch": 1.02, + "grad_norm": 1.4765034914016724, + "learning_rate": 7.878151260504201e-06, + "loss": 1.4005, + "step": 375000 + }, + { + "epoch": 1.02, + "grad_norm": 1.5386637449264526, + "learning_rate": 7.846638655462185e-06, + "loss": 1.397, + "step": 375500 + }, + { + "epoch": 1.02, + "grad_norm": 1.4808331727981567, + "learning_rate": 7.815126050420168e-06, + "loss": 1.401, + "step": 376000 + }, + { + "epoch": 1.02, + "grad_norm": 1.517560362815857, + "learning_rate": 7.783613445378152e-06, + "loss": 1.4037, + "step": 376500 + }, + { + "epoch": 1.02, + "grad_norm": 1.6733453273773193, + "learning_rate": 7.752100840336135e-06, + "loss": 1.3976, + "step": 377000 + }, + { + "epoch": 1.02, + "grad_norm": 1.480815052986145, + "learning_rate": 7.720588235294117e-06, + "loss": 1.4, + "step": 377500 + }, + { + "epoch": 1.02, + "grad_norm": 1.4836503267288208, + "learning_rate": 7.6890756302521e-06, + "loss": 1.3977, + "step": 378000 + }, + { + "epoch": 1.02, + "grad_norm": 1.442256212234497, + "learning_rate": 7.657563025210084e-06, + "loss": 1.399, + "step": 378500 + }, + { + "epoch": 1.03, + "grad_norm": 1.8496633768081665, + "learning_rate": 7.626050420168067e-06, + "loss": 1.4038, + "step": 379000 + }, + { + "epoch": 1.03, + "grad_norm": 1.4886460304260254, + "learning_rate": 7.59453781512605e-06, + "loss": 1.4061, + "step": 379500 + }, + { + "epoch": 1.03, + "grad_norm": 1.550764799118042, + "learning_rate": 7.563025210084033e-06, + "loss": 1.4003, + "step": 380000 + }, + { + "epoch": 1.03, + "grad_norm": 1.5111615657806396, + "learning_rate": 7.531512605042017e-06, + "loss": 1.4021, + "step": 380500 + }, + { + "epoch": 1.03, + "grad_norm": 1.5873339176177979, + "learning_rate": 7.5e-06, + "loss": 1.4003, + "step": 381000 + }, + { + "epoch": 1.03, + "grad_norm": 1.5139081478118896, + "learning_rate": 7.468487394957984e-06, + "loss": 1.3974, + "step": 381500 + }, + { + "epoch": 1.03, + "grad_norm": 1.4700753688812256, + "learning_rate": 7.436974789915967e-06, + "loss": 1.4009, + "step": 382000 + }, + { + "epoch": 1.04, + "grad_norm": 1.4294934272766113, + "learning_rate": 7.4054621848739505e-06, + "loss": 1.3997, + "step": 382500 + }, + { + "epoch": 1.04, + "grad_norm": 1.432667851448059, + "learning_rate": 7.373949579831932e-06, + "loss": 1.3992, + "step": 383000 + }, + { + "epoch": 1.04, + "grad_norm": 1.6012872457504272, + "learning_rate": 7.342436974789916e-06, + "loss": 1.3988, + "step": 383500 + }, + { + "epoch": 1.04, + "grad_norm": 1.5000537633895874, + "learning_rate": 7.310924369747899e-06, + "loss": 1.399, + "step": 384000 + }, + { + "epoch": 1.04, + "grad_norm": 1.5064808130264282, + "learning_rate": 7.2794117647058826e-06, + "loss": 1.4022, + "step": 384500 + }, + { + "epoch": 1.04, + "grad_norm": 1.5001455545425415, + "learning_rate": 7.247899159663866e-06, + "loss": 1.3947, + "step": 385000 + }, + { + "epoch": 1.04, + "grad_norm": 1.4360790252685547, + "learning_rate": 7.2163865546218494e-06, + "loss": 1.3983, + "step": 385500 + }, + { + "epoch": 1.04, + "grad_norm": 1.4993146657943726, + "learning_rate": 7.184873949579832e-06, + "loss": 1.3987, + "step": 386000 + }, + { + "epoch": 1.05, + "grad_norm": 1.4621449708938599, + "learning_rate": 7.1533613445378155e-06, + "loss": 1.3974, + "step": 386500 + }, + { + "epoch": 1.05, + "grad_norm": 1.7409414052963257, + "learning_rate": 7.121848739495798e-06, + "loss": 1.4004, + "step": 387000 + }, + { + "epoch": 1.05, + "grad_norm": 1.4486150741577148, + "learning_rate": 7.0903361344537815e-06, + "loss": 1.3982, + "step": 387500 + }, + { + "epoch": 1.05, + "grad_norm": 1.5252596139907837, + "learning_rate": 7.058823529411765e-06, + "loss": 1.4013, + "step": 388000 + }, + { + "epoch": 1.05, + "grad_norm": 1.4874343872070312, + "learning_rate": 7.027310924369748e-06, + "loss": 1.3995, + "step": 388500 + }, + { + "epoch": 1.05, + "grad_norm": 1.5078623294830322, + "learning_rate": 6.995798319327732e-06, + "loss": 1.3985, + "step": 389000 + }, + { + "epoch": 1.05, + "grad_norm": 1.5256296396255493, + "learning_rate": 6.964285714285714e-06, + "loss": 1.4005, + "step": 389500 + }, + { + "epoch": 1.06, + "grad_norm": 1.5369598865509033, + "learning_rate": 6.932773109243698e-06, + "loss": 1.3929, + "step": 390000 + }, + { + "epoch": 1.06, + "grad_norm": 1.4955265522003174, + "learning_rate": 6.9012605042016804e-06, + "loss": 1.3968, + "step": 390500 + }, + { + "epoch": 1.06, + "grad_norm": 1.501406192779541, + "learning_rate": 6.869747899159664e-06, + "loss": 1.3982, + "step": 391000 + }, + { + "epoch": 1.06, + "grad_norm": 1.5695279836654663, + "learning_rate": 6.838235294117647e-06, + "loss": 1.3986, + "step": 391500 + }, + { + "epoch": 1.06, + "grad_norm": 1.590920329093933, + "learning_rate": 6.806722689075631e-06, + "loss": 1.3989, + "step": 392000 + }, + { + "epoch": 1.06, + "grad_norm": 1.4469817876815796, + "learning_rate": 6.775210084033613e-06, + "loss": 1.3958, + "step": 392500 + }, + { + "epoch": 1.06, + "grad_norm": 1.4517157077789307, + "learning_rate": 6.743697478991597e-06, + "loss": 1.3948, + "step": 393000 + }, + { + "epoch": 1.07, + "grad_norm": 1.477184534072876, + "learning_rate": 6.71218487394958e-06, + "loss": 1.3955, + "step": 393500 + }, + { + "epoch": 1.07, + "grad_norm": 2.1850063800811768, + "learning_rate": 6.680672268907564e-06, + "loss": 1.3977, + "step": 394000 + }, + { + "epoch": 1.07, + "grad_norm": 1.4544538259506226, + "learning_rate": 6.649159663865546e-06, + "loss": 1.3974, + "step": 394500 + }, + { + "epoch": 1.07, + "grad_norm": 1.4682557582855225, + "learning_rate": 6.61764705882353e-06, + "loss": 1.3976, + "step": 395000 + }, + { + "epoch": 1.07, + "grad_norm": 1.4401472806930542, + "learning_rate": 6.586134453781512e-06, + "loss": 1.4002, + "step": 395500 + }, + { + "epoch": 1.07, + "grad_norm": 1.5497291088104248, + "learning_rate": 6.554621848739496e-06, + "loss": 1.3945, + "step": 396000 + }, + { + "epoch": 1.07, + "grad_norm": 1.525145173072815, + "learning_rate": 6.523109243697479e-06, + "loss": 1.4006, + "step": 396500 + }, + { + "epoch": 1.07, + "grad_norm": 1.5119032859802246, + "learning_rate": 6.491596638655463e-06, + "loss": 1.3984, + "step": 397000 + }, + { + "epoch": 1.08, + "grad_norm": 1.7145532369613647, + "learning_rate": 6.460084033613446e-06, + "loss": 1.398, + "step": 397500 + }, + { + "epoch": 1.08, + "grad_norm": 1.5175354480743408, + "learning_rate": 6.428571428571429e-06, + "loss": 1.3971, + "step": 398000 + }, + { + "epoch": 1.08, + "grad_norm": 1.4529006481170654, + "learning_rate": 6.397058823529412e-06, + "loss": 1.3986, + "step": 398500 + }, + { + "epoch": 1.08, + "grad_norm": 1.4779740571975708, + "learning_rate": 6.365546218487395e-06, + "loss": 1.3985, + "step": 399000 + }, + { + "epoch": 1.08, + "grad_norm": 1.591557502746582, + "learning_rate": 6.334033613445378e-06, + "loss": 1.3971, + "step": 399500 + }, + { + "epoch": 1.08, + "grad_norm": 1.5829887390136719, + "learning_rate": 6.3025210084033615e-06, + "loss": 1.3989, + "step": 400000 + }, + { + "epoch": 1.08, + "grad_norm": 1.546576976776123, + "learning_rate": 6.271008403361345e-06, + "loss": 1.398, + "step": 400500 + }, + { + "epoch": 1.09, + "grad_norm": 1.4360915422439575, + "learning_rate": 6.239495798319328e-06, + "loss": 1.3933, + "step": 401000 + }, + { + "epoch": 1.09, + "grad_norm": 1.555240273475647, + "learning_rate": 6.207983193277311e-06, + "loss": 1.3964, + "step": 401500 + }, + { + "epoch": 1.09, + "grad_norm": 1.5486465692520142, + "learning_rate": 6.176470588235294e-06, + "loss": 1.3922, + "step": 402000 + }, + { + "epoch": 1.09, + "grad_norm": 1.6140353679656982, + "learning_rate": 6.144957983193277e-06, + "loss": 1.3941, + "step": 402500 + }, + { + "epoch": 1.09, + "grad_norm": 1.422938346862793, + "learning_rate": 6.1134453781512605e-06, + "loss": 1.3946, + "step": 403000 + }, + { + "epoch": 1.09, + "grad_norm": 1.673789620399475, + "learning_rate": 6.081932773109244e-06, + "loss": 1.3965, + "step": 403500 + }, + { + "epoch": 1.09, + "grad_norm": 1.52051842212677, + "learning_rate": 6.050420168067227e-06, + "loss": 1.3935, + "step": 404000 + }, + { + "epoch": 1.09, + "grad_norm": 1.5157978534698486, + "learning_rate": 6.018907563025211e-06, + "loss": 1.3938, + "step": 404500 + }, + { + "epoch": 1.1, + "grad_norm": 1.5434610843658447, + "learning_rate": 5.987394957983194e-06, + "loss": 1.3931, + "step": 405000 + }, + { + "epoch": 1.1, + "grad_norm": 1.7399873733520508, + "learning_rate": 5.955882352941176e-06, + "loss": 1.3924, + "step": 405500 + }, + { + "epoch": 1.1, + "grad_norm": 1.482820749282837, + "learning_rate": 5.924369747899159e-06, + "loss": 1.3923, + "step": 406000 + }, + { + "epoch": 1.1, + "grad_norm": 4.893394947052002, + "learning_rate": 5.892857142857143e-06, + "loss": 1.393, + "step": 406500 + }, + { + "epoch": 1.1, + "grad_norm": 1.538550615310669, + "learning_rate": 5.861344537815126e-06, + "loss": 1.3938, + "step": 407000 + }, + { + "epoch": 1.1, + "grad_norm": 1.4997118711471558, + "learning_rate": 5.82983193277311e-06, + "loss": 1.3934, + "step": 407500 + }, + { + "epoch": 1.1, + "grad_norm": 1.5265237092971802, + "learning_rate": 5.798319327731093e-06, + "loss": 1.3915, + "step": 408000 + }, + { + "epoch": 1.11, + "grad_norm": 1.6841180324554443, + "learning_rate": 5.766806722689076e-06, + "loss": 1.3946, + "step": 408500 + }, + { + "epoch": 1.11, + "grad_norm": 1.4722718000411987, + "learning_rate": 5.735294117647058e-06, + "loss": 1.3949, + "step": 409000 + }, + { + "epoch": 1.11, + "grad_norm": 2.087042808532715, + "learning_rate": 5.703781512605042e-06, + "loss": 1.3925, + "step": 409500 + }, + { + "epoch": 1.11, + "grad_norm": 1.4858590364456177, + "learning_rate": 5.672268907563025e-06, + "loss": 1.3943, + "step": 410000 + }, + { + "epoch": 1.11, + "grad_norm": 1.4591546058654785, + "learning_rate": 5.640756302521009e-06, + "loss": 1.3924, + "step": 410500 + }, + { + "epoch": 1.11, + "grad_norm": 1.4490437507629395, + "learning_rate": 5.609243697478992e-06, + "loss": 1.3949, + "step": 411000 + }, + { + "epoch": 1.11, + "grad_norm": 1.5795851945877075, + "learning_rate": 5.5777310924369755e-06, + "loss": 1.3951, + "step": 411500 + }, + { + "epoch": 1.12, + "grad_norm": 1.5447410345077515, + "learning_rate": 5.546218487394958e-06, + "loss": 1.396, + "step": 412000 + }, + { + "epoch": 1.12, + "grad_norm": 1.510696530342102, + "learning_rate": 5.5147058823529415e-06, + "loss": 1.3929, + "step": 412500 + }, + { + "epoch": 1.12, + "grad_norm": 1.52991783618927, + "learning_rate": 5.483193277310924e-06, + "loss": 1.393, + "step": 413000 + }, + { + "epoch": 1.12, + "grad_norm": 1.5724798440933228, + "learning_rate": 5.4516806722689076e-06, + "loss": 1.3933, + "step": 413500 + }, + { + "epoch": 1.12, + "grad_norm": 1.9198040962219238, + "learning_rate": 5.420168067226891e-06, + "loss": 1.3934, + "step": 414000 + }, + { + "epoch": 1.12, + "grad_norm": 1.5322943925857544, + "learning_rate": 5.3886554621848744e-06, + "loss": 1.3925, + "step": 414500 + }, + { + "epoch": 1.12, + "grad_norm": 1.4684040546417236, + "learning_rate": 5.357142857142857e-06, + "loss": 1.3933, + "step": 415000 + }, + { + "epoch": 1.12, + "grad_norm": 1.4797214269638062, + "learning_rate": 5.3256302521008405e-06, + "loss": 1.3925, + "step": 415500 + }, + { + "epoch": 1.13, + "grad_norm": 1.524305820465088, + "learning_rate": 5.294117647058824e-06, + "loss": 1.3929, + "step": 416000 + }, + { + "epoch": 1.13, + "grad_norm": 1.4858139753341675, + "learning_rate": 5.2626050420168065e-06, + "loss": 1.3881, + "step": 416500 + }, + { + "epoch": 1.13, + "grad_norm": 1.5586313009262085, + "learning_rate": 5.23109243697479e-06, + "loss": 1.393, + "step": 417000 + }, + { + "epoch": 1.13, + "grad_norm": 1.54250168800354, + "learning_rate": 5.199579831932773e-06, + "loss": 1.3926, + "step": 417500 + }, + { + "epoch": 1.13, + "grad_norm": 9.902482986450195, + "learning_rate": 5.168067226890756e-06, + "loss": 1.3923, + "step": 418000 + }, + { + "epoch": 1.13, + "grad_norm": 3.239046573638916, + "learning_rate": 5.136554621848739e-06, + "loss": 1.3925, + "step": 418500 + }, + { + "epoch": 1.13, + "grad_norm": 1.5059127807617188, + "learning_rate": 5.105042016806723e-06, + "loss": 1.3936, + "step": 419000 + }, + { + "epoch": 1.14, + "grad_norm": 1.5107486248016357, + "learning_rate": 5.073529411764706e-06, + "loss": 1.3942, + "step": 419500 + }, + { + "epoch": 1.14, + "grad_norm": 1.577019214630127, + "learning_rate": 5.04201680672269e-06, + "loss": 1.3896, + "step": 420000 + }, + { + "epoch": 1.14, + "grad_norm": 1.4538390636444092, + "learning_rate": 5.010504201680672e-06, + "loss": 1.387, + "step": 420500 + }, + { + "epoch": 1.14, + "grad_norm": 1.593549132347107, + "learning_rate": 4.978991596638656e-06, + "loss": 1.3908, + "step": 421000 + }, + { + "epoch": 1.14, + "grad_norm": 1.4725204706192017, + "learning_rate": 4.947478991596638e-06, + "loss": 1.3904, + "step": 421500 + }, + { + "epoch": 1.14, + "grad_norm": 1.4892488718032837, + "learning_rate": 4.915966386554622e-06, + "loss": 1.3896, + "step": 422000 + }, + { + "epoch": 1.14, + "grad_norm": 1.503003478050232, + "learning_rate": 4.884453781512605e-06, + "loss": 1.3901, + "step": 422500 + }, + { + "epoch": 1.15, + "grad_norm": 1.5650583505630493, + "learning_rate": 4.852941176470589e-06, + "loss": 1.3879, + "step": 423000 + }, + { + "epoch": 1.15, + "grad_norm": 1.5746469497680664, + "learning_rate": 4.821428571428572e-06, + "loss": 1.3898, + "step": 423500 + }, + { + "epoch": 1.15, + "grad_norm": 1.4636718034744263, + "learning_rate": 4.789915966386555e-06, + "loss": 1.3946, + "step": 424000 + }, + { + "epoch": 1.15, + "grad_norm": 1.5072635412216187, + "learning_rate": 4.758403361344537e-06, + "loss": 1.3936, + "step": 424500 + }, + { + "epoch": 1.15, + "grad_norm": 1.9211359024047852, + "learning_rate": 4.726890756302521e-06, + "loss": 1.3919, + "step": 425000 + }, + { + "epoch": 1.15, + "grad_norm": 1.6186763048171997, + "learning_rate": 4.695378151260504e-06, + "loss": 1.3874, + "step": 425500 + }, + { + "epoch": 1.15, + "grad_norm": 1.6086759567260742, + "learning_rate": 4.663865546218488e-06, + "loss": 1.3911, + "step": 426000 + }, + { + "epoch": 1.15, + "grad_norm": 1.4456268548965454, + "learning_rate": 4.632352941176471e-06, + "loss": 1.3888, + "step": 426500 + }, + { + "epoch": 1.16, + "grad_norm": 1.5766582489013672, + "learning_rate": 4.6008403361344545e-06, + "loss": 1.3884, + "step": 427000 + }, + { + "epoch": 1.16, + "grad_norm": 1.4081532955169678, + "learning_rate": 4.569327731092437e-06, + "loss": 1.3904, + "step": 427500 + }, + { + "epoch": 1.16, + "grad_norm": 1.4901301860809326, + "learning_rate": 4.53781512605042e-06, + "loss": 1.389, + "step": 428000 + }, + { + "epoch": 1.16, + "grad_norm": 1.5027050971984863, + "learning_rate": 4.506302521008403e-06, + "loss": 1.3931, + "step": 428500 + }, + { + "epoch": 1.16, + "grad_norm": 1.4869219064712524, + "learning_rate": 4.4747899159663865e-06, + "loss": 1.3888, + "step": 429000 + }, + { + "epoch": 1.16, + "grad_norm": 1.439729928970337, + "learning_rate": 4.44327731092437e-06, + "loss": 1.3897, + "step": 429500 + }, + { + "epoch": 1.16, + "grad_norm": 1.5325324535369873, + "learning_rate": 4.411764705882353e-06, + "loss": 1.3891, + "step": 430000 + }, + { + "epoch": 1.17, + "grad_norm": 1.5293645858764648, + "learning_rate": 4.380252100840337e-06, + "loss": 1.3902, + "step": 430500 + }, + { + "epoch": 1.17, + "grad_norm": 1.4475960731506348, + "learning_rate": 4.3487394957983194e-06, + "loss": 1.388, + "step": 431000 + }, + { + "epoch": 1.17, + "grad_norm": 1.5612802505493164, + "learning_rate": 4.317226890756302e-06, + "loss": 1.3885, + "step": 431500 + }, + { + "epoch": 1.17, + "grad_norm": 1.682928204536438, + "learning_rate": 4.2857142857142855e-06, + "loss": 1.3899, + "step": 432000 + }, + { + "epoch": 1.17, + "grad_norm": 1.5231236219406128, + "learning_rate": 4.254201680672269e-06, + "loss": 1.3877, + "step": 432500 + }, + { + "epoch": 1.17, + "grad_norm": 1.446148157119751, + "learning_rate": 4.222689075630252e-06, + "loss": 1.3901, + "step": 433000 + }, + { + "epoch": 1.17, + "grad_norm": 1.4778817892074585, + "learning_rate": 4.191176470588236e-06, + "loss": 1.3865, + "step": 433500 + }, + { + "epoch": 1.17, + "grad_norm": 1.5888080596923828, + "learning_rate": 4.159663865546219e-06, + "loss": 1.3872, + "step": 434000 + }, + { + "epoch": 1.18, + "grad_norm": 1.6371558904647827, + "learning_rate": 4.128151260504202e-06, + "loss": 1.3893, + "step": 434500 + }, + { + "epoch": 1.18, + "grad_norm": 1.4442592859268188, + "learning_rate": 4.096638655462184e-06, + "loss": 1.3934, + "step": 435000 + }, + { + "epoch": 1.18, + "grad_norm": 1.7637091875076294, + "learning_rate": 4.065126050420168e-06, + "loss": 1.3892, + "step": 435500 + }, + { + "epoch": 1.18, + "grad_norm": 1.4838693141937256, + "learning_rate": 4.033613445378151e-06, + "loss": 1.3866, + "step": 436000 + }, + { + "epoch": 1.18, + "grad_norm": 1.5558868646621704, + "learning_rate": 4.002100840336135e-06, + "loss": 1.3914, + "step": 436500 + }, + { + "epoch": 1.18, + "grad_norm": 1.8331657648086548, + "learning_rate": 3.970588235294118e-06, + "loss": 1.389, + "step": 437000 + }, + { + "epoch": 1.18, + "grad_norm": 1.7367424964904785, + "learning_rate": 3.939075630252101e-06, + "loss": 1.3897, + "step": 437500 + }, + { + "epoch": 1.19, + "grad_norm": 1.5316094160079956, + "learning_rate": 3.907563025210084e-06, + "loss": 1.3871, + "step": 438000 + }, + { + "epoch": 1.19, + "grad_norm": 1.5062899589538574, + "learning_rate": 3.876050420168068e-06, + "loss": 1.3876, + "step": 438500 + }, + { + "epoch": 1.19, + "grad_norm": 1.5399343967437744, + "learning_rate": 3.84453781512605e-06, + "loss": 1.3873, + "step": 439000 + }, + { + "epoch": 1.19, + "grad_norm": 1.8311206102371216, + "learning_rate": 3.8130252100840336e-06, + "loss": 1.3886, + "step": 439500 + }, + { + "epoch": 1.19, + "grad_norm": 1.5011011362075806, + "learning_rate": 3.7815126050420167e-06, + "loss": 1.3877, + "step": 440000 + }, + { + "epoch": 1.19, + "grad_norm": 1.5647181272506714, + "learning_rate": 3.75e-06, + "loss": 1.3895, + "step": 440500 + }, + { + "epoch": 1.19, + "grad_norm": 1.9663615226745605, + "learning_rate": 3.7184873949579835e-06, + "loss": 1.3884, + "step": 441000 + }, + { + "epoch": 1.2, + "grad_norm": 2.4808692932128906, + "learning_rate": 3.686974789915966e-06, + "loss": 1.3879, + "step": 441500 + }, + { + "epoch": 1.2, + "grad_norm": 1.4271633625030518, + "learning_rate": 3.6554621848739496e-06, + "loss": 1.3913, + "step": 442000 + }, + { + "epoch": 1.2, + "grad_norm": 1.5341715812683105, + "learning_rate": 3.623949579831933e-06, + "loss": 1.3874, + "step": 442500 + }, + { + "epoch": 1.2, + "grad_norm": 1.4926517009735107, + "learning_rate": 3.592436974789916e-06, + "loss": 1.3873, + "step": 443000 + }, + { + "epoch": 1.2, + "grad_norm": 1.4709627628326416, + "learning_rate": 3.560924369747899e-06, + "loss": 1.3856, + "step": 443500 + }, + { + "epoch": 1.2, + "grad_norm": 1.4797513484954834, + "learning_rate": 3.5294117647058825e-06, + "loss": 1.3874, + "step": 444000 + }, + { + "epoch": 1.2, + "grad_norm": 1.506548523902893, + "learning_rate": 3.497899159663866e-06, + "loss": 1.3859, + "step": 444500 + }, + { + "epoch": 1.2, + "grad_norm": 1.4667857885360718, + "learning_rate": 3.466386554621849e-06, + "loss": 1.3889, + "step": 445000 + }, + { + "epoch": 1.21, + "grad_norm": 1.4796762466430664, + "learning_rate": 3.434873949579832e-06, + "loss": 1.3912, + "step": 445500 + }, + { + "epoch": 1.21, + "grad_norm": 1.534725546836853, + "learning_rate": 3.4033613445378154e-06, + "loss": 1.3881, + "step": 446000 + }, + { + "epoch": 1.21, + "grad_norm": 1.6512054204940796, + "learning_rate": 3.3718487394957984e-06, + "loss": 1.3874, + "step": 446500 + }, + { + "epoch": 1.21, + "grad_norm": 1.4926962852478027, + "learning_rate": 3.340336134453782e-06, + "loss": 1.3844, + "step": 447000 + }, + { + "epoch": 1.21, + "grad_norm": 1.479819416999817, + "learning_rate": 3.308823529411765e-06, + "loss": 1.3862, + "step": 447500 + }, + { + "epoch": 1.21, + "grad_norm": 1.429606318473816, + "learning_rate": 3.277310924369748e-06, + "loss": 1.3864, + "step": 448000 + }, + { + "epoch": 1.21, + "grad_norm": 1.526227593421936, + "learning_rate": 3.2457983193277313e-06, + "loss": 1.388, + "step": 448500 + }, + { + "epoch": 1.22, + "grad_norm": 1.5270380973815918, + "learning_rate": 3.2142857142857143e-06, + "loss": 1.3898, + "step": 449000 + }, + { + "epoch": 1.22, + "grad_norm": 1.6459033489227295, + "learning_rate": 3.1827731092436973e-06, + "loss": 1.3872, + "step": 449500 + }, + { + "epoch": 1.22, + "grad_norm": 1.5082780122756958, + "learning_rate": 3.1512605042016808e-06, + "loss": 1.3864, + "step": 450000 + }, + { + "epoch": 1.22, + "grad_norm": 1.4675207138061523, + "learning_rate": 3.119747899159664e-06, + "loss": 1.3858, + "step": 450500 + }, + { + "epoch": 1.22, + "grad_norm": 1.5487087965011597, + "learning_rate": 3.088235294117647e-06, + "loss": 1.3859, + "step": 451000 + }, + { + "epoch": 1.22, + "grad_norm": 1.5166810750961304, + "learning_rate": 3.0567226890756302e-06, + "loss": 1.3838, + "step": 451500 + }, + { + "epoch": 1.22, + "grad_norm": 1.4788706302642822, + "learning_rate": 3.0252100840336137e-06, + "loss": 1.3836, + "step": 452000 + }, + { + "epoch": 1.22, + "grad_norm": 1.6381962299346924, + "learning_rate": 2.993697478991597e-06, + "loss": 1.3853, + "step": 452500 + }, + { + "epoch": 1.23, + "grad_norm": 1.4548882246017456, + "learning_rate": 2.9621848739495797e-06, + "loss": 1.3878, + "step": 453000 + }, + { + "epoch": 1.23, + "grad_norm": 1.5543279647827148, + "learning_rate": 2.930672268907563e-06, + "loss": 1.3885, + "step": 453500 + }, + { + "epoch": 1.23, + "grad_norm": 1.5119037628173828, + "learning_rate": 2.8991596638655466e-06, + "loss": 1.3865, + "step": 454000 + }, + { + "epoch": 1.23, + "grad_norm": 1.5338330268859863, + "learning_rate": 2.867647058823529e-06, + "loss": 1.3825, + "step": 454500 + }, + { + "epoch": 1.23, + "grad_norm": 2.100884437561035, + "learning_rate": 2.8361344537815126e-06, + "loss": 1.3894, + "step": 455000 + }, + { + "epoch": 1.23, + "grad_norm": 1.4853757619857788, + "learning_rate": 2.804621848739496e-06, + "loss": 1.385, + "step": 455500 + }, + { + "epoch": 1.23, + "grad_norm": 1.545937180519104, + "learning_rate": 2.773109243697479e-06, + "loss": 1.3875, + "step": 456000 + }, + { + "epoch": 1.24, + "grad_norm": 1.4860107898712158, + "learning_rate": 2.741596638655462e-06, + "loss": 1.3839, + "step": 456500 + }, + { + "epoch": 1.24, + "grad_norm": 1.5260435342788696, + "learning_rate": 2.7100840336134455e-06, + "loss": 1.3815, + "step": 457000 + }, + { + "epoch": 1.24, + "grad_norm": 1.5752997398376465, + "learning_rate": 2.6785714285714285e-06, + "loss": 1.3845, + "step": 457500 + }, + { + "epoch": 1.24, + "grad_norm": 1.5157984495162964, + "learning_rate": 2.647058823529412e-06, + "loss": 1.3831, + "step": 458000 + }, + { + "epoch": 1.24, + "grad_norm": 1.5206942558288574, + "learning_rate": 2.615546218487395e-06, + "loss": 1.3866, + "step": 458500 + }, + { + "epoch": 1.24, + "grad_norm": 1.524672508239746, + "learning_rate": 2.584033613445378e-06, + "loss": 1.3869, + "step": 459000 + }, + { + "epoch": 1.24, + "grad_norm": 6.727693557739258, + "learning_rate": 2.5525210084033614e-06, + "loss": 1.3805, + "step": 459500 + }, + { + "epoch": 1.25, + "grad_norm": 1.5827701091766357, + "learning_rate": 2.521008403361345e-06, + "loss": 1.39, + "step": 460000 + }, + { + "epoch": 1.25, + "grad_norm": 1.4831866025924683, + "learning_rate": 2.489495798319328e-06, + "loss": 1.3886, + "step": 460500 + }, + { + "epoch": 1.25, + "grad_norm": 1.5272330045700073, + "learning_rate": 2.457983193277311e-06, + "loss": 1.3889, + "step": 461000 + }, + { + "epoch": 1.25, + "grad_norm": 1.478623628616333, + "learning_rate": 2.4264705882352943e-06, + "loss": 1.3878, + "step": 461500 + }, + { + "epoch": 1.25, + "grad_norm": 1.5272207260131836, + "learning_rate": 2.3949579831932773e-06, + "loss": 1.3834, + "step": 462000 + }, + { + "epoch": 1.25, + "grad_norm": 1.574120044708252, + "learning_rate": 2.3634453781512604e-06, + "loss": 1.3852, + "step": 462500 + }, + { + "epoch": 1.25, + "grad_norm": 1.5751044750213623, + "learning_rate": 2.331932773109244e-06, + "loss": 1.3829, + "step": 463000 + }, + { + "epoch": 1.25, + "grad_norm": 1.4704902172088623, + "learning_rate": 2.3004201680672272e-06, + "loss": 1.3817, + "step": 463500 + }, + { + "epoch": 1.26, + "grad_norm": 2.406973123550415, + "learning_rate": 2.26890756302521e-06, + "loss": 1.3872, + "step": 464000 + }, + { + "epoch": 1.26, + "grad_norm": 1.4869129657745361, + "learning_rate": 2.2373949579831933e-06, + "loss": 1.3825, + "step": 464500 + }, + { + "epoch": 1.26, + "grad_norm": 1.5050959587097168, + "learning_rate": 2.2058823529411767e-06, + "loss": 1.3821, + "step": 465000 + }, + { + "epoch": 1.26, + "grad_norm": 1.4652327299118042, + "learning_rate": 2.1743697478991597e-06, + "loss": 1.3831, + "step": 465500 + }, + { + "epoch": 1.26, + "grad_norm": 1.6011298894882202, + "learning_rate": 2.1428571428571427e-06, + "loss": 1.3824, + "step": 466000 + }, + { + "epoch": 1.26, + "grad_norm": 1.589460015296936, + "learning_rate": 2.111344537815126e-06, + "loss": 1.3816, + "step": 466500 + }, + { + "epoch": 1.26, + "grad_norm": 1.679612636566162, + "learning_rate": 2.0798319327731096e-06, + "loss": 1.383, + "step": 467000 + }, + { + "epoch": 1.27, + "grad_norm": 5.37538480758667, + "learning_rate": 2.048319327731092e-06, + "loss": 1.3818, + "step": 467500 + }, + { + "epoch": 1.27, + "grad_norm": 1.5256156921386719, + "learning_rate": 2.0168067226890756e-06, + "loss": 1.383, + "step": 468000 + }, + { + "epoch": 1.27, + "grad_norm": 1.546476125717163, + "learning_rate": 1.985294117647059e-06, + "loss": 1.3841, + "step": 468500 + }, + { + "epoch": 1.27, + "grad_norm": 1.429592251777649, + "learning_rate": 1.953781512605042e-06, + "loss": 1.3828, + "step": 469000 + }, + { + "epoch": 1.27, + "grad_norm": 1.4674160480499268, + "learning_rate": 1.922268907563025e-06, + "loss": 1.3847, + "step": 469500 + }, + { + "epoch": 1.27, + "grad_norm": 2.370859384536743, + "learning_rate": 1.8907563025210083e-06, + "loss": 1.3816, + "step": 470000 + }, + { + "epoch": 1.27, + "grad_norm": 1.5106278657913208, + "learning_rate": 1.8592436974789918e-06, + "loss": 1.3783, + "step": 470500 + }, + { + "epoch": 1.28, + "grad_norm": 1.5777826309204102, + "learning_rate": 1.8277310924369748e-06, + "loss": 1.3817, + "step": 471000 + }, + { + "epoch": 1.28, + "grad_norm": 1.4805636405944824, + "learning_rate": 1.796218487394958e-06, + "loss": 1.3831, + "step": 471500 + }, + { + "epoch": 1.28, + "grad_norm": 1.5154469013214111, + "learning_rate": 1.7647058823529412e-06, + "loss": 1.383, + "step": 472000 + }, + { + "epoch": 1.28, + "grad_norm": 1.54281747341156, + "learning_rate": 1.7331932773109245e-06, + "loss": 1.3852, + "step": 472500 + }, + { + "epoch": 1.28, + "grad_norm": 1.7247158288955688, + "learning_rate": 1.7016806722689077e-06, + "loss": 1.3819, + "step": 473000 + }, + { + "epoch": 1.28, + "grad_norm": 1.4723429679870605, + "learning_rate": 1.670168067226891e-06, + "loss": 1.38, + "step": 473500 + }, + { + "epoch": 1.28, + "grad_norm": 1.5267595052719116, + "learning_rate": 1.638655462184874e-06, + "loss": 1.3822, + "step": 474000 + }, + { + "epoch": 1.28, + "grad_norm": 1.566758155822754, + "learning_rate": 1.6071428571428572e-06, + "loss": 1.3837, + "step": 474500 + }, + { + "epoch": 1.29, + "grad_norm": 2.029449939727783, + "learning_rate": 1.5756302521008404e-06, + "loss": 1.3853, + "step": 475000 + }, + { + "epoch": 1.29, + "grad_norm": 1.4750381708145142, + "learning_rate": 1.5441176470588234e-06, + "loss": 1.3838, + "step": 475500 + }, + { + "epoch": 1.29, + "grad_norm": 1.5221339464187622, + "learning_rate": 1.5126050420168068e-06, + "loss": 1.3859, + "step": 476000 + }, + { + "epoch": 1.29, + "grad_norm": 1.518754243850708, + "learning_rate": 1.4810924369747898e-06, + "loss": 1.3783, + "step": 476500 + }, + { + "epoch": 1.29, + "grad_norm": 1.4300239086151123, + "learning_rate": 1.4495798319327733e-06, + "loss": 1.3769, + "step": 477000 + }, + { + "epoch": 1.29, + "grad_norm": 1.5566083192825317, + "learning_rate": 1.4180672268907563e-06, + "loss": 1.3788, + "step": 477500 + }, + { + "epoch": 1.29, + "grad_norm": 1.415859580039978, + "learning_rate": 1.3865546218487395e-06, + "loss": 1.385, + "step": 478000 + }, + { + "epoch": 1.3, + "grad_norm": 1.4944028854370117, + "learning_rate": 1.3550420168067228e-06, + "loss": 1.3815, + "step": 478500 + }, + { + "epoch": 1.3, + "grad_norm": 1.4514822959899902, + "learning_rate": 1.323529411764706e-06, + "loss": 1.3827, + "step": 479000 + }, + { + "epoch": 1.3, + "grad_norm": 1.5512882471084595, + "learning_rate": 1.292016806722689e-06, + "loss": 1.384, + "step": 479500 + }, + { + "epoch": 1.3, + "grad_norm": 1.574981689453125, + "learning_rate": 1.2605042016806724e-06, + "loss": 1.382, + "step": 480000 + }, + { + "epoch": 1.3, + "grad_norm": 1.570827603340149, + "learning_rate": 1.2289915966386554e-06, + "loss": 1.382, + "step": 480500 + }, + { + "epoch": 1.3, + "grad_norm": 1.5336010456085205, + "learning_rate": 1.1974789915966387e-06, + "loss": 1.3803, + "step": 481000 + }, + { + "epoch": 1.3, + "grad_norm": 1.4452096223831177, + "learning_rate": 1.165966386554622e-06, + "loss": 1.3804, + "step": 481500 + }, + { + "epoch": 1.3, + "grad_norm": 1.5529412031173706, + "learning_rate": 1.134453781512605e-06, + "loss": 1.3813, + "step": 482000 + }, + { + "epoch": 1.31, + "grad_norm": 1.5553141832351685, + "learning_rate": 1.1029411764705884e-06, + "loss": 1.3822, + "step": 482500 + }, + { + "epoch": 1.31, + "grad_norm": 1.5250602960586548, + "learning_rate": 1.0714285714285714e-06, + "loss": 1.379, + "step": 483000 + }, + { + "epoch": 1.31, + "grad_norm": 1.4803342819213867, + "learning_rate": 1.0399159663865548e-06, + "loss": 1.3846, + "step": 483500 + }, + { + "epoch": 1.31, + "grad_norm": 1.4097282886505127, + "learning_rate": 1.0084033613445378e-06, + "loss": 1.3855, + "step": 484000 + }, + { + "epoch": 1.31, + "grad_norm": 1.535632848739624, + "learning_rate": 9.76890756302521e-07, + "loss": 1.3807, + "step": 484500 + }, + { + "epoch": 1.31, + "grad_norm": 1.5535025596618652, + "learning_rate": 9.453781512605042e-07, + "loss": 1.379, + "step": 485000 + }, + { + "epoch": 1.31, + "grad_norm": 1.5092753171920776, + "learning_rate": 9.138655462184874e-07, + "loss": 1.3777, + "step": 485500 + }, + { + "epoch": 1.32, + "grad_norm": 1.5026346445083618, + "learning_rate": 8.823529411764706e-07, + "loss": 1.3844, + "step": 486000 + }, + { + "epoch": 1.32, + "grad_norm": 2.1724424362182617, + "learning_rate": 8.508403361344538e-07, + "loss": 1.3808, + "step": 486500 + }, + { + "epoch": 1.32, + "grad_norm": 1.5653128623962402, + "learning_rate": 8.19327731092437e-07, + "loss": 1.3826, + "step": 487000 + }, + { + "epoch": 1.32, + "grad_norm": 1.8672337532043457, + "learning_rate": 7.878151260504202e-07, + "loss": 1.3804, + "step": 487500 + }, + { + "epoch": 1.32, + "grad_norm": 1.5125828981399536, + "learning_rate": 7.563025210084034e-07, + "loss": 1.3785, + "step": 488000 + }, + { + "epoch": 1.32, + "grad_norm": 1.5895177125930786, + "learning_rate": 7.247899159663866e-07, + "loss": 1.3806, + "step": 488500 + }, + { + "epoch": 1.32, + "grad_norm": 1.505618929862976, + "learning_rate": 6.932773109243698e-07, + "loss": 1.3822, + "step": 489000 + }, + { + "epoch": 1.33, + "grad_norm": 1.4767976999282837, + "learning_rate": 6.61764705882353e-07, + "loss": 1.3809, + "step": 489500 + }, + { + "epoch": 1.33, + "grad_norm": 1.4713040590286255, + "learning_rate": 6.302521008403362e-07, + "loss": 1.38, + "step": 490000 + }, + { + "epoch": 1.33, + "grad_norm": 1.5712190866470337, + "learning_rate": 5.987394957983193e-07, + "loss": 1.3821, + "step": 490500 + }, + { + "epoch": 1.33, + "grad_norm": 1.520726203918457, + "learning_rate": 5.672268907563025e-07, + "loss": 1.3817, + "step": 491000 + }, + { + "epoch": 1.33, + "grad_norm": 1.4978504180908203, + "learning_rate": 5.357142857142857e-07, + "loss": 1.3825, + "step": 491500 + }, + { + "epoch": 1.33, + "grad_norm": 1.5783872604370117, + "learning_rate": 5.042016806722689e-07, + "loss": 1.3825, + "step": 492000 + }, + { + "epoch": 1.33, + "grad_norm": 1.5126821994781494, + "learning_rate": 4.726890756302521e-07, + "loss": 1.3803, + "step": 492500 + }, + { + "epoch": 1.33, + "grad_norm": 1.4677457809448242, + "learning_rate": 4.411764705882353e-07, + "loss": 1.3804, + "step": 493000 + }, + { + "epoch": 1.34, + "grad_norm": 1.5842092037200928, + "learning_rate": 4.096638655462185e-07, + "loss": 1.3818, + "step": 493500 + }, + { + "epoch": 1.34, + "grad_norm": 1.5152337551116943, + "learning_rate": 3.781512605042017e-07, + "loss": 1.3797, + "step": 494000 + }, + { + "epoch": 1.34, + "grad_norm": 1.5868217945098877, + "learning_rate": 3.466386554621849e-07, + "loss": 1.3829, + "step": 494500 + }, + { + "epoch": 1.34, + "grad_norm": 1.4543733596801758, + "learning_rate": 3.151260504201681e-07, + "loss": 1.3811, + "step": 495000 + }, + { + "epoch": 1.34, + "grad_norm": 1.5251801013946533, + "learning_rate": 2.8361344537815123e-07, + "loss": 1.3793, + "step": 495500 + }, + { + "epoch": 1.34, + "grad_norm": 1.5227956771850586, + "learning_rate": 2.5210084033613445e-07, + "loss": 1.3848, + "step": 496000 + }, + { + "epoch": 1.34, + "grad_norm": 1.506102204322815, + "learning_rate": 2.2058823529411765e-07, + "loss": 1.3789, + "step": 496500 + }, + { + "epoch": 1.35, + "grad_norm": 1.4776455163955688, + "learning_rate": 1.8907563025210085e-07, + "loss": 1.3837, + "step": 497000 + }, + { + "epoch": 1.35, + "grad_norm": 1.5449495315551758, + "learning_rate": 1.5756302521008405e-07, + "loss": 1.3823, + "step": 497500 + }, + { + "epoch": 1.35, + "grad_norm": 1.4903110265731812, + "learning_rate": 1.2605042016806723e-07, + "loss": 1.3816, + "step": 498000 + }, + { + "epoch": 1.35, + "grad_norm": 1.4964358806610107, + "learning_rate": 9.453781512605043e-08, + "loss": 1.3783, + "step": 498500 + }, + { + "epoch": 1.35, + "grad_norm": 1.6141352653503418, + "learning_rate": 6.302521008403361e-08, + "loss": 1.3819, + "step": 499000 + }, + { + "epoch": 1.35, + "grad_norm": 1.5006154775619507, + "learning_rate": 3.151260504201681e-08, + "loss": 1.3771, + "step": 499500 + }, + { + "epoch": 1.35, + "grad_norm": 1.5279935598373413, + "learning_rate": 0.0, + "loss": 1.3805, + "step": 500000 + }, + { + "epoch": 1.35, + "step": 500000, + "total_flos": 2.9824904071075946e+19, + "train_loss": 1.5473345408935546, + "train_runtime": 243315.0329, + "train_samples_per_second": 526.067, + "train_steps_per_second": 2.055 + } + ], + "logging_steps": 500, + "max_steps": 500000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "total_flos": 2.9824904071075946e+19, + "train_batch_size": 256, + "trial_name": null, + "trial_params": null +}