robert_BPE_pubchem10M / trainer_state.json
rifkat's picture
Pubchem 10M madel
0abbfed
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.6479533061976808,
"global_step": 190000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 4.9952000614392135e-05,
"loss": 2.8985,
"step": 500
},
{
"epoch": 0.02,
"learning_rate": 4.9904001228784275e-05,
"loss": 1.3069,
"step": 1000
},
{
"epoch": 0.03,
"learning_rate": 4.985600184317641e-05,
"loss": 1.0065,
"step": 1500
},
{
"epoch": 0.04,
"learning_rate": 4.980800245756855e-05,
"loss": 0.8738,
"step": 2000
},
{
"epoch": 0.05,
"learning_rate": 4.976000307196068e-05,
"loss": 0.79,
"step": 2500
},
{
"epoch": 0.06,
"learning_rate": 4.971200368635281e-05,
"loss": 0.7345,
"step": 3000
},
{
"epoch": 0.07,
"learning_rate": 4.966400430074495e-05,
"loss": 0.6848,
"step": 3500
},
{
"epoch": 0.08,
"learning_rate": 4.9616004915137085e-05,
"loss": 0.6475,
"step": 4000
},
{
"epoch": 0.09,
"learning_rate": 4.9568005529529224e-05,
"loss": 0.6275,
"step": 4500
},
{
"epoch": 0.1,
"learning_rate": 4.9520006143921364e-05,
"loss": 0.5992,
"step": 5000
},
{
"epoch": 0.11,
"learning_rate": 4.9472006758313496e-05,
"loss": 0.5797,
"step": 5500
},
{
"epoch": 0.12,
"learning_rate": 4.9424007372705636e-05,
"loss": 0.5654,
"step": 6000
},
{
"epoch": 0.12,
"learning_rate": 4.937600798709777e-05,
"loss": 0.5496,
"step": 6500
},
{
"epoch": 0.13,
"learning_rate": 4.932800860148991e-05,
"loss": 0.5348,
"step": 7000
},
{
"epoch": 0.14,
"learning_rate": 4.928000921588204e-05,
"loss": 0.5257,
"step": 7500
},
{
"epoch": 0.15,
"learning_rate": 4.9232009830274174e-05,
"loss": 0.5132,
"step": 8000
},
{
"epoch": 0.16,
"learning_rate": 4.918401044466631e-05,
"loss": 0.5042,
"step": 8500
},
{
"epoch": 0.17,
"learning_rate": 4.9136011059058446e-05,
"loss": 0.4923,
"step": 9000
},
{
"epoch": 0.18,
"learning_rate": 4.9088011673450585e-05,
"loss": 0.4898,
"step": 9500
},
{
"epoch": 0.19,
"learning_rate": 4.904001228784272e-05,
"loss": 0.4834,
"step": 10000
},
{
"epoch": 0.2,
"learning_rate": 4.899201290223485e-05,
"loss": 0.4757,
"step": 10500
},
{
"epoch": 0.21,
"learning_rate": 4.894401351662699e-05,
"loss": 0.4744,
"step": 11000
},
{
"epoch": 0.22,
"learning_rate": 4.889601413101912e-05,
"loss": 0.4641,
"step": 11500
},
{
"epoch": 0.23,
"learning_rate": 4.884801474541126e-05,
"loss": 0.4578,
"step": 12000
},
{
"epoch": 0.24,
"learning_rate": 4.8800015359803395e-05,
"loss": 0.4527,
"step": 12500
},
{
"epoch": 0.25,
"learning_rate": 4.875201597419553e-05,
"loss": 0.4507,
"step": 13000
},
{
"epoch": 0.26,
"learning_rate": 4.870401658858767e-05,
"loss": 0.4498,
"step": 13500
},
{
"epoch": 0.27,
"learning_rate": 4.86560172029798e-05,
"loss": 0.4407,
"step": 14000
},
{
"epoch": 0.28,
"learning_rate": 4.860801781737194e-05,
"loss": 0.4354,
"step": 14500
},
{
"epoch": 0.29,
"learning_rate": 4.856001843176408e-05,
"loss": 0.4319,
"step": 15000
},
{
"epoch": 0.3,
"learning_rate": 4.851201904615621e-05,
"loss": 0.4318,
"step": 15500
},
{
"epoch": 0.31,
"learning_rate": 4.846401966054835e-05,
"loss": 0.426,
"step": 16000
},
{
"epoch": 0.32,
"learning_rate": 4.8416020274940484e-05,
"loss": 0.4256,
"step": 16500
},
{
"epoch": 0.33,
"learning_rate": 4.8368020889332624e-05,
"loss": 0.4176,
"step": 17000
},
{
"epoch": 0.34,
"learning_rate": 4.8320021503724757e-05,
"loss": 0.419,
"step": 17500
},
{
"epoch": 0.35,
"learning_rate": 4.827202211811689e-05,
"loss": 0.4162,
"step": 18000
},
{
"epoch": 0.36,
"learning_rate": 4.822402273250903e-05,
"loss": 0.4127,
"step": 18500
},
{
"epoch": 0.36,
"learning_rate": 4.817602334690116e-05,
"loss": 0.408,
"step": 19000
},
{
"epoch": 0.37,
"learning_rate": 4.81280239612933e-05,
"loss": 0.4112,
"step": 19500
},
{
"epoch": 0.38,
"learning_rate": 4.8080024575685434e-05,
"loss": 0.4062,
"step": 20000
},
{
"epoch": 0.39,
"learning_rate": 4.8032025190077567e-05,
"loss": 0.4052,
"step": 20500
},
{
"epoch": 0.4,
"learning_rate": 4.7984025804469706e-05,
"loss": 0.3976,
"step": 21000
},
{
"epoch": 0.41,
"learning_rate": 4.793602641886184e-05,
"loss": 0.396,
"step": 21500
},
{
"epoch": 0.42,
"learning_rate": 4.788802703325398e-05,
"loss": 0.3962,
"step": 22000
},
{
"epoch": 0.43,
"learning_rate": 4.784002764764611e-05,
"loss": 0.395,
"step": 22500
},
{
"epoch": 0.44,
"learning_rate": 4.7792028262038244e-05,
"loss": 0.3942,
"step": 23000
},
{
"epoch": 0.45,
"learning_rate": 4.774402887643038e-05,
"loss": 0.3905,
"step": 23500
},
{
"epoch": 0.46,
"learning_rate": 4.7696029490822516e-05,
"loss": 0.3879,
"step": 24000
},
{
"epoch": 0.47,
"learning_rate": 4.7648030105214656e-05,
"loss": 0.386,
"step": 24500
},
{
"epoch": 0.48,
"learning_rate": 4.7600030719606795e-05,
"loss": 0.3841,
"step": 25000
},
{
"epoch": 0.49,
"learning_rate": 4.755203133399893e-05,
"loss": 0.3825,
"step": 25500
},
{
"epoch": 0.5,
"learning_rate": 4.750403194839107e-05,
"loss": 0.3796,
"step": 26000
},
{
"epoch": 0.51,
"learning_rate": 4.74560325627832e-05,
"loss": 0.3794,
"step": 26500
},
{
"epoch": 0.52,
"learning_rate": 4.740803317717533e-05,
"loss": 0.3779,
"step": 27000
},
{
"epoch": 0.53,
"learning_rate": 4.736003379156747e-05,
"loss": 0.3742,
"step": 27500
},
{
"epoch": 0.54,
"learning_rate": 4.7312034405959605e-05,
"loss": 0.3773,
"step": 28000
},
{
"epoch": 0.55,
"learning_rate": 4.7264035020351745e-05,
"loss": 0.3748,
"step": 28500
},
{
"epoch": 0.56,
"learning_rate": 4.721603563474388e-05,
"loss": 0.3742,
"step": 29000
},
{
"epoch": 0.57,
"learning_rate": 4.716803624913601e-05,
"loss": 0.3713,
"step": 29500
},
{
"epoch": 0.58,
"learning_rate": 4.712003686352815e-05,
"loss": 0.3689,
"step": 30000
},
{
"epoch": 0.59,
"learning_rate": 4.707203747792028e-05,
"loss": 0.3671,
"step": 30500
},
{
"epoch": 0.6,
"learning_rate": 4.702403809231242e-05,
"loss": 0.3667,
"step": 31000
},
{
"epoch": 0.6,
"learning_rate": 4.6976038706704554e-05,
"loss": 0.3692,
"step": 31500
},
{
"epoch": 0.61,
"learning_rate": 4.692803932109669e-05,
"loss": 0.3656,
"step": 32000
},
{
"epoch": 0.62,
"learning_rate": 4.688003993548883e-05,
"loss": 0.363,
"step": 32500
},
{
"epoch": 0.63,
"learning_rate": 4.683204054988096e-05,
"loss": 0.3603,
"step": 33000
},
{
"epoch": 0.64,
"learning_rate": 4.67840411642731e-05,
"loss": 0.3639,
"step": 33500
},
{
"epoch": 0.65,
"learning_rate": 4.673604177866523e-05,
"loss": 0.3613,
"step": 34000
},
{
"epoch": 0.66,
"learning_rate": 4.668804239305737e-05,
"loss": 0.3605,
"step": 34500
},
{
"epoch": 0.67,
"learning_rate": 4.664004300744951e-05,
"loss": 0.3561,
"step": 35000
},
{
"epoch": 0.68,
"learning_rate": 4.6592043621841643e-05,
"loss": 0.3601,
"step": 35500
},
{
"epoch": 0.69,
"learning_rate": 4.654404423623378e-05,
"loss": 0.3591,
"step": 36000
},
{
"epoch": 0.7,
"learning_rate": 4.6496044850625916e-05,
"loss": 0.3537,
"step": 36500
},
{
"epoch": 0.71,
"learning_rate": 4.644804546501805e-05,
"loss": 0.3565,
"step": 37000
},
{
"epoch": 0.72,
"learning_rate": 4.640004607941019e-05,
"loss": 0.3515,
"step": 37500
},
{
"epoch": 0.73,
"learning_rate": 4.635204669380232e-05,
"loss": 0.3524,
"step": 38000
},
{
"epoch": 0.74,
"learning_rate": 4.630404730819446e-05,
"loss": 0.3488,
"step": 38500
},
{
"epoch": 0.75,
"learning_rate": 4.625604792258659e-05,
"loss": 0.3506,
"step": 39000
},
{
"epoch": 0.76,
"learning_rate": 4.6208048536978726e-05,
"loss": 0.3509,
"step": 39500
},
{
"epoch": 0.77,
"learning_rate": 4.6160049151370865e-05,
"loss": 0.3485,
"step": 40000
},
{
"epoch": 0.78,
"learning_rate": 4.6112049765763e-05,
"loss": 0.3475,
"step": 40500
},
{
"epoch": 0.79,
"learning_rate": 4.606405038015514e-05,
"loss": 0.3473,
"step": 41000
},
{
"epoch": 0.8,
"learning_rate": 4.601605099454727e-05,
"loss": 0.3495,
"step": 41500
},
{
"epoch": 0.81,
"learning_rate": 4.59680516089394e-05,
"loss": 0.345,
"step": 42000
},
{
"epoch": 0.82,
"learning_rate": 4.592005222333154e-05,
"loss": 0.3432,
"step": 42500
},
{
"epoch": 0.83,
"learning_rate": 4.5872052837723675e-05,
"loss": 0.341,
"step": 43000
},
{
"epoch": 0.84,
"learning_rate": 4.5824053452115815e-05,
"loss": 0.3449,
"step": 43500
},
{
"epoch": 0.84,
"learning_rate": 4.577605406650795e-05,
"loss": 0.3425,
"step": 44000
},
{
"epoch": 0.85,
"learning_rate": 4.572805468090009e-05,
"loss": 0.3418,
"step": 44500
},
{
"epoch": 0.86,
"learning_rate": 4.5680055295292226e-05,
"loss": 0.3396,
"step": 45000
},
{
"epoch": 0.87,
"learning_rate": 4.563205590968436e-05,
"loss": 0.343,
"step": 45500
},
{
"epoch": 0.88,
"learning_rate": 4.55840565240765e-05,
"loss": 0.3408,
"step": 46000
},
{
"epoch": 0.89,
"learning_rate": 4.553605713846863e-05,
"loss": 0.3396,
"step": 46500
},
{
"epoch": 0.9,
"learning_rate": 4.5488057752860764e-05,
"loss": 0.3372,
"step": 47000
},
{
"epoch": 0.91,
"learning_rate": 4.5440058367252904e-05,
"loss": 0.3387,
"step": 47500
},
{
"epoch": 0.92,
"learning_rate": 4.5392058981645036e-05,
"loss": 0.3379,
"step": 48000
},
{
"epoch": 0.93,
"learning_rate": 4.5344059596037176e-05,
"loss": 0.333,
"step": 48500
},
{
"epoch": 0.94,
"learning_rate": 4.529606021042931e-05,
"loss": 0.3372,
"step": 49000
},
{
"epoch": 0.95,
"learning_rate": 4.524806082482144e-05,
"loss": 0.3333,
"step": 49500
},
{
"epoch": 0.96,
"learning_rate": 4.520006143921358e-05,
"loss": 0.3348,
"step": 50000
},
{
"epoch": 0.97,
"learning_rate": 4.5152062053605714e-05,
"loss": 0.3321,
"step": 50500
},
{
"epoch": 0.98,
"learning_rate": 4.510406266799785e-05,
"loss": 0.3342,
"step": 51000
},
{
"epoch": 0.99,
"learning_rate": 4.5056063282389986e-05,
"loss": 0.3334,
"step": 51500
},
{
"epoch": 1.0,
"learning_rate": 4.500806389678212e-05,
"loss": 0.3343,
"step": 52000
},
{
"epoch": 1.01,
"learning_rate": 4.496006451117426e-05,
"loss": 0.3306,
"step": 52500
},
{
"epoch": 1.02,
"learning_rate": 4.491206512556639e-05,
"loss": 0.3316,
"step": 53000
},
{
"epoch": 1.03,
"learning_rate": 4.486406573995853e-05,
"loss": 0.3289,
"step": 53500
},
{
"epoch": 1.04,
"learning_rate": 4.481606635435066e-05,
"loss": 0.3277,
"step": 54000
},
{
"epoch": 1.05,
"learning_rate": 4.47680669687428e-05,
"loss": 0.3288,
"step": 54500
},
{
"epoch": 1.06,
"learning_rate": 4.472006758313494e-05,
"loss": 0.3279,
"step": 55000
},
{
"epoch": 1.07,
"learning_rate": 4.4672068197527075e-05,
"loss": 0.3273,
"step": 55500
},
{
"epoch": 1.08,
"learning_rate": 4.4624068811919214e-05,
"loss": 0.3288,
"step": 56000
},
{
"epoch": 1.08,
"learning_rate": 4.457606942631135e-05,
"loss": 0.3257,
"step": 56500
},
{
"epoch": 1.09,
"learning_rate": 4.452807004070348e-05,
"loss": 0.3273,
"step": 57000
},
{
"epoch": 1.1,
"learning_rate": 4.448007065509562e-05,
"loss": 0.3269,
"step": 57500
},
{
"epoch": 1.11,
"learning_rate": 4.443207126948775e-05,
"loss": 0.3246,
"step": 58000
},
{
"epoch": 1.12,
"learning_rate": 4.438407188387989e-05,
"loss": 0.3232,
"step": 58500
},
{
"epoch": 1.13,
"learning_rate": 4.4336072498272024e-05,
"loss": 0.3248,
"step": 59000
},
{
"epoch": 1.14,
"learning_rate": 4.428807311266416e-05,
"loss": 0.3234,
"step": 59500
},
{
"epoch": 1.15,
"learning_rate": 4.4240073727056297e-05,
"loss": 0.3235,
"step": 60000
},
{
"epoch": 1.16,
"learning_rate": 4.419207434144843e-05,
"loss": 0.3232,
"step": 60500
},
{
"epoch": 1.17,
"learning_rate": 4.414407495584057e-05,
"loss": 0.3236,
"step": 61000
},
{
"epoch": 1.18,
"learning_rate": 4.40960755702327e-05,
"loss": 0.3218,
"step": 61500
},
{
"epoch": 1.19,
"learning_rate": 4.4048076184624834e-05,
"loss": 0.3213,
"step": 62000
},
{
"epoch": 1.2,
"learning_rate": 4.4000076799016974e-05,
"loss": 0.3185,
"step": 62500
},
{
"epoch": 1.21,
"learning_rate": 4.3952077413409107e-05,
"loss": 0.3204,
"step": 63000
},
{
"epoch": 1.22,
"learning_rate": 4.3904078027801246e-05,
"loss": 0.3204,
"step": 63500
},
{
"epoch": 1.23,
"learning_rate": 4.3856078642193386e-05,
"loss": 0.319,
"step": 64000
},
{
"epoch": 1.24,
"learning_rate": 4.380807925658552e-05,
"loss": 0.3193,
"step": 64500
},
{
"epoch": 1.25,
"learning_rate": 4.376007987097766e-05,
"loss": 0.3193,
"step": 65000
},
{
"epoch": 1.26,
"learning_rate": 4.371208048536979e-05,
"loss": 0.3183,
"step": 65500
},
{
"epoch": 1.27,
"learning_rate": 4.366408109976193e-05,
"loss": 0.3175,
"step": 66000
},
{
"epoch": 1.28,
"learning_rate": 4.361608171415406e-05,
"loss": 0.3154,
"step": 66500
},
{
"epoch": 1.29,
"learning_rate": 4.3568082328546196e-05,
"loss": 0.3154,
"step": 67000
},
{
"epoch": 1.3,
"learning_rate": 4.3520082942938335e-05,
"loss": 0.3164,
"step": 67500
},
{
"epoch": 1.31,
"learning_rate": 4.347208355733047e-05,
"loss": 0.3136,
"step": 68000
},
{
"epoch": 1.32,
"learning_rate": 4.342408417172261e-05,
"loss": 0.3172,
"step": 68500
},
{
"epoch": 1.32,
"learning_rate": 4.337608478611474e-05,
"loss": 0.3154,
"step": 69000
},
{
"epoch": 1.33,
"learning_rate": 4.332808540050687e-05,
"loss": 0.3145,
"step": 69500
},
{
"epoch": 1.34,
"learning_rate": 4.328008601489901e-05,
"loss": 0.3166,
"step": 70000
},
{
"epoch": 1.35,
"learning_rate": 4.3232086629291145e-05,
"loss": 0.3156,
"step": 70500
},
{
"epoch": 1.36,
"learning_rate": 4.3184087243683285e-05,
"loss": 0.3138,
"step": 71000
},
{
"epoch": 1.37,
"learning_rate": 4.313608785807542e-05,
"loss": 0.3138,
"step": 71500
},
{
"epoch": 1.38,
"learning_rate": 4.308808847246755e-05,
"loss": 0.3139,
"step": 72000
},
{
"epoch": 1.39,
"learning_rate": 4.304008908685969e-05,
"loss": 0.3097,
"step": 72500
},
{
"epoch": 1.4,
"learning_rate": 4.299208970125182e-05,
"loss": 0.3126,
"step": 73000
},
{
"epoch": 1.41,
"learning_rate": 4.294409031564396e-05,
"loss": 0.312,
"step": 73500
},
{
"epoch": 1.42,
"learning_rate": 4.28960909300361e-05,
"loss": 0.3109,
"step": 74000
},
{
"epoch": 1.43,
"learning_rate": 4.2848091544428234e-05,
"loss": 0.3093,
"step": 74500
},
{
"epoch": 1.44,
"learning_rate": 4.2800092158820374e-05,
"loss": 0.3122,
"step": 75000
},
{
"epoch": 1.45,
"learning_rate": 4.2752092773212506e-05,
"loss": 0.3118,
"step": 75500
},
{
"epoch": 1.46,
"learning_rate": 4.2704093387604646e-05,
"loss": 0.3106,
"step": 76000
},
{
"epoch": 1.47,
"learning_rate": 4.265609400199678e-05,
"loss": 0.3098,
"step": 76500
},
{
"epoch": 1.48,
"learning_rate": 4.260809461638891e-05,
"loss": 0.3096,
"step": 77000
},
{
"epoch": 1.49,
"learning_rate": 4.256009523078105e-05,
"loss": 0.3054,
"step": 77500
},
{
"epoch": 1.5,
"learning_rate": 4.2512095845173184e-05,
"loss": 0.3077,
"step": 78000
},
{
"epoch": 1.51,
"learning_rate": 4.246409645956532e-05,
"loss": 0.308,
"step": 78500
},
{
"epoch": 1.52,
"learning_rate": 4.2416097073957456e-05,
"loss": 0.3059,
"step": 79000
},
{
"epoch": 1.53,
"learning_rate": 4.236809768834959e-05,
"loss": 0.3057,
"step": 79500
},
{
"epoch": 1.54,
"learning_rate": 4.232009830274173e-05,
"loss": 0.3083,
"step": 80000
},
{
"epoch": 1.55,
"learning_rate": 4.227209891713386e-05,
"loss": 0.3089,
"step": 80500
},
{
"epoch": 1.56,
"learning_rate": 4.2224099531525993e-05,
"loss": 0.3069,
"step": 81000
},
{
"epoch": 1.56,
"learning_rate": 4.217610014591813e-05,
"loss": 0.3052,
"step": 81500
},
{
"epoch": 1.57,
"learning_rate": 4.2128100760310266e-05,
"loss": 0.3073,
"step": 82000
},
{
"epoch": 1.58,
"learning_rate": 4.2080101374702405e-05,
"loss": 0.3044,
"step": 82500
},
{
"epoch": 1.59,
"learning_rate": 4.203210198909454e-05,
"loss": 0.3058,
"step": 83000
},
{
"epoch": 1.6,
"learning_rate": 4.198410260348668e-05,
"loss": 0.3072,
"step": 83500
},
{
"epoch": 1.61,
"learning_rate": 4.193610321787882e-05,
"loss": 0.3038,
"step": 84000
},
{
"epoch": 1.62,
"learning_rate": 4.188810383227095e-05,
"loss": 0.3028,
"step": 84500
},
{
"epoch": 1.63,
"learning_rate": 4.184010444666309e-05,
"loss": 0.3038,
"step": 85000
},
{
"epoch": 1.64,
"learning_rate": 4.179210506105522e-05,
"loss": 0.3041,
"step": 85500
},
{
"epoch": 1.65,
"learning_rate": 4.1744105675447355e-05,
"loss": 0.3039,
"step": 86000
},
{
"epoch": 1.66,
"learning_rate": 4.1696106289839494e-05,
"loss": 0.2987,
"step": 86500
},
{
"epoch": 1.67,
"learning_rate": 4.164810690423163e-05,
"loss": 0.3023,
"step": 87000
},
{
"epoch": 1.68,
"learning_rate": 4.1600107518623766e-05,
"loss": 0.3011,
"step": 87500
},
{
"epoch": 1.69,
"learning_rate": 4.15521081330159e-05,
"loss": 0.3021,
"step": 88000
},
{
"epoch": 1.7,
"learning_rate": 4.150410874740803e-05,
"loss": 0.3022,
"step": 88500
},
{
"epoch": 1.71,
"learning_rate": 4.145610936180017e-05,
"loss": 0.3014,
"step": 89000
},
{
"epoch": 1.72,
"learning_rate": 4.1408109976192304e-05,
"loss": 0.3029,
"step": 89500
},
{
"epoch": 1.73,
"learning_rate": 4.1360110590584444e-05,
"loss": 0.3003,
"step": 90000
},
{
"epoch": 1.74,
"learning_rate": 4.1312111204976576e-05,
"loss": 0.2989,
"step": 90500
},
{
"epoch": 1.75,
"learning_rate": 4.126411181936871e-05,
"loss": 0.3008,
"step": 91000
},
{
"epoch": 1.76,
"learning_rate": 4.121611243376085e-05,
"loss": 0.3007,
"step": 91500
},
{
"epoch": 1.77,
"learning_rate": 4.116811304815298e-05,
"loss": 0.3003,
"step": 92000
},
{
"epoch": 1.78,
"learning_rate": 4.112011366254512e-05,
"loss": 0.3015,
"step": 92500
},
{
"epoch": 1.79,
"learning_rate": 4.1072114276937254e-05,
"loss": 0.2995,
"step": 93000
},
{
"epoch": 1.8,
"learning_rate": 4.102411489132939e-05,
"loss": 0.2946,
"step": 93500
},
{
"epoch": 1.8,
"learning_rate": 4.097611550572153e-05,
"loss": 0.2982,
"step": 94000
},
{
"epoch": 1.81,
"learning_rate": 4.0928116120113665e-05,
"loss": 0.2993,
"step": 94500
},
{
"epoch": 1.82,
"learning_rate": 4.0880116734505805e-05,
"loss": 0.2957,
"step": 95000
},
{
"epoch": 1.83,
"learning_rate": 4.083211734889794e-05,
"loss": 0.3004,
"step": 95500
},
{
"epoch": 1.84,
"learning_rate": 4.078411796329007e-05,
"loss": 0.2954,
"step": 96000
},
{
"epoch": 1.85,
"learning_rate": 4.073611857768221e-05,
"loss": 0.298,
"step": 96500
},
{
"epoch": 1.86,
"learning_rate": 4.068811919207434e-05,
"loss": 0.2967,
"step": 97000
},
{
"epoch": 1.87,
"learning_rate": 4.064011980646648e-05,
"loss": 0.2974,
"step": 97500
},
{
"epoch": 1.88,
"learning_rate": 4.0592120420858615e-05,
"loss": 0.2968,
"step": 98000
},
{
"epoch": 1.89,
"learning_rate": 4.054412103525075e-05,
"loss": 0.297,
"step": 98500
},
{
"epoch": 1.9,
"learning_rate": 4.049612164964289e-05,
"loss": 0.2933,
"step": 99000
},
{
"epoch": 1.91,
"learning_rate": 4.044812226403502e-05,
"loss": 0.2935,
"step": 99500
},
{
"epoch": 1.92,
"learning_rate": 4.040012287842716e-05,
"loss": 0.297,
"step": 100000
},
{
"epoch": 1.93,
"learning_rate": 4.035212349281929e-05,
"loss": 0.2956,
"step": 100500
},
{
"epoch": 1.94,
"learning_rate": 4.0304124107211425e-05,
"loss": 0.2947,
"step": 101000
},
{
"epoch": 1.95,
"learning_rate": 4.0256124721603564e-05,
"loss": 0.2954,
"step": 101500
},
{
"epoch": 1.96,
"learning_rate": 4.02081253359957e-05,
"loss": 0.2914,
"step": 102000
},
{
"epoch": 1.97,
"learning_rate": 4.016012595038784e-05,
"loss": 0.2936,
"step": 102500
},
{
"epoch": 1.98,
"learning_rate": 4.011212656477997e-05,
"loss": 0.2943,
"step": 103000
},
{
"epoch": 1.99,
"learning_rate": 4.006412717917211e-05,
"loss": 0.2929,
"step": 103500
},
{
"epoch": 2.0,
"learning_rate": 4.001612779356425e-05,
"loss": 0.2946,
"step": 104000
},
{
"epoch": 2.01,
"learning_rate": 3.996812840795638e-05,
"loss": 0.2911,
"step": 104500
},
{
"epoch": 2.02,
"learning_rate": 3.992012902234852e-05,
"loss": 0.2931,
"step": 105000
},
{
"epoch": 2.03,
"learning_rate": 3.9872129636740653e-05,
"loss": 0.2916,
"step": 105500
},
{
"epoch": 2.04,
"learning_rate": 3.9824130251132786e-05,
"loss": 0.2927,
"step": 106000
},
{
"epoch": 2.04,
"learning_rate": 3.9776130865524926e-05,
"loss": 0.2888,
"step": 106500
},
{
"epoch": 2.05,
"learning_rate": 3.972813147991706e-05,
"loss": 0.2887,
"step": 107000
},
{
"epoch": 2.06,
"learning_rate": 3.96801320943092e-05,
"loss": 0.2892,
"step": 107500
},
{
"epoch": 2.07,
"learning_rate": 3.963213270870133e-05,
"loss": 0.2909,
"step": 108000
},
{
"epoch": 2.08,
"learning_rate": 3.958413332309346e-05,
"loss": 0.2908,
"step": 108500
},
{
"epoch": 2.09,
"learning_rate": 3.95361339374856e-05,
"loss": 0.2913,
"step": 109000
},
{
"epoch": 2.1,
"learning_rate": 3.9488134551877736e-05,
"loss": 0.2895,
"step": 109500
},
{
"epoch": 2.11,
"learning_rate": 3.9440135166269875e-05,
"loss": 0.2897,
"step": 110000
},
{
"epoch": 2.12,
"learning_rate": 3.939213578066201e-05,
"loss": 0.2895,
"step": 110500
},
{
"epoch": 2.13,
"learning_rate": 3.934413639505414e-05,
"loss": 0.2902,
"step": 111000
},
{
"epoch": 2.14,
"learning_rate": 3.929613700944628e-05,
"loss": 0.2894,
"step": 111500
},
{
"epoch": 2.15,
"learning_rate": 3.924813762383841e-05,
"loss": 0.289,
"step": 112000
},
{
"epoch": 2.16,
"learning_rate": 3.920013823823055e-05,
"loss": 0.2871,
"step": 112500
},
{
"epoch": 2.17,
"learning_rate": 3.9152138852622685e-05,
"loss": 0.287,
"step": 113000
},
{
"epoch": 2.18,
"learning_rate": 3.9104139467014825e-05,
"loss": 0.2886,
"step": 113500
},
{
"epoch": 2.19,
"learning_rate": 3.9056140081406964e-05,
"loss": 0.2862,
"step": 114000
},
{
"epoch": 2.2,
"learning_rate": 3.90081406957991e-05,
"loss": 0.2866,
"step": 114500
},
{
"epoch": 2.21,
"learning_rate": 3.8960141310191236e-05,
"loss": 0.2863,
"step": 115000
},
{
"epoch": 2.22,
"learning_rate": 3.891214192458337e-05,
"loss": 0.289,
"step": 115500
},
{
"epoch": 2.23,
"learning_rate": 3.88641425389755e-05,
"loss": 0.2866,
"step": 116000
},
{
"epoch": 2.24,
"learning_rate": 3.881614315336764e-05,
"loss": 0.2867,
"step": 116500
},
{
"epoch": 2.25,
"learning_rate": 3.8768143767759774e-05,
"loss": 0.289,
"step": 117000
},
{
"epoch": 2.26,
"learning_rate": 3.8720144382151914e-05,
"loss": 0.2859,
"step": 117500
},
{
"epoch": 2.27,
"learning_rate": 3.8672144996544046e-05,
"loss": 0.2846,
"step": 118000
},
{
"epoch": 2.28,
"learning_rate": 3.862414561093618e-05,
"loss": 0.2873,
"step": 118500
},
{
"epoch": 2.28,
"learning_rate": 3.857614622532832e-05,
"loss": 0.2877,
"step": 119000
},
{
"epoch": 2.29,
"learning_rate": 3.852814683972045e-05,
"loss": 0.2875,
"step": 119500
},
{
"epoch": 2.3,
"learning_rate": 3.848014745411259e-05,
"loss": 0.2838,
"step": 120000
},
{
"epoch": 2.31,
"learning_rate": 3.8432148068504724e-05,
"loss": 0.2835,
"step": 120500
},
{
"epoch": 2.32,
"learning_rate": 3.8384148682896856e-05,
"loss": 0.2865,
"step": 121000
},
{
"epoch": 2.33,
"learning_rate": 3.8336149297288996e-05,
"loss": 0.2835,
"step": 121500
},
{
"epoch": 2.34,
"learning_rate": 3.828814991168113e-05,
"loss": 0.2863,
"step": 122000
},
{
"epoch": 2.35,
"learning_rate": 3.824015052607327e-05,
"loss": 0.2867,
"step": 122500
},
{
"epoch": 2.36,
"learning_rate": 3.819215114046541e-05,
"loss": 0.2845,
"step": 123000
},
{
"epoch": 2.37,
"learning_rate": 3.814415175485754e-05,
"loss": 0.2851,
"step": 123500
},
{
"epoch": 2.38,
"learning_rate": 3.809615236924968e-05,
"loss": 0.2826,
"step": 124000
},
{
"epoch": 2.39,
"learning_rate": 3.804815298364181e-05,
"loss": 0.2842,
"step": 124500
},
{
"epoch": 2.4,
"learning_rate": 3.800015359803395e-05,
"loss": 0.2843,
"step": 125000
},
{
"epoch": 2.41,
"learning_rate": 3.7952154212426085e-05,
"loss": 0.2828,
"step": 125500
},
{
"epoch": 2.42,
"learning_rate": 3.790415482681822e-05,
"loss": 0.2832,
"step": 126000
},
{
"epoch": 2.43,
"learning_rate": 3.785615544121036e-05,
"loss": 0.2842,
"step": 126500
},
{
"epoch": 2.44,
"learning_rate": 3.780815605560249e-05,
"loss": 0.2821,
"step": 127000
},
{
"epoch": 2.45,
"learning_rate": 3.776015666999463e-05,
"loss": 0.2818,
"step": 127500
},
{
"epoch": 2.46,
"learning_rate": 3.771215728438676e-05,
"loss": 0.2824,
"step": 128000
},
{
"epoch": 2.47,
"learning_rate": 3.7664157898778895e-05,
"loss": 0.2812,
"step": 128500
},
{
"epoch": 2.48,
"learning_rate": 3.7616158513171034e-05,
"loss": 0.2836,
"step": 129000
},
{
"epoch": 2.49,
"learning_rate": 3.756815912756317e-05,
"loss": 0.2776,
"step": 129500
},
{
"epoch": 2.5,
"learning_rate": 3.7520159741955307e-05,
"loss": 0.2836,
"step": 130000
},
{
"epoch": 2.51,
"learning_rate": 3.747216035634744e-05,
"loss": 0.2813,
"step": 130500
},
{
"epoch": 2.52,
"learning_rate": 3.742416097073957e-05,
"loss": 0.2803,
"step": 131000
},
{
"epoch": 2.52,
"learning_rate": 3.737616158513171e-05,
"loss": 0.2819,
"step": 131500
},
{
"epoch": 2.53,
"learning_rate": 3.7328162199523844e-05,
"loss": 0.2785,
"step": 132000
},
{
"epoch": 2.54,
"learning_rate": 3.7280162813915984e-05,
"loss": 0.2821,
"step": 132500
},
{
"epoch": 2.55,
"learning_rate": 3.723216342830812e-05,
"loss": 0.2815,
"step": 133000
},
{
"epoch": 2.56,
"learning_rate": 3.7184164042700256e-05,
"loss": 0.2794,
"step": 133500
},
{
"epoch": 2.57,
"learning_rate": 3.7136164657092396e-05,
"loss": 0.281,
"step": 134000
},
{
"epoch": 2.58,
"learning_rate": 3.708816527148453e-05,
"loss": 0.2801,
"step": 134500
},
{
"epoch": 2.59,
"learning_rate": 3.704016588587666e-05,
"loss": 0.282,
"step": 135000
},
{
"epoch": 2.6,
"learning_rate": 3.69921665002688e-05,
"loss": 0.2813,
"step": 135500
},
{
"epoch": 2.61,
"learning_rate": 3.694416711466093e-05,
"loss": 0.2779,
"step": 136000
},
{
"epoch": 2.62,
"learning_rate": 3.689616772905307e-05,
"loss": 0.2788,
"step": 136500
},
{
"epoch": 2.63,
"learning_rate": 3.6848168343445205e-05,
"loss": 0.2828,
"step": 137000
},
{
"epoch": 2.64,
"learning_rate": 3.680016895783734e-05,
"loss": 0.279,
"step": 137500
},
{
"epoch": 2.65,
"learning_rate": 3.675216957222948e-05,
"loss": 0.2767,
"step": 138000
},
{
"epoch": 2.66,
"learning_rate": 3.670417018662161e-05,
"loss": 0.2773,
"step": 138500
},
{
"epoch": 2.67,
"learning_rate": 3.665617080101375e-05,
"loss": 0.2789,
"step": 139000
},
{
"epoch": 2.68,
"learning_rate": 3.660817141540588e-05,
"loss": 0.281,
"step": 139500
},
{
"epoch": 2.69,
"learning_rate": 3.6560172029798015e-05,
"loss": 0.2795,
"step": 140000
},
{
"epoch": 2.7,
"learning_rate": 3.6512172644190155e-05,
"loss": 0.2772,
"step": 140500
},
{
"epoch": 2.71,
"learning_rate": 3.646417325858229e-05,
"loss": 0.2779,
"step": 141000
},
{
"epoch": 2.72,
"learning_rate": 3.641617387297443e-05,
"loss": 0.2818,
"step": 141500
},
{
"epoch": 2.73,
"learning_rate": 3.636817448736656e-05,
"loss": 0.2807,
"step": 142000
},
{
"epoch": 2.74,
"learning_rate": 3.63201751017587e-05,
"loss": 0.2786,
"step": 142500
},
{
"epoch": 2.75,
"learning_rate": 3.627217571615084e-05,
"loss": 0.2776,
"step": 143000
},
{
"epoch": 2.76,
"learning_rate": 3.622417633054297e-05,
"loss": 0.2782,
"step": 143500
},
{
"epoch": 2.76,
"learning_rate": 3.617617694493511e-05,
"loss": 0.2767,
"step": 144000
},
{
"epoch": 2.77,
"learning_rate": 3.6128177559327244e-05,
"loss": 0.2773,
"step": 144500
},
{
"epoch": 2.78,
"learning_rate": 3.608017817371938e-05,
"loss": 0.2798,
"step": 145000
},
{
"epoch": 2.79,
"learning_rate": 3.6032178788111516e-05,
"loss": 0.2789,
"step": 145500
},
{
"epoch": 2.8,
"learning_rate": 3.598417940250365e-05,
"loss": 0.2725,
"step": 146000
},
{
"epoch": 2.81,
"learning_rate": 3.593618001689579e-05,
"loss": 0.2765,
"step": 146500
},
{
"epoch": 2.82,
"learning_rate": 3.588818063128792e-05,
"loss": 0.2764,
"step": 147000
},
{
"epoch": 2.83,
"learning_rate": 3.5840181245680054e-05,
"loss": 0.2748,
"step": 147500
},
{
"epoch": 2.84,
"learning_rate": 3.5792181860072193e-05,
"loss": 0.2764,
"step": 148000
},
{
"epoch": 2.85,
"learning_rate": 3.5744182474464326e-05,
"loss": 0.2753,
"step": 148500
},
{
"epoch": 2.86,
"learning_rate": 3.5696183088856466e-05,
"loss": 0.278,
"step": 149000
},
{
"epoch": 2.87,
"learning_rate": 3.56481837032486e-05,
"loss": 0.2757,
"step": 149500
},
{
"epoch": 2.88,
"learning_rate": 3.560018431764073e-05,
"loss": 0.277,
"step": 150000
},
{
"epoch": 2.89,
"learning_rate": 3.555218493203287e-05,
"loss": 0.2774,
"step": 150500
},
{
"epoch": 2.9,
"learning_rate": 3.5504185546425003e-05,
"loss": 0.2765,
"step": 151000
},
{
"epoch": 2.91,
"learning_rate": 3.545618616081714e-05,
"loss": 0.2783,
"step": 151500
},
{
"epoch": 2.92,
"learning_rate": 3.5408186775209276e-05,
"loss": 0.2764,
"step": 152000
},
{
"epoch": 2.93,
"learning_rate": 3.5360187389601415e-05,
"loss": 0.2756,
"step": 152500
},
{
"epoch": 2.94,
"learning_rate": 3.5312188003993555e-05,
"loss": 0.2783,
"step": 153000
},
{
"epoch": 2.95,
"learning_rate": 3.526418861838569e-05,
"loss": 0.2748,
"step": 153500
},
{
"epoch": 2.96,
"learning_rate": 3.521618923277783e-05,
"loss": 0.2741,
"step": 154000
},
{
"epoch": 2.97,
"learning_rate": 3.516818984716996e-05,
"loss": 0.273,
"step": 154500
},
{
"epoch": 2.98,
"learning_rate": 3.512019046156209e-05,
"loss": 0.2719,
"step": 155000
},
{
"epoch": 2.99,
"learning_rate": 3.507219107595423e-05,
"loss": 0.2726,
"step": 155500
},
{
"epoch": 3.0,
"learning_rate": 3.5024191690346365e-05,
"loss": 0.2718,
"step": 156000
},
{
"epoch": 3.0,
"learning_rate": 3.4976192304738504e-05,
"loss": 0.2729,
"step": 156500
},
{
"epoch": 3.01,
"learning_rate": 3.492819291913064e-05,
"loss": 0.2712,
"step": 157000
},
{
"epoch": 3.02,
"learning_rate": 3.488019353352277e-05,
"loss": 0.2742,
"step": 157500
},
{
"epoch": 3.03,
"learning_rate": 3.483219414791491e-05,
"loss": 0.2721,
"step": 158000
},
{
"epoch": 3.04,
"learning_rate": 3.478419476230704e-05,
"loss": 0.2731,
"step": 158500
},
{
"epoch": 3.05,
"learning_rate": 3.473619537669918e-05,
"loss": 0.2718,
"step": 159000
},
{
"epoch": 3.06,
"learning_rate": 3.4688195991091314e-05,
"loss": 0.2762,
"step": 159500
},
{
"epoch": 3.07,
"learning_rate": 3.464019660548345e-05,
"loss": 0.2748,
"step": 160000
},
{
"epoch": 3.08,
"learning_rate": 3.4592197219875586e-05,
"loss": 0.2726,
"step": 160500
},
{
"epoch": 3.09,
"learning_rate": 3.454419783426772e-05,
"loss": 0.2729,
"step": 161000
},
{
"epoch": 3.1,
"learning_rate": 3.449619844865986e-05,
"loss": 0.2754,
"step": 161500
},
{
"epoch": 3.11,
"learning_rate": 3.444819906305199e-05,
"loss": 0.2734,
"step": 162000
},
{
"epoch": 3.12,
"learning_rate": 3.440019967744413e-05,
"loss": 0.273,
"step": 162500
},
{
"epoch": 3.13,
"learning_rate": 3.435220029183627e-05,
"loss": 0.2708,
"step": 163000
},
{
"epoch": 3.14,
"learning_rate": 3.43042009062284e-05,
"loss": 0.2713,
"step": 163500
},
{
"epoch": 3.15,
"learning_rate": 3.425620152062054e-05,
"loss": 0.2714,
"step": 164000
},
{
"epoch": 3.16,
"learning_rate": 3.4208202135012675e-05,
"loss": 0.2731,
"step": 164500
},
{
"epoch": 3.17,
"learning_rate": 3.416020274940481e-05,
"loss": 0.2696,
"step": 165000
},
{
"epoch": 3.18,
"learning_rate": 3.411220336379695e-05,
"loss": 0.2721,
"step": 165500
},
{
"epoch": 3.19,
"learning_rate": 3.406420397818908e-05,
"loss": 0.2731,
"step": 166000
},
{
"epoch": 3.2,
"learning_rate": 3.401620459258122e-05,
"loss": 0.2694,
"step": 166500
},
{
"epoch": 3.21,
"learning_rate": 3.396820520697335e-05,
"loss": 0.2717,
"step": 167000
},
{
"epoch": 3.22,
"learning_rate": 3.3920205821365485e-05,
"loss": 0.2703,
"step": 167500
},
{
"epoch": 3.23,
"learning_rate": 3.3872206435757625e-05,
"loss": 0.2707,
"step": 168000
},
{
"epoch": 3.24,
"learning_rate": 3.382420705014976e-05,
"loss": 0.2709,
"step": 168500
},
{
"epoch": 3.24,
"learning_rate": 3.37762076645419e-05,
"loss": 0.2702,
"step": 169000
},
{
"epoch": 3.25,
"learning_rate": 3.372820827893403e-05,
"loss": 0.2691,
"step": 169500
},
{
"epoch": 3.26,
"learning_rate": 3.368020889332616e-05,
"loss": 0.2693,
"step": 170000
},
{
"epoch": 3.27,
"learning_rate": 3.36322095077183e-05,
"loss": 0.2704,
"step": 170500
},
{
"epoch": 3.28,
"learning_rate": 3.3584210122110435e-05,
"loss": 0.2726,
"step": 171000
},
{
"epoch": 3.29,
"learning_rate": 3.3536210736502574e-05,
"loss": 0.2679,
"step": 171500
},
{
"epoch": 3.3,
"learning_rate": 3.348821135089471e-05,
"loss": 0.2683,
"step": 172000
},
{
"epoch": 3.31,
"learning_rate": 3.3440211965286847e-05,
"loss": 0.2699,
"step": 172500
},
{
"epoch": 3.32,
"learning_rate": 3.3392212579678986e-05,
"loss": 0.2686,
"step": 173000
},
{
"epoch": 3.33,
"learning_rate": 3.334421319407112e-05,
"loss": 0.2695,
"step": 173500
},
{
"epoch": 3.34,
"learning_rate": 3.329621380846326e-05,
"loss": 0.2686,
"step": 174000
},
{
"epoch": 3.35,
"learning_rate": 3.324821442285539e-05,
"loss": 0.2673,
"step": 174500
},
{
"epoch": 3.36,
"learning_rate": 3.3200215037247524e-05,
"loss": 0.2682,
"step": 175000
},
{
"epoch": 3.37,
"learning_rate": 3.315221565163966e-05,
"loss": 0.2703,
"step": 175500
},
{
"epoch": 3.38,
"learning_rate": 3.3104216266031796e-05,
"loss": 0.267,
"step": 176000
},
{
"epoch": 3.39,
"learning_rate": 3.3056216880423936e-05,
"loss": 0.2674,
"step": 176500
},
{
"epoch": 3.4,
"learning_rate": 3.300821749481607e-05,
"loss": 0.2679,
"step": 177000
},
{
"epoch": 3.41,
"learning_rate": 3.29602181092082e-05,
"loss": 0.2679,
"step": 177500
},
{
"epoch": 3.42,
"learning_rate": 3.291221872360034e-05,
"loss": 0.2641,
"step": 178000
},
{
"epoch": 3.43,
"learning_rate": 3.286421933799247e-05,
"loss": 0.2666,
"step": 178500
},
{
"epoch": 3.44,
"learning_rate": 3.281621995238461e-05,
"loss": 0.2682,
"step": 179000
},
{
"epoch": 3.45,
"learning_rate": 3.2768220566776746e-05,
"loss": 0.2672,
"step": 179500
},
{
"epoch": 3.46,
"learning_rate": 3.272022118116888e-05,
"loss": 0.2659,
"step": 180000
},
{
"epoch": 3.47,
"learning_rate": 3.267222179556102e-05,
"loss": 0.2648,
"step": 180500
},
{
"epoch": 3.48,
"learning_rate": 3.262422240995315e-05,
"loss": 0.2683,
"step": 181000
},
{
"epoch": 3.48,
"learning_rate": 3.257622302434529e-05,
"loss": 0.2669,
"step": 181500
},
{
"epoch": 3.49,
"learning_rate": 3.252822363873743e-05,
"loss": 0.2667,
"step": 182000
},
{
"epoch": 3.5,
"learning_rate": 3.248022425312956e-05,
"loss": 0.2676,
"step": 182500
},
{
"epoch": 3.51,
"learning_rate": 3.24322248675217e-05,
"loss": 0.2657,
"step": 183000
},
{
"epoch": 3.52,
"learning_rate": 3.2384225481913835e-05,
"loss": 0.2663,
"step": 183500
},
{
"epoch": 3.53,
"learning_rate": 3.2336226096305974e-05,
"loss": 0.2664,
"step": 184000
},
{
"epoch": 3.54,
"learning_rate": 3.228822671069811e-05,
"loss": 0.2642,
"step": 184500
},
{
"epoch": 3.55,
"learning_rate": 3.224022732509024e-05,
"loss": 0.2674,
"step": 185000
},
{
"epoch": 3.56,
"learning_rate": 3.219222793948238e-05,
"loss": 0.2662,
"step": 185500
},
{
"epoch": 3.57,
"learning_rate": 3.214422855387451e-05,
"loss": 0.2657,
"step": 186000
},
{
"epoch": 3.58,
"learning_rate": 3.209622916826665e-05,
"loss": 0.2633,
"step": 186500
},
{
"epoch": 3.59,
"learning_rate": 3.2048229782658784e-05,
"loss": 0.2661,
"step": 187000
},
{
"epoch": 3.6,
"learning_rate": 3.200023039705092e-05,
"loss": 0.2672,
"step": 187500
},
{
"epoch": 3.61,
"learning_rate": 3.1952231011443056e-05,
"loss": 0.2661,
"step": 188000
},
{
"epoch": 3.62,
"learning_rate": 3.190423162583519e-05,
"loss": 0.2672,
"step": 188500
},
{
"epoch": 3.63,
"learning_rate": 3.185623224022732e-05,
"loss": 0.263,
"step": 189000
},
{
"epoch": 3.64,
"learning_rate": 3.180823285461946e-05,
"loss": 0.267,
"step": 189500
},
{
"epoch": 3.65,
"learning_rate": 3.1760233469011594e-05,
"loss": 0.2624,
"step": 190000
}
],
"max_steps": 520840,
"num_train_epochs": 10,
"total_flos": 1.929376694335058e+18,
"trial_name": null,
"trial_params": null
}