{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999549121276575, "global_step": 204000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2e-05, "loss": 4.2154, "step": 500 }, { "epoch": 0.01, "learning_rate": 1.9950871056174036e-05, "loss": 3.5237, "step": 1000 }, { "epoch": 0.01, "learning_rate": 1.990174211234807e-05, "loss": 3.3401, "step": 1500 }, { "epoch": 0.02, "learning_rate": 1.9852613168522107e-05, "loss": 3.2179, "step": 2000 }, { "epoch": 0.02, "learning_rate": 1.980348422469614e-05, "loss": 3.1655, "step": 2500 }, { "epoch": 0.03, "learning_rate": 1.9754355280870174e-05, "loss": 3.1335, "step": 3000 }, { "epoch": 0.03, "learning_rate": 1.9705226337044208e-05, "loss": 3.1039, "step": 3500 }, { "epoch": 0.04, "learning_rate": 1.9656097393218242e-05, "loss": 3.0406, "step": 4000 }, { "epoch": 0.04, "learning_rate": 1.9606968449392276e-05, "loss": 3.0212, "step": 4500 }, { "epoch": 0.05, "learning_rate": 1.9557839505566313e-05, "loss": 3.0054, "step": 5000 }, { "epoch": 0.05, "learning_rate": 1.9508710561740344e-05, "loss": 2.9778, "step": 5500 }, { "epoch": 0.06, "learning_rate": 1.945958161791438e-05, "loss": 2.9428, "step": 6000 }, { "epoch": 0.06, "learning_rate": 1.9410452674088415e-05, "loss": 2.9546, "step": 6500 }, { "epoch": 0.07, "learning_rate": 1.936132373026245e-05, "loss": 2.9442, "step": 7000 }, { "epoch": 0.07, "learning_rate": 1.9312194786436483e-05, "loss": 2.9142, "step": 7500 }, { "epoch": 0.08, "learning_rate": 1.9263065842610517e-05, "loss": 2.8991, "step": 8000 }, { "epoch": 0.08, "learning_rate": 1.921393689878455e-05, "loss": 2.8399, "step": 8500 }, { "epoch": 0.09, "learning_rate": 1.9164807954958588e-05, "loss": 2.8758, "step": 9000 }, { "epoch": 0.09, "learning_rate": 1.9115679011132618e-05, "loss": 2.8876, "step": 9500 }, { "epoch": 0.1, "learning_rate": 1.9066550067306656e-05, "loss": 2.8404, "step": 10000 }, { "epoch": 0.1, "learning_rate": 1.901742112348069e-05, "loss": 2.8612, "step": 10500 }, { "epoch": 0.11, "learning_rate": 1.8968292179654723e-05, "loss": 2.8226, "step": 11000 }, { "epoch": 0.11, "learning_rate": 1.8919163235828757e-05, "loss": 2.8144, "step": 11500 }, { "epoch": 0.12, "learning_rate": 1.887003429200279e-05, "loss": 2.8132, "step": 12000 }, { "epoch": 0.12, "learning_rate": 1.8820905348176825e-05, "loss": 2.8182, "step": 12500 }, { "epoch": 0.13, "learning_rate": 1.8771776404350862e-05, "loss": 2.7883, "step": 13000 }, { "epoch": 0.13, "learning_rate": 1.8722647460524893e-05, "loss": 2.7961, "step": 13500 }, { "epoch": 0.14, "learning_rate": 1.867351851669893e-05, "loss": 2.7605, "step": 14000 }, { "epoch": 0.14, "learning_rate": 1.8624389572872964e-05, "loss": 2.7576, "step": 14500 }, { "epoch": 0.15, "learning_rate": 1.8575260629046998e-05, "loss": 2.7763, "step": 15000 }, { "epoch": 0.15, "learning_rate": 1.852613168522103e-05, "loss": 2.7519, "step": 15500 }, { "epoch": 0.16, "learning_rate": 1.8477002741395065e-05, "loss": 2.7704, "step": 16000 }, { "epoch": 0.16, "learning_rate": 1.8427873797569103e-05, "loss": 2.6919, "step": 16500 }, { "epoch": 0.17, "learning_rate": 1.8378744853743137e-05, "loss": 2.7287, "step": 17000 }, { "epoch": 0.17, "learning_rate": 1.832961590991717e-05, "loss": 2.7121, "step": 17500 }, { "epoch": 0.18, "learning_rate": 1.8280486966091204e-05, "loss": 2.7137, "step": 18000 }, { "epoch": 0.18, "learning_rate": 1.8231358022265238e-05, "loss": 2.7361, "step": 18500 }, { "epoch": 0.19, "learning_rate": 1.8182229078439272e-05, "loss": 2.7176, "step": 19000 }, { "epoch": 0.19, "learning_rate": 1.813310013461331e-05, "loss": 2.7055, "step": 19500 }, { "epoch": 0.2, "learning_rate": 1.8083971190787343e-05, "loss": 2.7325, "step": 20000 }, { "epoch": 0.2, "learning_rate": 1.8034842246961377e-05, "loss": 2.7259, "step": 20500 }, { "epoch": 0.21, "learning_rate": 1.798571330313541e-05, "loss": 2.6834, "step": 21000 }, { "epoch": 0.21, "learning_rate": 1.7936584359309445e-05, "loss": 2.6681, "step": 21500 }, { "epoch": 0.22, "learning_rate": 1.788745541548348e-05, "loss": 2.6709, "step": 22000 }, { "epoch": 0.22, "learning_rate": 1.7838326471657516e-05, "loss": 2.673, "step": 22500 }, { "epoch": 0.23, "learning_rate": 1.7789197527831547e-05, "loss": 2.6801, "step": 23000 }, { "epoch": 0.23, "learning_rate": 1.7740068584005584e-05, "loss": 2.6635, "step": 23500 }, { "epoch": 0.24, "learning_rate": 1.7690939640179618e-05, "loss": 2.6699, "step": 24000 }, { "epoch": 0.24, "learning_rate": 1.764181069635365e-05, "loss": 2.6431, "step": 24500 }, { "epoch": 0.25, "learning_rate": 1.7592681752527685e-05, "loss": 2.6895, "step": 25000 }, { "epoch": 0.25, "learning_rate": 1.754355280870172e-05, "loss": 2.6452, "step": 25500 }, { "epoch": 0.25, "learning_rate": 1.7494423864875753e-05, "loss": 2.6619, "step": 26000 }, { "epoch": 0.26, "learning_rate": 1.744529492104979e-05, "loss": 2.6232, "step": 26500 }, { "epoch": 0.26, "learning_rate": 1.739616597722382e-05, "loss": 2.663, "step": 27000 }, { "epoch": 0.27, "learning_rate": 1.7347037033397858e-05, "loss": 2.6151, "step": 27500 }, { "epoch": 0.27, "learning_rate": 1.7297908089571892e-05, "loss": 2.6166, "step": 28000 }, { "epoch": 0.28, "learning_rate": 1.7248779145745926e-05, "loss": 2.6394, "step": 28500 }, { "epoch": 0.28, "learning_rate": 1.719965020191996e-05, "loss": 2.6242, "step": 29000 }, { "epoch": 0.29, "learning_rate": 1.7150521258093994e-05, "loss": 2.6205, "step": 29500 }, { "epoch": 0.29, "learning_rate": 1.7101392314268028e-05, "loss": 2.6596, "step": 30000 }, { "epoch": 0.3, "learning_rate": 1.7052263370442065e-05, "loss": 2.6469, "step": 30500 }, { "epoch": 0.3, "learning_rate": 1.7003134426616095e-05, "loss": 2.6189, "step": 31000 }, { "epoch": 0.31, "learning_rate": 1.6954005482790133e-05, "loss": 2.6035, "step": 31500 }, { "epoch": 0.31, "learning_rate": 1.6904876538964167e-05, "loss": 2.5861, "step": 32000 }, { "epoch": 0.32, "learning_rate": 1.68557475951382e-05, "loss": 2.6504, "step": 32500 }, { "epoch": 0.32, "learning_rate": 1.6806618651312238e-05, "loss": 2.6006, "step": 33000 }, { "epoch": 0.33, "learning_rate": 1.6757489707486268e-05, "loss": 2.6398, "step": 33500 }, { "epoch": 0.33, "learning_rate": 1.6708360763660305e-05, "loss": 2.6024, "step": 34000 }, { "epoch": 0.34, "learning_rate": 1.665923181983434e-05, "loss": 2.6039, "step": 34500 }, { "epoch": 0.34, "learning_rate": 1.6610102876008373e-05, "loss": 2.6224, "step": 35000 }, { "epoch": 0.35, "learning_rate": 1.6560973932182407e-05, "loss": 2.5644, "step": 35500 }, { "epoch": 0.35, "learning_rate": 1.651184498835644e-05, "loss": 2.5826, "step": 36000 }, { "epoch": 0.36, "learning_rate": 1.6462716044530475e-05, "loss": 2.585, "step": 36500 }, { "epoch": 0.36, "learning_rate": 1.6413587100704512e-05, "loss": 2.5589, "step": 37000 }, { "epoch": 0.37, "learning_rate": 1.6364458156878546e-05, "loss": 2.6029, "step": 37500 }, { "epoch": 0.37, "learning_rate": 1.631532921305258e-05, "loss": 2.554, "step": 38000 }, { "epoch": 0.38, "learning_rate": 1.6266200269226614e-05, "loss": 2.5599, "step": 38500 }, { "epoch": 0.38, "learning_rate": 1.6217071325400648e-05, "loss": 2.5285, "step": 39000 }, { "epoch": 0.39, "learning_rate": 1.616794238157468e-05, "loss": 2.5684, "step": 39500 }, { "epoch": 0.39, "learning_rate": 1.611881343774872e-05, "loss": 2.5816, "step": 40000 }, { "epoch": 0.4, "learning_rate": 1.606968449392275e-05, "loss": 2.566, "step": 40500 }, { "epoch": 0.4, "learning_rate": 1.6020555550096787e-05, "loss": 2.5244, "step": 41000 }, { "epoch": 0.41, "learning_rate": 1.597142660627082e-05, "loss": 2.5571, "step": 41500 }, { "epoch": 0.41, "learning_rate": 1.5922297662444854e-05, "loss": 2.5071, "step": 42000 }, { "epoch": 0.42, "learning_rate": 1.5873168718618888e-05, "loss": 2.5497, "step": 42500 }, { "epoch": 0.42, "learning_rate": 1.5824039774792922e-05, "loss": 2.5566, "step": 43000 }, { "epoch": 0.43, "learning_rate": 1.5774910830966956e-05, "loss": 2.5448, "step": 43500 }, { "epoch": 0.43, "learning_rate": 1.5725781887140993e-05, "loss": 2.5649, "step": 44000 }, { "epoch": 0.44, "learning_rate": 1.5676652943315024e-05, "loss": 2.5562, "step": 44500 }, { "epoch": 0.44, "learning_rate": 1.562752399948906e-05, "loss": 2.5756, "step": 45000 }, { "epoch": 0.45, "learning_rate": 1.5578395055663095e-05, "loss": 2.5357, "step": 45500 }, { "epoch": 0.45, "learning_rate": 1.552926611183713e-05, "loss": 2.5394, "step": 46000 }, { "epoch": 0.46, "learning_rate": 1.5480137168011163e-05, "loss": 2.5416, "step": 46500 }, { "epoch": 0.46, "learning_rate": 1.5431008224185196e-05, "loss": 2.523, "step": 47000 }, { "epoch": 0.47, "learning_rate": 1.5381879280359234e-05, "loss": 2.4957, "step": 47500 }, { "epoch": 0.47, "learning_rate": 1.5332750336533268e-05, "loss": 2.5392, "step": 48000 }, { "epoch": 0.48, "learning_rate": 1.52836213927073e-05, "loss": 2.5505, "step": 48500 }, { "epoch": 0.48, "learning_rate": 1.5234492448881335e-05, "loss": 2.5251, "step": 49000 }, { "epoch": 0.49, "learning_rate": 1.5185363505055371e-05, "loss": 2.4836, "step": 49500 }, { "epoch": 0.49, "learning_rate": 1.5136234561229403e-05, "loss": 2.5077, "step": 50000 }, { "epoch": 0.49, "learning_rate": 1.5087105617403439e-05, "loss": 2.5103, "step": 50500 }, { "epoch": 0.5, "learning_rate": 1.5037976673577473e-05, "loss": 2.5157, "step": 51000 }, { "epoch": 0.5, "learning_rate": 1.4988847729751508e-05, "loss": 2.509, "step": 51500 }, { "epoch": 0.51, "learning_rate": 1.493971878592554e-05, "loss": 2.4982, "step": 52000 }, { "epoch": 0.51, "learning_rate": 1.4890589842099576e-05, "loss": 2.4741, "step": 52500 }, { "epoch": 0.52, "learning_rate": 1.484146089827361e-05, "loss": 2.5438, "step": 53000 }, { "epoch": 0.52, "learning_rate": 1.4792331954447645e-05, "loss": 2.4698, "step": 53500 }, { "epoch": 0.53, "learning_rate": 1.4743203010621678e-05, "loss": 2.5175, "step": 54000 }, { "epoch": 0.53, "learning_rate": 1.4694074066795713e-05, "loss": 2.4857, "step": 54500 }, { "epoch": 0.54, "learning_rate": 1.4644945122969747e-05, "loss": 2.5006, "step": 55000 }, { "epoch": 0.54, "learning_rate": 1.4595816179143783e-05, "loss": 2.4875, "step": 55500 }, { "epoch": 0.55, "learning_rate": 1.4546687235317816e-05, "loss": 2.4964, "step": 56000 }, { "epoch": 0.55, "learning_rate": 1.449755829149185e-05, "loss": 2.5175, "step": 56500 }, { "epoch": 0.56, "learning_rate": 1.4448429347665884e-05, "loss": 2.4912, "step": 57000 }, { "epoch": 0.56, "learning_rate": 1.439930040383992e-05, "loss": 2.5074, "step": 57500 }, { "epoch": 0.57, "learning_rate": 1.4350171460013954e-05, "loss": 2.4655, "step": 58000 }, { "epoch": 0.57, "learning_rate": 1.430104251618799e-05, "loss": 2.4985, "step": 58500 }, { "epoch": 0.58, "learning_rate": 1.4251913572362021e-05, "loss": 2.4791, "step": 59000 }, { "epoch": 0.58, "learning_rate": 1.4202784628536057e-05, "loss": 2.4881, "step": 59500 }, { "epoch": 0.59, "learning_rate": 1.4153655684710091e-05, "loss": 2.4805, "step": 60000 }, { "epoch": 0.59, "learning_rate": 1.4104526740884126e-05, "loss": 2.4591, "step": 60500 }, { "epoch": 0.6, "learning_rate": 1.4055397797058159e-05, "loss": 2.4958, "step": 61000 }, { "epoch": 0.6, "learning_rate": 1.4006268853232194e-05, "loss": 2.4691, "step": 61500 }, { "epoch": 0.61, "learning_rate": 1.3957139909406228e-05, "loss": 2.5049, "step": 62000 }, { "epoch": 0.61, "learning_rate": 1.3908010965580264e-05, "loss": 2.4584, "step": 62500 }, { "epoch": 0.62, "learning_rate": 1.38588820217543e-05, "loss": 2.4761, "step": 63000 }, { "epoch": 0.62, "learning_rate": 1.3809753077928331e-05, "loss": 2.4677, "step": 63500 }, { "epoch": 0.63, "learning_rate": 1.3760624134102367e-05, "loss": 2.4868, "step": 64000 }, { "epoch": 0.63, "learning_rate": 1.3711495190276401e-05, "loss": 2.4739, "step": 64500 }, { "epoch": 0.64, "learning_rate": 1.3662366246450436e-05, "loss": 2.4558, "step": 65000 }, { "epoch": 0.64, "learning_rate": 1.3613237302624469e-05, "loss": 2.4528, "step": 65500 }, { "epoch": 0.65, "learning_rate": 1.3564108358798504e-05, "loss": 2.4608, "step": 66000 }, { "epoch": 0.65, "learning_rate": 1.3514979414972538e-05, "loss": 2.4461, "step": 66500 }, { "epoch": 0.66, "learning_rate": 1.3465850471146574e-05, "loss": 2.4382, "step": 67000 }, { "epoch": 0.66, "learning_rate": 1.3416721527320606e-05, "loss": 2.4554, "step": 67500 }, { "epoch": 0.67, "learning_rate": 1.3367592583494641e-05, "loss": 2.4488, "step": 68000 }, { "epoch": 0.67, "learning_rate": 1.3318463639668675e-05, "loss": 2.4594, "step": 68500 }, { "epoch": 0.68, "learning_rate": 1.3269334695842711e-05, "loss": 2.4583, "step": 69000 }, { "epoch": 0.68, "learning_rate": 1.3220205752016743e-05, "loss": 2.4392, "step": 69500 }, { "epoch": 0.69, "learning_rate": 1.3171076808190779e-05, "loss": 2.4366, "step": 70000 }, { "epoch": 0.69, "learning_rate": 1.3121947864364813e-05, "loss": 2.4437, "step": 70500 }, { "epoch": 0.7, "learning_rate": 1.3072818920538848e-05, "loss": 2.4635, "step": 71000 }, { "epoch": 0.7, "learning_rate": 1.302368997671288e-05, "loss": 2.435, "step": 71500 }, { "epoch": 0.71, "learning_rate": 1.2974561032886916e-05, "loss": 2.4333, "step": 72000 }, { "epoch": 0.71, "learning_rate": 1.292543208906095e-05, "loss": 2.4678, "step": 72500 }, { "epoch": 0.72, "learning_rate": 1.2876303145234985e-05, "loss": 2.4538, "step": 73000 }, { "epoch": 0.72, "learning_rate": 1.282717420140902e-05, "loss": 2.4561, "step": 73500 }, { "epoch": 0.73, "learning_rate": 1.2778045257583053e-05, "loss": 2.5159, "step": 74000 }, { "epoch": 0.73, "learning_rate": 1.2728916313757087e-05, "loss": 2.4392, "step": 74500 }, { "epoch": 0.74, "learning_rate": 1.2679787369931123e-05, "loss": 2.4503, "step": 75000 }, { "epoch": 0.74, "learning_rate": 1.2630658426105156e-05, "loss": 2.4428, "step": 75500 }, { "epoch": 0.74, "learning_rate": 1.2581529482279192e-05, "loss": 2.426, "step": 76000 }, { "epoch": 0.75, "learning_rate": 1.2532400538453224e-05, "loss": 2.4809, "step": 76500 }, { "epoch": 0.75, "learning_rate": 1.248327159462726e-05, "loss": 2.4687, "step": 77000 }, { "epoch": 0.76, "learning_rate": 1.2434142650801294e-05, "loss": 2.4306, "step": 77500 }, { "epoch": 0.76, "learning_rate": 1.238501370697533e-05, "loss": 2.4455, "step": 78000 }, { "epoch": 0.77, "learning_rate": 1.2335884763149363e-05, "loss": 2.3706, "step": 78500 }, { "epoch": 0.77, "learning_rate": 1.2286755819323397e-05, "loss": 2.4224, "step": 79000 }, { "epoch": 0.78, "learning_rate": 1.2237626875497433e-05, "loss": 2.4242, "step": 79500 }, { "epoch": 0.78, "learning_rate": 1.2188497931671466e-05, "loss": 2.429, "step": 80000 }, { "epoch": 0.79, "learning_rate": 1.2139368987845502e-05, "loss": 2.4247, "step": 80500 }, { "epoch": 0.79, "learning_rate": 1.2090240044019534e-05, "loss": 2.4195, "step": 81000 }, { "epoch": 0.8, "learning_rate": 1.204111110019357e-05, "loss": 2.4293, "step": 81500 }, { "epoch": 0.8, "learning_rate": 1.1991982156367604e-05, "loss": 2.4245, "step": 82000 }, { "epoch": 0.81, "learning_rate": 1.194285321254164e-05, "loss": 2.4415, "step": 82500 }, { "epoch": 0.81, "learning_rate": 1.1893724268715671e-05, "loss": 2.4199, "step": 83000 }, { "epoch": 0.82, "learning_rate": 1.1844595324889707e-05, "loss": 2.4386, "step": 83500 }, { "epoch": 0.82, "learning_rate": 1.179546638106374e-05, "loss": 2.4063, "step": 84000 }, { "epoch": 0.83, "learning_rate": 1.1746337437237776e-05, "loss": 2.4376, "step": 84500 }, { "epoch": 0.83, "learning_rate": 1.1697208493411809e-05, "loss": 2.4177, "step": 85000 }, { "epoch": 0.84, "learning_rate": 1.1648079549585844e-05, "loss": 2.3778, "step": 85500 }, { "epoch": 0.84, "learning_rate": 1.1598950605759878e-05, "loss": 2.4355, "step": 86000 }, { "epoch": 0.85, "learning_rate": 1.1549821661933914e-05, "loss": 2.4214, "step": 86500 }, { "epoch": 0.85, "learning_rate": 1.1500692718107946e-05, "loss": 2.3941, "step": 87000 }, { "epoch": 0.86, "learning_rate": 1.1451563774281981e-05, "loss": 2.4122, "step": 87500 }, { "epoch": 0.86, "learning_rate": 1.1402434830456015e-05, "loss": 2.4021, "step": 88000 }, { "epoch": 0.87, "learning_rate": 1.135330588663005e-05, "loss": 2.4154, "step": 88500 }, { "epoch": 0.87, "learning_rate": 1.1304176942804083e-05, "loss": 2.4004, "step": 89000 }, { "epoch": 0.88, "learning_rate": 1.1255047998978119e-05, "loss": 2.4091, "step": 89500 }, { "epoch": 0.88, "learning_rate": 1.1205919055152152e-05, "loss": 2.4348, "step": 90000 }, { "epoch": 0.89, "learning_rate": 1.1156790111326188e-05, "loss": 2.3965, "step": 90500 }, { "epoch": 0.89, "learning_rate": 1.1107661167500222e-05, "loss": 2.3904, "step": 91000 }, { "epoch": 0.9, "learning_rate": 1.1058532223674256e-05, "loss": 2.3947, "step": 91500 }, { "epoch": 0.9, "learning_rate": 1.100940327984829e-05, "loss": 2.4075, "step": 92000 }, { "epoch": 0.91, "learning_rate": 1.0960274336022325e-05, "loss": 2.3987, "step": 92500 }, { "epoch": 0.91, "learning_rate": 1.0911145392196359e-05, "loss": 2.4116, "step": 93000 }, { "epoch": 0.92, "learning_rate": 1.0862016448370393e-05, "loss": 2.4416, "step": 93500 }, { "epoch": 0.92, "learning_rate": 1.0812887504544429e-05, "loss": 2.3899, "step": 94000 }, { "epoch": 0.93, "learning_rate": 1.0763758560718462e-05, "loss": 2.4015, "step": 94500 }, { "epoch": 0.93, "learning_rate": 1.0714629616892498e-05, "loss": 2.3741, "step": 95000 }, { "epoch": 0.94, "learning_rate": 1.0665500673066532e-05, "loss": 2.3951, "step": 95500 }, { "epoch": 0.94, "learning_rate": 1.0616371729240566e-05, "loss": 2.406, "step": 96000 }, { "epoch": 0.95, "learning_rate": 1.05672427854146e-05, "loss": 2.4102, "step": 96500 }, { "epoch": 0.95, "learning_rate": 1.0518113841588635e-05, "loss": 2.4031, "step": 97000 }, { "epoch": 0.96, "learning_rate": 1.0468984897762669e-05, "loss": 2.417, "step": 97500 }, { "epoch": 0.96, "learning_rate": 1.0419855953936705e-05, "loss": 2.3978, "step": 98000 }, { "epoch": 0.97, "learning_rate": 1.0370727010110737e-05, "loss": 2.4009, "step": 98500 }, { "epoch": 0.97, "learning_rate": 1.0321598066284772e-05, "loss": 2.3966, "step": 99000 }, { "epoch": 0.98, "learning_rate": 1.0272469122458806e-05, "loss": 2.3918, "step": 99500 }, { "epoch": 0.98, "learning_rate": 1.0223340178632842e-05, "loss": 2.3853, "step": 100000 }, { "epoch": 0.99, "learning_rate": 1.0174211234806874e-05, "loss": 2.3996, "step": 100500 }, { "epoch": 0.99, "learning_rate": 1.012508229098091e-05, "loss": 2.388, "step": 101000 }, { "epoch": 0.99, "learning_rate": 1.0075953347154944e-05, "loss": 2.3845, "step": 101500 }, { "epoch": 1.0, "learning_rate": 1.0026824403328979e-05, "loss": 2.385, "step": 102000 }, { "epoch": 1.0, "learning_rate": 9.977695459503013e-06, "loss": 2.3387, "step": 102500 }, { "epoch": 1.01, "learning_rate": 9.928566515677047e-06, "loss": 2.3401, "step": 103000 }, { "epoch": 1.01, "learning_rate": 9.87943757185108e-06, "loss": 2.3291, "step": 103500 }, { "epoch": 1.02, "learning_rate": 9.830308628025116e-06, "loss": 2.3783, "step": 104000 }, { "epoch": 1.02, "learning_rate": 9.78117968419915e-06, "loss": 2.3154, "step": 104500 }, { "epoch": 1.03, "learning_rate": 9.732050740373184e-06, "loss": 2.3229, "step": 105000 }, { "epoch": 1.03, "learning_rate": 9.68292179654722e-06, "loss": 2.3844, "step": 105500 }, { "epoch": 1.04, "learning_rate": 9.633792852721254e-06, "loss": 2.3945, "step": 106000 }, { "epoch": 1.04, "learning_rate": 9.584663908895287e-06, "loss": 2.3354, "step": 106500 }, { "epoch": 1.05, "learning_rate": 9.535534965069321e-06, "loss": 2.325, "step": 107000 }, { "epoch": 1.05, "learning_rate": 9.486406021243357e-06, "loss": 2.3722, "step": 107500 }, { "epoch": 1.06, "learning_rate": 9.43727707741739e-06, "loss": 2.37, "step": 108000 }, { "epoch": 1.06, "learning_rate": 9.388148133591425e-06, "loss": 2.3454, "step": 108500 }, { "epoch": 1.07, "learning_rate": 9.339019189765458e-06, "loss": 2.3118, "step": 109000 }, { "epoch": 1.07, "learning_rate": 9.289890245939494e-06, "loss": 2.34, "step": 109500 }, { "epoch": 1.08, "learning_rate": 9.240761302113528e-06, "loss": 2.3156, "step": 110000 }, { "epoch": 1.08, "learning_rate": 9.191632358287562e-06, "loss": 2.3686, "step": 110500 }, { "epoch": 1.09, "learning_rate": 9.142503414461596e-06, "loss": 2.3641, "step": 111000 }, { "epoch": 1.09, "learning_rate": 9.093374470635631e-06, "loss": 2.3399, "step": 111500 }, { "epoch": 1.1, "learning_rate": 9.044245526809665e-06, "loss": 2.3038, "step": 112000 }, { "epoch": 1.1, "learning_rate": 8.995116582983699e-06, "loss": 2.3394, "step": 112500 }, { "epoch": 1.11, "learning_rate": 8.945987639157735e-06, "loss": 2.3144, "step": 113000 }, { "epoch": 1.11, "learning_rate": 8.896858695331768e-06, "loss": 2.3542, "step": 113500 }, { "epoch": 1.12, "learning_rate": 8.847729751505802e-06, "loss": 2.3378, "step": 114000 }, { "epoch": 1.12, "learning_rate": 8.798600807679836e-06, "loss": 2.3562, "step": 114500 }, { "epoch": 1.13, "learning_rate": 8.749471863853872e-06, "loss": 2.3759, "step": 115000 }, { "epoch": 1.13, "learning_rate": 8.700342920027906e-06, "loss": 2.3459, "step": 115500 }, { "epoch": 1.14, "learning_rate": 8.65121397620194e-06, "loss": 2.3041, "step": 116000 }, { "epoch": 1.14, "learning_rate": 8.602085032375973e-06, "loss": 2.3076, "step": 116500 }, { "epoch": 1.15, "learning_rate": 8.552956088550009e-06, "loss": 2.3598, "step": 117000 }, { "epoch": 1.15, "learning_rate": 8.503827144724045e-06, "loss": 2.3463, "step": 117500 }, { "epoch": 1.16, "learning_rate": 8.454698200898078e-06, "loss": 2.3505, "step": 118000 }, { "epoch": 1.16, "learning_rate": 8.405569257072112e-06, "loss": 2.3106, "step": 118500 }, { "epoch": 1.17, "learning_rate": 8.356440313246146e-06, "loss": 2.3391, "step": 119000 }, { "epoch": 1.17, "learning_rate": 8.307311369420182e-06, "loss": 2.3316, "step": 119500 }, { "epoch": 1.18, "learning_rate": 8.258182425594216e-06, "loss": 2.3122, "step": 120000 }, { "epoch": 1.18, "learning_rate": 8.20905348176825e-06, "loss": 2.3244, "step": 120500 }, { "epoch": 1.19, "learning_rate": 8.159924537942283e-06, "loss": 2.3457, "step": 121000 }, { "epoch": 1.19, "learning_rate": 8.110795594116319e-06, "loss": 2.328, "step": 121500 }, { "epoch": 1.2, "learning_rate": 8.061666650290353e-06, "loss": 2.319, "step": 122000 }, { "epoch": 1.2, "learning_rate": 8.012537706464387e-06, "loss": 2.3168, "step": 122500 }, { "epoch": 1.21, "learning_rate": 7.963408762638422e-06, "loss": 2.3288, "step": 123000 }, { "epoch": 1.21, "learning_rate": 7.914279818812456e-06, "loss": 2.3169, "step": 123500 }, { "epoch": 1.22, "learning_rate": 7.86515087498649e-06, "loss": 2.3398, "step": 124000 }, { "epoch": 1.22, "learning_rate": 7.816021931160524e-06, "loss": 2.3182, "step": 124500 }, { "epoch": 1.23, "learning_rate": 7.76689298733456e-06, "loss": 2.3219, "step": 125000 }, { "epoch": 1.23, "learning_rate": 7.717764043508593e-06, "loss": 2.3407, "step": 125500 }, { "epoch": 1.24, "learning_rate": 7.668635099682627e-06, "loss": 2.2986, "step": 126000 }, { "epoch": 1.24, "learning_rate": 7.619506155856662e-06, "loss": 2.3192, "step": 126500 }, { "epoch": 1.24, "learning_rate": 7.570377212030696e-06, "loss": 2.3229, "step": 127000 }, { "epoch": 1.25, "learning_rate": 7.521248268204731e-06, "loss": 2.3073, "step": 127500 }, { "epoch": 1.25, "learning_rate": 7.4721193243787646e-06, "loss": 2.2936, "step": 128000 }, { "epoch": 1.26, "learning_rate": 7.422990380552799e-06, "loss": 2.3078, "step": 128500 }, { "epoch": 1.26, "learning_rate": 7.373861436726833e-06, "loss": 2.3507, "step": 129000 }, { "epoch": 1.27, "learning_rate": 7.324732492900868e-06, "loss": 2.3071, "step": 129500 }, { "epoch": 1.27, "learning_rate": 7.275603549074902e-06, "loss": 2.3076, "step": 130000 }, { "epoch": 1.28, "learning_rate": 7.2264746052489365e-06, "loss": 2.3321, "step": 130500 }, { "epoch": 1.28, "learning_rate": 7.177345661422971e-06, "loss": 2.3224, "step": 131000 }, { "epoch": 1.29, "learning_rate": 7.128216717597005e-06, "loss": 2.3083, "step": 131500 }, { "epoch": 1.29, "learning_rate": 7.07908777377104e-06, "loss": 2.3196, "step": 132000 }, { "epoch": 1.3, "learning_rate": 7.0299588299450745e-06, "loss": 2.3311, "step": 132500 }, { "epoch": 1.3, "learning_rate": 6.980829886119109e-06, "loss": 2.3505, "step": 133000 }, { "epoch": 1.31, "learning_rate": 6.931700942293144e-06, "loss": 2.3014, "step": 133500 }, { "epoch": 1.31, "learning_rate": 6.882571998467178e-06, "loss": 2.3001, "step": 134000 }, { "epoch": 1.32, "learning_rate": 6.833443054641213e-06, "loss": 2.3122, "step": 134500 }, { "epoch": 1.32, "learning_rate": 6.7843141108152465e-06, "loss": 2.3031, "step": 135000 }, { "epoch": 1.33, "learning_rate": 6.735185166989281e-06, "loss": 2.3426, "step": 135500 }, { "epoch": 1.33, "learning_rate": 6.686056223163315e-06, "loss": 2.3242, "step": 136000 }, { "epoch": 1.34, "learning_rate": 6.63692727933735e-06, "loss": 2.3312, "step": 136500 }, { "epoch": 1.34, "learning_rate": 6.587798335511384e-06, "loss": 2.3265, "step": 137000 }, { "epoch": 1.35, "learning_rate": 6.5386693916854184e-06, "loss": 2.3226, "step": 137500 }, { "epoch": 1.35, "learning_rate": 6.489540447859452e-06, "loss": 2.2682, "step": 138000 }, { "epoch": 1.36, "learning_rate": 6.440411504033487e-06, "loss": 2.2961, "step": 138500 }, { "epoch": 1.36, "learning_rate": 6.391282560207521e-06, "loss": 2.2935, "step": 139000 }, { "epoch": 1.37, "learning_rate": 6.342153616381556e-06, "loss": 2.324, "step": 139500 }, { "epoch": 1.37, "learning_rate": 6.2930246725555895e-06, "loss": 2.2955, "step": 140000 }, { "epoch": 1.38, "learning_rate": 6.243895728729624e-06, "loss": 2.292, "step": 140500 }, { "epoch": 1.38, "learning_rate": 6.194766784903659e-06, "loss": 2.3076, "step": 141000 }, { "epoch": 1.39, "learning_rate": 6.145637841077693e-06, "loss": 2.3151, "step": 141500 }, { "epoch": 1.39, "learning_rate": 6.0965088972517276e-06, "loss": 2.3051, "step": 142000 }, { "epoch": 1.4, "learning_rate": 6.0473799534257615e-06, "loss": 2.3247, "step": 142500 }, { "epoch": 1.4, "learning_rate": 5.998251009599796e-06, "loss": 2.2741, "step": 143000 }, { "epoch": 1.41, "learning_rate": 5.94912206577383e-06, "loss": 2.2937, "step": 143500 }, { "epoch": 1.41, "learning_rate": 5.899993121947865e-06, "loss": 2.2883, "step": 144000 }, { "epoch": 1.42, "learning_rate": 5.850864178121899e-06, "loss": 2.2985, "step": 144500 }, { "epoch": 1.42, "learning_rate": 5.801735234295933e-06, "loss": 2.3006, "step": 145000 }, { "epoch": 1.43, "learning_rate": 5.752606290469967e-06, "loss": 2.2663, "step": 145500 }, { "epoch": 1.43, "learning_rate": 5.703477346644002e-06, "loss": 2.3, "step": 146000 }, { "epoch": 1.44, "learning_rate": 5.654348402818036e-06, "loss": 2.2688, "step": 146500 }, { "epoch": 1.44, "learning_rate": 5.605219458992071e-06, "loss": 2.2915, "step": 147000 }, { "epoch": 1.45, "learning_rate": 5.5560905151661045e-06, "loss": 2.2897, "step": 147500 }, { "epoch": 1.45, "learning_rate": 5.50696157134014e-06, "loss": 2.3316, "step": 148000 }, { "epoch": 1.46, "learning_rate": 5.457832627514175e-06, "loss": 2.3386, "step": 148500 }, { "epoch": 1.46, "learning_rate": 5.408703683688209e-06, "loss": 2.3215, "step": 149000 }, { "epoch": 1.47, "learning_rate": 5.359574739862243e-06, "loss": 2.3012, "step": 149500 }, { "epoch": 1.47, "learning_rate": 5.310445796036277e-06, "loss": 2.2939, "step": 150000 }, { "epoch": 1.48, "learning_rate": 5.261316852210312e-06, "loss": 2.2933, "step": 150500 }, { "epoch": 1.48, "learning_rate": 5.212187908384346e-06, "loss": 2.3057, "step": 151000 }, { "epoch": 1.48, "learning_rate": 5.163058964558381e-06, "loss": 2.2883, "step": 151500 }, { "epoch": 1.49, "learning_rate": 5.113930020732415e-06, "loss": 2.3136, "step": 152000 }, { "epoch": 1.49, "learning_rate": 5.064801076906449e-06, "loss": 2.2841, "step": 152500 }, { "epoch": 1.5, "learning_rate": 5.015672133080484e-06, "loss": 2.2684, "step": 153000 }, { "epoch": 1.5, "learning_rate": 4.966543189254518e-06, "loss": 2.2535, "step": 153500 }, { "epoch": 1.51, "learning_rate": 4.9174142454285525e-06, "loss": 2.2899, "step": 154000 }, { "epoch": 1.51, "learning_rate": 4.868285301602586e-06, "loss": 2.308, "step": 154500 }, { "epoch": 1.52, "learning_rate": 4.819156357776621e-06, "loss": 2.297, "step": 155000 }, { "epoch": 1.52, "learning_rate": 4.770027413950655e-06, "loss": 2.3058, "step": 155500 }, { "epoch": 1.53, "learning_rate": 4.72089847012469e-06, "loss": 2.2709, "step": 156000 }, { "epoch": 1.53, "learning_rate": 4.671769526298724e-06, "loss": 2.2958, "step": 156500 }, { "epoch": 1.54, "learning_rate": 4.622640582472758e-06, "loss": 2.3003, "step": 157000 }, { "epoch": 1.54, "learning_rate": 4.573511638646792e-06, "loss": 2.2724, "step": 157500 }, { "epoch": 1.55, "learning_rate": 4.524382694820827e-06, "loss": 2.2806, "step": 158000 }, { "epoch": 1.55, "learning_rate": 4.475253750994861e-06, "loss": 2.2622, "step": 158500 }, { "epoch": 1.56, "learning_rate": 4.4261248071688956e-06, "loss": 2.2915, "step": 159000 }, { "epoch": 1.56, "learning_rate": 4.37699586334293e-06, "loss": 2.3153, "step": 159500 }, { "epoch": 1.57, "learning_rate": 4.327866919516965e-06, "loss": 2.2784, "step": 160000 }, { "epoch": 1.57, "learning_rate": 4.278737975690999e-06, "loss": 2.2704, "step": 160500 }, { "epoch": 1.58, "learning_rate": 4.229609031865034e-06, "loss": 2.2792, "step": 161000 }, { "epoch": 1.58, "learning_rate": 4.1804800880390675e-06, "loss": 2.2642, "step": 161500 }, { "epoch": 1.59, "learning_rate": 4.131351144213102e-06, "loss": 2.2858, "step": 162000 }, { "epoch": 1.59, "learning_rate": 4.082222200387136e-06, "loss": 2.2784, "step": 162500 }, { "epoch": 1.6, "learning_rate": 4.033093256561171e-06, "loss": 2.2851, "step": 163000 }, { "epoch": 1.6, "learning_rate": 3.983964312735205e-06, "loss": 2.2762, "step": 163500 }, { "epoch": 1.61, "learning_rate": 3.9348353689092394e-06, "loss": 2.3028, "step": 164000 }, { "epoch": 1.61, "learning_rate": 3.885706425083274e-06, "loss": 2.2735, "step": 164500 }, { "epoch": 1.62, "learning_rate": 3.836577481257308e-06, "loss": 2.2511, "step": 165000 }, { "epoch": 1.62, "learning_rate": 3.7874485374313424e-06, "loss": 2.2856, "step": 165500 }, { "epoch": 1.63, "learning_rate": 3.7383195936053767e-06, "loss": 2.2979, "step": 166000 }, { "epoch": 1.63, "learning_rate": 3.689190649779411e-06, "loss": 2.2537, "step": 166500 }, { "epoch": 1.64, "learning_rate": 3.6400617059534453e-06, "loss": 2.2746, "step": 167000 }, { "epoch": 1.64, "learning_rate": 3.5909327621274804e-06, "loss": 2.3058, "step": 167500 }, { "epoch": 1.65, "learning_rate": 3.5418038183015147e-06, "loss": 2.3015, "step": 168000 }, { "epoch": 1.65, "learning_rate": 3.492674874475549e-06, "loss": 2.2863, "step": 168500 }, { "epoch": 1.66, "learning_rate": 3.4435459306495833e-06, "loss": 2.2733, "step": 169000 }, { "epoch": 1.66, "learning_rate": 3.3944169868236176e-06, "loss": 2.2806, "step": 169500 }, { "epoch": 1.67, "learning_rate": 3.345288042997652e-06, "loss": 2.259, "step": 170000 }, { "epoch": 1.67, "learning_rate": 3.2961590991716862e-06, "loss": 2.3225, "step": 170500 }, { "epoch": 1.68, "learning_rate": 3.2470301553457205e-06, "loss": 2.2887, "step": 171000 }, { "epoch": 1.68, "learning_rate": 3.197901211519755e-06, "loss": 2.2957, "step": 171500 }, { "epoch": 1.69, "learning_rate": 3.148772267693789e-06, "loss": 2.294, "step": 172000 }, { "epoch": 1.69, "learning_rate": 3.0996433238678234e-06, "loss": 2.2976, "step": 172500 }, { "epoch": 1.7, "learning_rate": 3.050514380041858e-06, "loss": 2.3028, "step": 173000 }, { "epoch": 1.7, "learning_rate": 3.0013854362158925e-06, "loss": 2.2875, "step": 173500 }, { "epoch": 1.71, "learning_rate": 2.9522564923899268e-06, "loss": 2.3208, "step": 174000 }, { "epoch": 1.71, "learning_rate": 2.903127548563961e-06, "loss": 2.3068, "step": 174500 }, { "epoch": 1.72, "learning_rate": 2.853998604737996e-06, "loss": 2.28, "step": 175000 }, { "epoch": 1.72, "learning_rate": 2.80486966091203e-06, "loss": 2.301, "step": 175500 }, { "epoch": 1.73, "learning_rate": 2.7557407170860644e-06, "loss": 2.2428, "step": 176000 }, { "epoch": 1.73, "learning_rate": 2.7066117732600987e-06, "loss": 2.2843, "step": 176500 }, { "epoch": 1.73, "learning_rate": 2.657482829434133e-06, "loss": 2.2756, "step": 177000 }, { "epoch": 1.74, "learning_rate": 2.6083538856081673e-06, "loss": 2.289, "step": 177500 }, { "epoch": 1.74, "learning_rate": 2.5592249417822016e-06, "loss": 2.2902, "step": 178000 }, { "epoch": 1.75, "learning_rate": 2.5100959979562363e-06, "loss": 2.298, "step": 178500 }, { "epoch": 1.75, "learning_rate": 2.4609670541302707e-06, "loss": 2.2764, "step": 179000 }, { "epoch": 1.76, "learning_rate": 2.411838110304305e-06, "loss": 2.2487, "step": 179500 }, { "epoch": 1.76, "learning_rate": 2.3627091664783393e-06, "loss": 2.323, "step": 180000 }, { "epoch": 1.77, "learning_rate": 2.3135802226523736e-06, "loss": 2.3174, "step": 180500 }, { "epoch": 1.77, "learning_rate": 2.2644512788264083e-06, "loss": 2.3205, "step": 181000 }, { "epoch": 1.78, "learning_rate": 2.2153223350004426e-06, "loss": 2.313, "step": 181500 }, { "epoch": 1.78, "learning_rate": 2.166193391174477e-06, "loss": 2.3015, "step": 182000 }, { "epoch": 1.79, "learning_rate": 2.117064447348511e-06, "loss": 2.2836, "step": 182500 }, { "epoch": 1.79, "learning_rate": 2.0679355035225455e-06, "loss": 2.2737, "step": 183000 }, { "epoch": 1.8, "learning_rate": 2.01880655969658e-06, "loss": 2.2824, "step": 183500 }, { "epoch": 1.8, "learning_rate": 1.969677615870614e-06, "loss": 2.2577, "step": 184000 }, { "epoch": 1.81, "learning_rate": 1.920548672044649e-06, "loss": 2.2563, "step": 184500 }, { "epoch": 1.81, "learning_rate": 1.871419728218683e-06, "loss": 2.3253, "step": 185000 }, { "epoch": 1.82, "learning_rate": 1.8222907843927174e-06, "loss": 2.2504, "step": 185500 }, { "epoch": 1.82, "learning_rate": 1.7731618405667517e-06, "loss": 2.3124, "step": 186000 }, { "epoch": 1.83, "learning_rate": 1.724032896740786e-06, "loss": 2.2784, "step": 186500 }, { "epoch": 1.83, "learning_rate": 1.6749039529148203e-06, "loss": 2.2832, "step": 187000 }, { "epoch": 1.84, "learning_rate": 1.6257750090888546e-06, "loss": 2.2924, "step": 187500 }, { "epoch": 1.84, "learning_rate": 1.576646065262889e-06, "loss": 2.2843, "step": 188000 }, { "epoch": 1.85, "learning_rate": 1.5275171214369237e-06, "loss": 2.2724, "step": 188500 }, { "epoch": 1.85, "learning_rate": 1.478388177610958e-06, "loss": 2.3194, "step": 189000 }, { "epoch": 1.86, "learning_rate": 1.4292592337849923e-06, "loss": 2.2532, "step": 189500 }, { "epoch": 1.86, "learning_rate": 1.3801302899590266e-06, "loss": 2.2773, "step": 190000 }, { "epoch": 1.87, "learning_rate": 1.3310013461330609e-06, "loss": 2.2869, "step": 190500 }, { "epoch": 1.87, "learning_rate": 1.2818724023070952e-06, "loss": 2.2751, "step": 191000 }, { "epoch": 1.88, "learning_rate": 1.2327434584811297e-06, "loss": 2.3004, "step": 191500 }, { "epoch": 1.88, "learning_rate": 1.183614514655164e-06, "loss": 2.3015, "step": 192000 }, { "epoch": 1.89, "learning_rate": 1.1344855708291983e-06, "loss": 2.2566, "step": 192500 }, { "epoch": 1.89, "learning_rate": 1.0853566270032326e-06, "loss": 2.2978, "step": 193000 }, { "epoch": 1.9, "learning_rate": 1.0362276831772671e-06, "loss": 2.2898, "step": 193500 }, { "epoch": 1.9, "learning_rate": 9.870987393513014e-07, "loss": 2.2806, "step": 194000 }, { "epoch": 1.91, "learning_rate": 9.379697955253358e-07, "loss": 2.2777, "step": 194500 }, { "epoch": 1.91, "learning_rate": 8.888408516993701e-07, "loss": 2.261, "step": 195000 }, { "epoch": 1.92, "learning_rate": 8.397119078734047e-07, "loss": 2.252, "step": 195500 }, { "epoch": 1.92, "learning_rate": 7.90582964047439e-07, "loss": 2.2906, "step": 196000 }, { "epoch": 1.93, "learning_rate": 7.414540202214733e-07, "loss": 2.2968, "step": 196500 }, { "epoch": 1.93, "learning_rate": 6.923250763955077e-07, "loss": 2.2463, "step": 197000 }, { "epoch": 1.94, "learning_rate": 6.431961325695421e-07, "loss": 2.2681, "step": 197500 }, { "epoch": 1.94, "learning_rate": 5.940671887435765e-07, "loss": 2.2965, "step": 198000 }, { "epoch": 1.95, "learning_rate": 5.449382449176108e-07, "loss": 2.2705, "step": 198500 }, { "epoch": 1.95, "learning_rate": 4.958093010916452e-07, "loss": 2.2669, "step": 199000 }, { "epoch": 1.96, "learning_rate": 4.4668035726567956e-07, "loss": 2.2478, "step": 199500 }, { "epoch": 1.96, "learning_rate": 3.975514134397139e-07, "loss": 2.3053, "step": 200000 }, { "epoch": 1.97, "learning_rate": 3.484224696137483e-07, "loss": 2.2731, "step": 200500 }, { "epoch": 1.97, "learning_rate": 2.992935257877826e-07, "loss": 2.2518, "step": 201000 }, { "epoch": 1.98, "learning_rate": 2.50164581961817e-07, "loss": 2.2657, "step": 201500 }, { "epoch": 1.98, "learning_rate": 2.0103563813585136e-07, "loss": 2.2967, "step": 202000 }, { "epoch": 1.98, "learning_rate": 1.5190669430988575e-07, "loss": 2.2874, "step": 202500 }, { "epoch": 1.99, "learning_rate": 1.0277775048392012e-07, "loss": 2.2764, "step": 203000 }, { "epoch": 1.99, "learning_rate": 5.364880665795447e-08, "loss": 2.2628, "step": 203500 }, { "epoch": 2.0, "learning_rate": 4.519862831988839e-09, "loss": 2.2711, "step": 204000 } ], "max_steps": 204046, "num_train_epochs": 2, "total_flos": 6.966781556932454e+17, "trial_name": null, "trial_params": null }