{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05378368203087183, "grad_norm": 1.0489896535873413, "learning_rate": 3.125e-06, "loss": 7.5484, "step": 1000 }, { "epoch": 0.10756736406174366, "grad_norm": 0.7405256628990173, "learning_rate": 6.25e-06, "loss": 5.8085, "step": 2000 }, { "epoch": 0.1613510460926155, "grad_norm": 0.8219595551490784, "learning_rate": 9.375000000000001e-06, "loss": 5.3802, "step": 3000 }, { "epoch": 0.21513472812348733, "grad_norm": 0.9044649600982666, "learning_rate": 1.25e-05, "loss": 5.1583, "step": 4000 }, { "epoch": 0.2689184101543592, "grad_norm": 1.00751793384552, "learning_rate": 1.5625e-05, "loss": 5.0035, "step": 5000 }, { "epoch": 0.322702092185231, "grad_norm": 1.116739273071289, "learning_rate": 1.8750000000000002e-05, "loss": 4.8663, "step": 6000 }, { "epoch": 0.37648577421610285, "grad_norm": 1.1231814622879028, "learning_rate": 2.1875e-05, "loss": 4.7541, "step": 7000 }, { "epoch": 0.43026945624697466, "grad_norm": 1.0907611846923828, "learning_rate": 2.5e-05, "loss": 4.6488, "step": 8000 }, { "epoch": 0.4840531382778465, "grad_norm": 1.113344669342041, "learning_rate": 2.8125000000000003e-05, "loss": 4.5619, "step": 9000 }, { "epoch": 0.5378368203087184, "grad_norm": 1.0354745388031006, "learning_rate": 3.125e-05, "loss": 4.4881, "step": 10000 }, { "epoch": 0.5916205023395902, "grad_norm": 1.5056177377700806, "learning_rate": 3.4371875e-05, "loss": 4.4158, "step": 11000 }, { "epoch": 0.645404184370462, "grad_norm": 0.9814821481704712, "learning_rate": 3.7496875e-05, "loss": 4.3495, "step": 12000 }, { "epoch": 0.6991878664013338, "grad_norm": 1.0727801322937012, "learning_rate": 4.0621875e-05, "loss": 4.2913, "step": 13000 }, { "epoch": 0.7529715484322057, "grad_norm": 1.0662927627563477, "learning_rate": 4.374375e-05, "loss": 4.2355, "step": 14000 }, { "epoch": 0.8067552304630775, "grad_norm": 1.030743956565857, "learning_rate": 4.686875e-05, "loss": 4.1862, "step": 15000 }, { "epoch": 0.8605389124939493, "grad_norm": 1.0385651588439941, "learning_rate": 4.9990625000000004e-05, "loss": 4.1341, "step": 16000 }, { "epoch": 0.9143225945248211, "grad_norm": 0.9680750966072083, "learning_rate": 5.3115625000000005e-05, "loss": 4.0955, "step": 17000 }, { "epoch": 0.968106276555693, "grad_norm": 1.0905612707138062, "learning_rate": 5.6240625e-05, "loss": 4.0555, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.30862948193360185, "eval_loss": 4.260831832885742, "eval_runtime": 152.7708, "eval_samples_per_second": 379.117, "eval_steps_per_second": 5.924, "step": 18593 }, { "epoch": 1.0218899585865648, "grad_norm": 0.9744522571563721, "learning_rate": 5.93625e-05, "loss": 4.02, "step": 19000 }, { "epoch": 1.0756736406174368, "grad_norm": 0.952312707901001, "learning_rate": 6.24875e-05, "loss": 3.9723, "step": 20000 }, { "epoch": 1.1294573226483084, "grad_norm": 0.9960818290710449, "learning_rate": 6.56125e-05, "loss": 3.9319, "step": 21000 }, { "epoch": 1.1832410046791804, "grad_norm": 1.0043728351593018, "learning_rate": 6.8734375e-05, "loss": 3.8906, "step": 22000 }, { "epoch": 1.2370246867100523, "grad_norm": 0.9806647300720215, "learning_rate": 7.185937500000001e-05, "loss": 3.8556, "step": 23000 }, { "epoch": 1.290808368740924, "grad_norm": 0.9609583020210266, "learning_rate": 7.4978125e-05, "loss": 3.8292, "step": 24000 }, { "epoch": 1.3445920507717959, "grad_norm": 0.9188491106033325, "learning_rate": 7.8103125e-05, "loss": 3.8004, "step": 25000 }, { "epoch": 1.3983757328026676, "grad_norm": 0.932732880115509, "learning_rate": 8.122500000000001e-05, "loss": 3.769, "step": 26000 }, { "epoch": 1.4521594148335395, "grad_norm": 0.8833909034729004, "learning_rate": 8.435e-05, "loss": 3.748, "step": 27000 }, { "epoch": 1.5059430968644114, "grad_norm": 0.9042672514915466, "learning_rate": 8.746875e-05, "loss": 3.7239, "step": 28000 }, { "epoch": 1.5597267788952833, "grad_norm": 0.9524121880531311, "learning_rate": 9.059375e-05, "loss": 3.7076, "step": 29000 }, { "epoch": 1.613510460926155, "grad_norm": 0.8914125561714172, "learning_rate": 9.3715625e-05, "loss": 3.6853, "step": 30000 }, { "epoch": 1.6672941429570267, "grad_norm": 0.8666671514511108, "learning_rate": 9.68375e-05, "loss": 3.6694, "step": 31000 }, { "epoch": 1.7210778249878986, "grad_norm": 0.8737355470657349, "learning_rate": 9.99625e-05, "loss": 3.6462, "step": 32000 }, { "epoch": 1.7748615070187705, "grad_norm": 0.8654928803443909, "learning_rate": 9.970929206143706e-05, "loss": 3.6316, "step": 33000 }, { "epoch": 1.8286451890496425, "grad_norm": 0.8172135949134827, "learning_rate": 9.941505325722356e-05, "loss": 3.616, "step": 34000 }, { "epoch": 1.8824288710805142, "grad_norm": 0.8614993691444397, "learning_rate": 9.912110869181429e-05, "loss": 3.5943, "step": 35000 }, { "epoch": 1.9362125531113858, "grad_norm": 0.8271329998970032, "learning_rate": 9.882716412640499e-05, "loss": 3.5805, "step": 36000 }, { "epoch": 1.9899962351422578, "grad_norm": 0.8484081029891968, "learning_rate": 9.85329253221915e-05, "loss": 3.5626, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.3633626394917919, "eval_loss": 3.7489588260650635, "eval_runtime": 153.9094, "eval_samples_per_second": 376.312, "eval_steps_per_second": 5.88, "step": 37186 }, { "epoch": 2.0437799171731297, "grad_norm": 0.834202766418457, "learning_rate": 9.8238686517978e-05, "loss": 3.5259, "step": 38000 }, { "epoch": 2.0975635992040016, "grad_norm": 0.8602707982063293, "learning_rate": 9.79444477137645e-05, "loss": 3.514, "step": 39000 }, { "epoch": 2.1513472812348735, "grad_norm": 0.7993927001953125, "learning_rate": 9.765050314835522e-05, "loss": 3.4999, "step": 40000 }, { "epoch": 2.205130963265745, "grad_norm": 0.8245725035667419, "learning_rate": 9.735626434414172e-05, "loss": 3.4915, "step": 41000 }, { "epoch": 2.258914645296617, "grad_norm": 0.8083192110061646, "learning_rate": 9.706261401753663e-05, "loss": 3.4863, "step": 42000 }, { "epoch": 2.312698327327489, "grad_norm": 0.8514248728752136, "learning_rate": 9.676837521332313e-05, "loss": 3.4754, "step": 43000 }, { "epoch": 2.3664820093583607, "grad_norm": 0.8821371793746948, "learning_rate": 9.647413640910963e-05, "loss": 3.468, "step": 44000 }, { "epoch": 2.4202656913892326, "grad_norm": 0.8014013767242432, "learning_rate": 9.617989760489615e-05, "loss": 3.4582, "step": 45000 }, { "epoch": 2.4740493734201046, "grad_norm": 0.8032485246658325, "learning_rate": 9.588565880068265e-05, "loss": 3.4497, "step": 46000 }, { "epoch": 2.527833055450976, "grad_norm": 0.8060674667358398, "learning_rate": 9.559171423527336e-05, "loss": 3.4412, "step": 47000 }, { "epoch": 2.581616737481848, "grad_norm": 0.7642372250556946, "learning_rate": 9.529747543105986e-05, "loss": 3.4374, "step": 48000 }, { "epoch": 2.63540041951272, "grad_norm": 0.8085050582885742, "learning_rate": 9.500382510445478e-05, "loss": 3.4257, "step": 49000 }, { "epoch": 2.6891841015435918, "grad_norm": 0.7765064239501953, "learning_rate": 9.470958630024128e-05, "loss": 3.4264, "step": 50000 }, { "epoch": 2.7429677835744632, "grad_norm": 0.7633680105209351, "learning_rate": 9.441534749602779e-05, "loss": 3.4157, "step": 51000 }, { "epoch": 2.796751465605335, "grad_norm": 0.7525299191474915, "learning_rate": 9.412140293061849e-05, "loss": 3.4093, "step": 52000 }, { "epoch": 2.850535147636207, "grad_norm": 0.8231662511825562, "learning_rate": 9.3827164126405e-05, "loss": 3.4054, "step": 53000 }, { "epoch": 2.904318829667079, "grad_norm": 0.7820568084716797, "learning_rate": 9.35329253221915e-05, "loss": 3.3949, "step": 54000 }, { "epoch": 2.958102511697951, "grad_norm": 0.7471756935119629, "learning_rate": 9.323898075678222e-05, "loss": 3.3926, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.38042721878496405, "eval_loss": 3.5728847980499268, "eval_runtime": 153.8413, "eval_samples_per_second": 376.479, "eval_steps_per_second": 5.883, "step": 55779 }, { "epoch": 3.011886193728823, "grad_norm": 0.7675151824951172, "learning_rate": 9.294474195256872e-05, "loss": 3.373, "step": 56000 }, { "epoch": 3.0656698757596943, "grad_norm": 0.8319113850593567, "learning_rate": 9.26505031483552e-05, "loss": 3.3312, "step": 57000 }, { "epoch": 3.119453557790566, "grad_norm": 0.745871365070343, "learning_rate": 9.23562643441417e-05, "loss": 3.3325, "step": 58000 }, { "epoch": 3.173237239821438, "grad_norm": 0.7976841926574707, "learning_rate": 9.206231977873242e-05, "loss": 3.3371, "step": 59000 }, { "epoch": 3.22702092185231, "grad_norm": 0.7604719996452332, "learning_rate": 9.176808097451893e-05, "loss": 3.3258, "step": 60000 }, { "epoch": 3.280804603883182, "grad_norm": 0.7433556318283081, "learning_rate": 9.147413640910963e-05, "loss": 3.329, "step": 61000 }, { "epoch": 3.3345882859140534, "grad_norm": 0.7537267804145813, "learning_rate": 9.117989760489613e-05, "loss": 3.3206, "step": 62000 }, { "epoch": 3.3883719679449253, "grad_norm": 0.7688448429107666, "learning_rate": 9.088595303948685e-05, "loss": 3.3202, "step": 63000 }, { "epoch": 3.4421556499757973, "grad_norm": 0.7358716726303101, "learning_rate": 9.059171423527336e-05, "loss": 3.3166, "step": 64000 }, { "epoch": 3.495939332006669, "grad_norm": 0.7672792673110962, "learning_rate": 9.029747543105986e-05, "loss": 3.317, "step": 65000 }, { "epoch": 3.549723014037541, "grad_norm": 0.7670078873634338, "learning_rate": 9.000353086565056e-05, "loss": 3.3139, "step": 66000 }, { "epoch": 3.603506696068413, "grad_norm": 0.7238633632659912, "learning_rate": 8.970929206143706e-05, "loss": 3.3109, "step": 67000 }, { "epoch": 3.657290378099285, "grad_norm": 0.6910108923912048, "learning_rate": 8.941505325722357e-05, "loss": 3.31, "step": 68000 }, { "epoch": 3.7110740601301564, "grad_norm": 0.7354035973548889, "learning_rate": 8.912110869181429e-05, "loss": 3.3028, "step": 69000 }, { "epoch": 3.7648577421610283, "grad_norm": 0.7346329092979431, "learning_rate": 8.882686988760079e-05, "loss": 3.3016, "step": 70000 }, { "epoch": 3.8186414241919002, "grad_norm": 0.7276666164398193, "learning_rate": 8.85329253221915e-05, "loss": 3.2962, "step": 71000 }, { "epoch": 3.872425106222772, "grad_norm": 0.7881675958633423, "learning_rate": 8.823927499558642e-05, "loss": 3.2929, "step": 72000 }, { "epoch": 3.9262087882536436, "grad_norm": 0.731143593788147, "learning_rate": 8.794503619137292e-05, "loss": 3.2931, "step": 73000 }, { "epoch": 3.9799924702845155, "grad_norm": 0.7707085013389587, "learning_rate": 8.765079738715942e-05, "loss": 3.2863, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.38832393254759884, "eval_loss": 3.512221574783325, "eval_runtime": 153.8816, "eval_samples_per_second": 376.38, "eval_steps_per_second": 5.881, "step": 74372 }, { "epoch": 4.033776152315387, "grad_norm": 0.8132415413856506, "learning_rate": 8.735655858294592e-05, "loss": 3.2565, "step": 75000 }, { "epoch": 4.087559834346259, "grad_norm": 0.7561783790588379, "learning_rate": 8.706231977873243e-05, "loss": 3.2326, "step": 76000 }, { "epoch": 4.141343516377131, "grad_norm": 0.757048487663269, "learning_rate": 8.676808097451893e-05, "loss": 3.2321, "step": 77000 }, { "epoch": 4.195127198408003, "grad_norm": 0.7583024501800537, "learning_rate": 8.647384217030541e-05, "loss": 3.2371, "step": 78000 }, { "epoch": 4.248910880438875, "grad_norm": 0.7448434233665466, "learning_rate": 8.617989760489615e-05, "loss": 3.2365, "step": 79000 }, { "epoch": 4.302694562469747, "grad_norm": 0.7461341619491577, "learning_rate": 8.588565880068264e-05, "loss": 3.2369, "step": 80000 }, { "epoch": 4.356478244500618, "grad_norm": 0.7581353187561035, "learning_rate": 8.559141999646914e-05, "loss": 3.2349, "step": 81000 }, { "epoch": 4.41026192653149, "grad_norm": 0.7130771279335022, "learning_rate": 8.529718119225564e-05, "loss": 3.2389, "step": 82000 }, { "epoch": 4.464045608562362, "grad_norm": 0.7467326521873474, "learning_rate": 8.500323662684634e-05, "loss": 3.2338, "step": 83000 }, { "epoch": 4.517829290593234, "grad_norm": 0.7349050641059875, "learning_rate": 8.470929206143707e-05, "loss": 3.231, "step": 84000 }, { "epoch": 4.571612972624106, "grad_norm": 0.7301473021507263, "learning_rate": 8.441505325722357e-05, "loss": 3.2323, "step": 85000 }, { "epoch": 4.625396654654978, "grad_norm": 0.7459990978240967, "learning_rate": 8.412110869181427e-05, "loss": 3.2319, "step": 86000 }, { "epoch": 4.6791803366858495, "grad_norm": 0.7310500144958496, "learning_rate": 8.382686988760077e-05, "loss": 3.2316, "step": 87000 }, { "epoch": 4.7329640187167215, "grad_norm": 0.7355625033378601, "learning_rate": 8.35329253221915e-05, "loss": 3.2298, "step": 88000 }, { "epoch": 4.786747700747593, "grad_norm": 0.7653241157531738, "learning_rate": 8.3238686517978e-05, "loss": 3.2223, "step": 89000 }, { "epoch": 4.840531382778465, "grad_norm": 0.7360557913780212, "learning_rate": 8.29447419525687e-05, "loss": 3.2246, "step": 90000 }, { "epoch": 4.894315064809337, "grad_norm": 0.726395308971405, "learning_rate": 8.26505031483552e-05, "loss": 3.2265, "step": 91000 }, { "epoch": 4.948098746840209, "grad_norm": 0.7324568033218384, "learning_rate": 8.23562643441417e-05, "loss": 3.2223, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.3931525087864058, "eval_loss": 3.470435857772827, "eval_runtime": 153.6492, "eval_samples_per_second": 376.95, "eval_steps_per_second": 5.89, "step": 92965 }, { "epoch": 5.00188242887108, "grad_norm": 0.7105480432510376, "learning_rate": 8.20620255399282e-05, "loss": 3.2157, "step": 93000 }, { "epoch": 5.055666110901952, "grad_norm": 0.77333664894104, "learning_rate": 8.176778673571471e-05, "loss": 3.167, "step": 94000 }, { "epoch": 5.109449792932824, "grad_norm": 0.7714924812316895, "learning_rate": 8.147384217030543e-05, "loss": 3.1654, "step": 95000 }, { "epoch": 5.163233474963696, "grad_norm": 0.7432717084884644, "learning_rate": 8.117960336609193e-05, "loss": 3.1702, "step": 96000 }, { "epoch": 5.217017156994568, "grad_norm": 0.7248101830482483, "learning_rate": 8.088536456187843e-05, "loss": 3.1683, "step": 97000 }, { "epoch": 5.27080083902544, "grad_norm": 0.7558987140655518, "learning_rate": 8.059112575766492e-05, "loss": 3.1737, "step": 98000 }, { "epoch": 5.324584521056312, "grad_norm": 0.7432435750961304, "learning_rate": 8.029718119225564e-05, "loss": 3.1774, "step": 99000 }, { "epoch": 5.3783682030871836, "grad_norm": 0.7622554898262024, "learning_rate": 8.000323662684636e-05, "loss": 3.1746, "step": 100000 }, { "epoch": 5.4321518851180555, "grad_norm": 0.742205798625946, "learning_rate": 7.970899782263285e-05, "loss": 3.1724, "step": 101000 }, { "epoch": 5.485935567148927, "grad_norm": 0.7343482971191406, "learning_rate": 7.941475901841935e-05, "loss": 3.1735, "step": 102000 }, { "epoch": 5.539719249179798, "grad_norm": 0.7449206709861755, "learning_rate": 7.912052021420585e-05, "loss": 3.1736, "step": 103000 }, { "epoch": 5.59350293121067, "grad_norm": 0.7648908495903015, "learning_rate": 7.882628140999235e-05, "loss": 3.1748, "step": 104000 }, { "epoch": 5.647286613241542, "grad_norm": 0.706194281578064, "learning_rate": 7.853233684458307e-05, "loss": 3.1724, "step": 105000 }, { "epoch": 5.701070295272414, "grad_norm": 0.7112085819244385, "learning_rate": 7.823839227917378e-05, "loss": 3.173, "step": 106000 }, { "epoch": 5.754853977303286, "grad_norm": 0.7374659776687622, "learning_rate": 7.794415347496028e-05, "loss": 3.174, "step": 107000 }, { "epoch": 5.808637659334158, "grad_norm": 0.7422733902931213, "learning_rate": 7.764991467074678e-05, "loss": 3.1727, "step": 108000 }, { "epoch": 5.86242134136503, "grad_norm": 0.7205289602279663, "learning_rate": 7.735567586653328e-05, "loss": 3.1717, "step": 109000 }, { "epoch": 5.916205023395902, "grad_norm": 0.7607922554016113, "learning_rate": 7.706143706231978e-05, "loss": 3.1679, "step": 110000 }, { "epoch": 5.969988705426774, "grad_norm": 0.7205678224563599, "learning_rate": 7.67674924969105e-05, "loss": 3.1687, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.39598443418529594, "eval_loss": 3.4542243480682373, "eval_runtime": 154.6543, "eval_samples_per_second": 374.5, "eval_steps_per_second": 5.852, "step": 111558 }, { "epoch": 6.023772387457646, "grad_norm": 0.7311733365058899, "learning_rate": 7.6473253692697e-05, "loss": 3.143, "step": 112000 }, { "epoch": 6.077556069488518, "grad_norm": 0.7482650876045227, "learning_rate": 7.617930912728771e-05, "loss": 3.1172, "step": 113000 }, { "epoch": 6.131339751519389, "grad_norm": 0.768342912197113, "learning_rate": 7.588507032307421e-05, "loss": 3.1151, "step": 114000 }, { "epoch": 6.1851234335502605, "grad_norm": 0.769808828830719, "learning_rate": 7.559112575766493e-05, "loss": 3.1213, "step": 115000 }, { "epoch": 6.238907115581132, "grad_norm": 0.7565628290176392, "learning_rate": 7.529688695345143e-05, "loss": 3.1255, "step": 116000 }, { "epoch": 6.292690797612004, "grad_norm": 0.7597582340240479, "learning_rate": 7.500264814923792e-05, "loss": 3.1226, "step": 117000 }, { "epoch": 6.346474479642876, "grad_norm": 0.7350876331329346, "learning_rate": 7.470840934502442e-05, "loss": 3.1263, "step": 118000 }, { "epoch": 6.400258161673748, "grad_norm": 0.734434962272644, "learning_rate": 7.441475901841936e-05, "loss": 3.1267, "step": 119000 }, { "epoch": 6.45404184370462, "grad_norm": 0.7643101215362549, "learning_rate": 7.412052021420586e-05, "loss": 3.13, "step": 120000 }, { "epoch": 6.507825525735492, "grad_norm": 0.7487729787826538, "learning_rate": 7.382628140999235e-05, "loss": 3.1309, "step": 121000 }, { "epoch": 6.561609207766364, "grad_norm": 0.7111514806747437, "learning_rate": 7.353204260577885e-05, "loss": 3.1298, "step": 122000 }, { "epoch": 6.615392889797236, "grad_norm": 0.7280795574188232, "learning_rate": 7.323780380156535e-05, "loss": 3.1316, "step": 123000 }, { "epoch": 6.669176571828107, "grad_norm": 0.7801093459129333, "learning_rate": 7.294385923615607e-05, "loss": 3.1235, "step": 124000 }, { "epoch": 6.722960253858979, "grad_norm": 0.7695817351341248, "learning_rate": 7.264962043194257e-05, "loss": 3.1298, "step": 125000 }, { "epoch": 6.776743935889851, "grad_norm": 0.7277592420578003, "learning_rate": 7.235538162772907e-05, "loss": 3.1337, "step": 126000 }, { "epoch": 6.830527617920723, "grad_norm": 0.7386214137077332, "learning_rate": 7.206143706231978e-05, "loss": 3.1248, "step": 127000 }, { "epoch": 6.8843112999515945, "grad_norm": 0.7697268128395081, "learning_rate": 7.176719825810628e-05, "loss": 3.1267, "step": 128000 }, { "epoch": 6.938094981982466, "grad_norm": 0.7416918873786926, "learning_rate": 7.147325369269699e-05, "loss": 3.1255, "step": 129000 }, { "epoch": 6.991878664013338, "grad_norm": 0.7437503933906555, "learning_rate": 7.11790148884835e-05, "loss": 3.1265, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.3987135038494649, "eval_loss": 3.430708169937134, "eval_runtime": 154.0366, "eval_samples_per_second": 376.001, "eval_steps_per_second": 5.875, "step": 130151 }, { "epoch": 7.04566234604421, "grad_norm": 0.7688168883323669, "learning_rate": 7.088507032307421e-05, "loss": 3.0768, "step": 131000 }, { "epoch": 7.099446028075082, "grad_norm": 0.7706940174102783, "learning_rate": 7.059083151886071e-05, "loss": 3.0727, "step": 132000 }, { "epoch": 7.153229710105954, "grad_norm": 0.7756544947624207, "learning_rate": 7.029688695345142e-05, "loss": 3.0784, "step": 133000 }, { "epoch": 7.207013392136826, "grad_norm": 0.7629918456077576, "learning_rate": 7.000264814923792e-05, "loss": 3.0812, "step": 134000 }, { "epoch": 7.260797074167698, "grad_norm": 0.7643315196037292, "learning_rate": 6.970840934502443e-05, "loss": 3.0791, "step": 135000 }, { "epoch": 7.314580756198569, "grad_norm": 0.7508428692817688, "learning_rate": 6.941417054081093e-05, "loss": 3.0904, "step": 136000 }, { "epoch": 7.368364438229441, "grad_norm": 0.749332070350647, "learning_rate": 6.912022597540164e-05, "loss": 3.0907, "step": 137000 }, { "epoch": 7.422148120260313, "grad_norm": 0.7576011419296265, "learning_rate": 6.882598717118814e-05, "loss": 3.0874, "step": 138000 }, { "epoch": 7.475931802291185, "grad_norm": 0.7459414601325989, "learning_rate": 6.853174836697463e-05, "loss": 3.0893, "step": 139000 }, { "epoch": 7.529715484322057, "grad_norm": 0.7699885964393616, "learning_rate": 6.823750956276113e-05, "loss": 3.0894, "step": 140000 }, { "epoch": 7.5834991663529285, "grad_norm": 0.7432721853256226, "learning_rate": 6.794327075854765e-05, "loss": 3.0884, "step": 141000 }, { "epoch": 7.6372828483838004, "grad_norm": 0.7425631880760193, "learning_rate": 6.764903195433415e-05, "loss": 3.0955, "step": 142000 }, { "epoch": 7.691066530414672, "grad_norm": 0.7397525906562805, "learning_rate": 6.735508738892485e-05, "loss": 3.0927, "step": 143000 }, { "epoch": 7.744850212445544, "grad_norm": 0.8293583989143372, "learning_rate": 6.706084858471135e-05, "loss": 3.0944, "step": 144000 }, { "epoch": 7.798633894476416, "grad_norm": 0.7823474407196045, "learning_rate": 6.676690401930206e-05, "loss": 3.092, "step": 145000 }, { "epoch": 7.852417576507287, "grad_norm": 0.7494142651557922, "learning_rate": 6.647266521508858e-05, "loss": 3.0927, "step": 146000 }, { "epoch": 7.906201258538159, "grad_norm": 0.7707638144493103, "learning_rate": 6.617872064967928e-05, "loss": 3.0927, "step": 147000 }, { "epoch": 7.959984940569031, "grad_norm": 0.7771040797233582, "learning_rate": 6.588448184546578e-05, "loss": 3.0955, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.4013845282133079, "eval_loss": 3.4010231494903564, "eval_runtime": 154.2403, "eval_samples_per_second": 375.505, "eval_steps_per_second": 5.867, "step": 148744 }, { "epoch": 8.013768622599903, "grad_norm": 0.7500186562538147, "learning_rate": 6.559024304125229e-05, "loss": 3.0751, "step": 149000 }, { "epoch": 8.067552304630775, "grad_norm": 0.7857389450073242, "learning_rate": 6.529600423703879e-05, "loss": 3.038, "step": 150000 }, { "epoch": 8.121335986661647, "grad_norm": 0.7659834027290344, "learning_rate": 6.500205967162951e-05, "loss": 3.0429, "step": 151000 }, { "epoch": 8.175119668692519, "grad_norm": 0.7773808240890503, "learning_rate": 6.470811510622021e-05, "loss": 3.0451, "step": 152000 }, { "epoch": 8.22890335072339, "grad_norm": 0.7654848694801331, "learning_rate": 6.441387630200672e-05, "loss": 3.0463, "step": 153000 }, { "epoch": 8.282687032754263, "grad_norm": 0.7545380592346191, "learning_rate": 6.411963749779322e-05, "loss": 3.0458, "step": 154000 }, { "epoch": 8.336470714785134, "grad_norm": 0.7594432830810547, "learning_rate": 6.382569293238392e-05, "loss": 3.0503, "step": 155000 }, { "epoch": 8.390254396816006, "grad_norm": 0.7385092973709106, "learning_rate": 6.353145412817044e-05, "loss": 3.0529, "step": 156000 }, { "epoch": 8.444038078846878, "grad_norm": 0.7623139023780823, "learning_rate": 6.323750956276114e-05, "loss": 3.056, "step": 157000 }, { "epoch": 8.49782176087775, "grad_norm": 0.7708114385604858, "learning_rate": 6.294327075854765e-05, "loss": 3.0551, "step": 158000 }, { "epoch": 8.551605442908622, "grad_norm": 0.7581725120544434, "learning_rate": 6.264903195433413e-05, "loss": 3.0606, "step": 159000 }, { "epoch": 8.605389124939494, "grad_norm": 0.7970029711723328, "learning_rate": 6.235479315012063e-05, "loss": 3.0621, "step": 160000 }, { "epoch": 8.659172806970366, "grad_norm": 0.759104311466217, "learning_rate": 6.206084858471135e-05, "loss": 3.0587, "step": 161000 }, { "epoch": 8.712956489001236, "grad_norm": 0.7619072794914246, "learning_rate": 6.176660978049786e-05, "loss": 3.0615, "step": 162000 }, { "epoch": 8.766740171032108, "grad_norm": 0.7338131070137024, "learning_rate": 6.147266521508856e-05, "loss": 3.0647, "step": 163000 }, { "epoch": 8.82052385306298, "grad_norm": 0.7602887153625488, "learning_rate": 6.117872064967928e-05, "loss": 3.0653, "step": 164000 }, { "epoch": 8.874307535093852, "grad_norm": 0.7433264255523682, "learning_rate": 6.088448184546578e-05, "loss": 3.0627, "step": 165000 }, { "epoch": 8.928091217124724, "grad_norm": 0.7426097989082336, "learning_rate": 6.059024304125228e-05, "loss": 3.0664, "step": 166000 }, { "epoch": 8.981874899155596, "grad_norm": 0.7173585891723633, "learning_rate": 6.0296004237038787e-05, "loss": 3.0614, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.4025592065422428, "eval_loss": 3.394737958908081, "eval_runtime": 154.2459, "eval_samples_per_second": 375.491, "eval_steps_per_second": 5.867, "step": 167337 }, { "epoch": 9.035658581186468, "grad_norm": 0.8185796141624451, "learning_rate": 6.000176543282529e-05, "loss": 3.0225, "step": 168000 }, { "epoch": 9.08944226321734, "grad_norm": 0.7936908602714539, "learning_rate": 5.970752662861179e-05, "loss": 3.0087, "step": 169000 }, { "epoch": 9.143225945248211, "grad_norm": 0.7915844321250916, "learning_rate": 5.94135820632025e-05, "loss": 3.0149, "step": 170000 }, { "epoch": 9.197009627279083, "grad_norm": 0.7934896945953369, "learning_rate": 5.9119343258989e-05, "loss": 3.0156, "step": 171000 }, { "epoch": 9.250793309309955, "grad_norm": 0.7754538059234619, "learning_rate": 5.882539869357972e-05, "loss": 3.0215, "step": 172000 }, { "epoch": 9.304576991340827, "grad_norm": 0.7899085879325867, "learning_rate": 5.853115988936622e-05, "loss": 3.0223, "step": 173000 }, { "epoch": 9.358360673371699, "grad_norm": 0.7922378182411194, "learning_rate": 5.823750956276114e-05, "loss": 3.0266, "step": 174000 }, { "epoch": 9.412144355402571, "grad_norm": 0.8085660338401794, "learning_rate": 5.794327075854764e-05, "loss": 3.0204, "step": 175000 }, { "epoch": 9.465928037433443, "grad_norm": 0.8308489322662354, "learning_rate": 5.764903195433414e-05, "loss": 3.0297, "step": 176000 }, { "epoch": 9.519711719464315, "grad_norm": 0.7885105609893799, "learning_rate": 5.735479315012065e-05, "loss": 3.0309, "step": 177000 }, { "epoch": 9.573495401495187, "grad_norm": 0.7959656715393066, "learning_rate": 5.7060554345907135e-05, "loss": 3.0299, "step": 178000 }, { "epoch": 9.627279083526059, "grad_norm": 0.8052105903625488, "learning_rate": 5.6766609780497856e-05, "loss": 3.0341, "step": 179000 }, { "epoch": 9.68106276555693, "grad_norm": 0.77768474817276, "learning_rate": 5.647237097628435e-05, "loss": 3.0311, "step": 180000 }, { "epoch": 9.734846447587802, "grad_norm": 0.7868794202804565, "learning_rate": 5.617842641087507e-05, "loss": 3.037, "step": 181000 }, { "epoch": 9.788630129618674, "grad_norm": 0.7672579884529114, "learning_rate": 5.5884187606661565e-05, "loss": 3.0332, "step": 182000 }, { "epoch": 9.842413811649546, "grad_norm": 0.7784843444824219, "learning_rate": 5.5589948802448066e-05, "loss": 3.0331, "step": 183000 }, { "epoch": 9.896197493680418, "grad_norm": 0.8073210120201111, "learning_rate": 5.529600423703878e-05, "loss": 3.0338, "step": 184000 }, { "epoch": 9.949981175711288, "grad_norm": 0.7723698616027832, "learning_rate": 5.500176543282528e-05, "loss": 3.0346, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.40372259828500323, "eval_loss": 3.386077642440796, "eval_runtime": 153.8424, "eval_samples_per_second": 376.476, "eval_steps_per_second": 5.883, "step": 185930 }, { "epoch": 10.00376485774216, "grad_norm": 0.7879741191864014, "learning_rate": 5.470752662861178e-05, "loss": 3.035, "step": 186000 }, { "epoch": 10.057548539773032, "grad_norm": 0.7588198781013489, "learning_rate": 5.441328782439828e-05, "loss": 2.9797, "step": 187000 }, { "epoch": 10.111332221803904, "grad_norm": 0.7911401987075806, "learning_rate": 5.4119343258989e-05, "loss": 2.9867, "step": 188000 }, { "epoch": 10.165115903834776, "grad_norm": 0.834837794303894, "learning_rate": 5.38251044547755e-05, "loss": 2.9866, "step": 189000 }, { "epoch": 10.218899585865648, "grad_norm": 0.785953938961029, "learning_rate": 5.353115988936621e-05, "loss": 2.9908, "step": 190000 }, { "epoch": 10.27268326789652, "grad_norm": 0.7968313694000244, "learning_rate": 5.323692108515271e-05, "loss": 2.9986, "step": 191000 }, { "epoch": 10.326466949927392, "grad_norm": 0.815880298614502, "learning_rate": 5.294297651974343e-05, "loss": 2.9967, "step": 192000 }, { "epoch": 10.380250631958264, "grad_norm": 0.8155861496925354, "learning_rate": 5.264873771552993e-05, "loss": 3.0001, "step": 193000 }, { "epoch": 10.434034313989136, "grad_norm": 0.8102470636367798, "learning_rate": 5.235479315012064e-05, "loss": 3.0023, "step": 194000 }, { "epoch": 10.487817996020008, "grad_norm": 0.8228176832199097, "learning_rate": 5.206055434590714e-05, "loss": 3.0047, "step": 195000 }, { "epoch": 10.54160167805088, "grad_norm": 0.810368537902832, "learning_rate": 5.1766315541693643e-05, "loss": 3.0041, "step": 196000 }, { "epoch": 10.595385360081751, "grad_norm": 0.8073120713233948, "learning_rate": 5.1472076737480144e-05, "loss": 3.0071, "step": 197000 }, { "epoch": 10.649169042112623, "grad_norm": 0.7942905426025391, "learning_rate": 5.117813217207086e-05, "loss": 3.0055, "step": 198000 }, { "epoch": 10.702952724143495, "grad_norm": 0.8009095788002014, "learning_rate": 5.088389336785736e-05, "loss": 3.0065, "step": 199000 }, { "epoch": 10.756736406174367, "grad_norm": 0.7769667506217957, "learning_rate": 5.0589654563643853e-05, "loss": 3.0097, "step": 200000 }, { "epoch": 10.810520088205239, "grad_norm": 0.7919924259185791, "learning_rate": 5.0295415759430354e-05, "loss": 3.0101, "step": 201000 }, { "epoch": 10.864303770236111, "grad_norm": 0.7941082715988159, "learning_rate": 5.000147119402107e-05, "loss": 3.0089, "step": 202000 }, { "epoch": 10.918087452266983, "grad_norm": 0.764107346534729, "learning_rate": 4.970752662861178e-05, "loss": 3.0077, "step": 203000 }, { "epoch": 10.971871134297855, "grad_norm": 0.7957196235656738, "learning_rate": 4.941328782439828e-05, "loss": 3.0121, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.40473771922010227, "eval_loss": 3.37876033782959, "eval_runtime": 154.4072, "eval_samples_per_second": 375.099, "eval_steps_per_second": 5.861, "step": 204523 }, { "epoch": 11.025654816328727, "grad_norm": 0.826215922832489, "learning_rate": 4.9119049020184784e-05, "loss": 2.9851, "step": 205000 }, { "epoch": 11.079438498359597, "grad_norm": 0.8113217353820801, "learning_rate": 4.88251044547755e-05, "loss": 2.9589, "step": 206000 }, { "epoch": 11.133222180390469, "grad_norm": 0.8072004318237305, "learning_rate": 4.8530865650562e-05, "loss": 2.9636, "step": 207000 }, { "epoch": 11.18700586242134, "grad_norm": 0.8238457441329956, "learning_rate": 4.82366268463485e-05, "loss": 2.969, "step": 208000 }, { "epoch": 11.240789544452213, "grad_norm": 0.8087642788887024, "learning_rate": 4.7942388042135e-05, "loss": 2.9709, "step": 209000 }, { "epoch": 11.294573226483084, "grad_norm": 0.7844156622886658, "learning_rate": 4.764873771552993e-05, "loss": 2.9751, "step": 210000 }, { "epoch": 11.348356908513956, "grad_norm": 0.8092362284660339, "learning_rate": 4.735449891131642e-05, "loss": 2.9723, "step": 211000 }, { "epoch": 11.402140590544828, "grad_norm": 0.833483874797821, "learning_rate": 4.706055434590714e-05, "loss": 2.9742, "step": 212000 }, { "epoch": 11.4559242725757, "grad_norm": 0.8131833672523499, "learning_rate": 4.676631554169364e-05, "loss": 2.981, "step": 213000 }, { "epoch": 11.509707954606572, "grad_norm": 0.8103277683258057, "learning_rate": 4.647207673748014e-05, "loss": 2.98, "step": 214000 }, { "epoch": 11.563491636637444, "grad_norm": 0.8259956240653992, "learning_rate": 4.6177837933266646e-05, "loss": 2.9833, "step": 215000 }, { "epoch": 11.617275318668316, "grad_norm": 0.8347487449645996, "learning_rate": 4.588389336785735e-05, "loss": 2.978, "step": 216000 }, { "epoch": 11.671059000699188, "grad_norm": 0.8020747303962708, "learning_rate": 4.558965456364386e-05, "loss": 2.9826, "step": 217000 }, { "epoch": 11.72484268273006, "grad_norm": 0.7874395251274109, "learning_rate": 4.529570999823457e-05, "loss": 2.9874, "step": 218000 }, { "epoch": 11.778626364760932, "grad_norm": 0.816592812538147, "learning_rate": 4.500147119402107e-05, "loss": 2.9812, "step": 219000 }, { "epoch": 11.832410046791804, "grad_norm": 0.8037729263305664, "learning_rate": 4.470752662861178e-05, "loss": 2.9872, "step": 220000 }, { "epoch": 11.886193728822676, "grad_norm": 0.7837305068969727, "learning_rate": 4.44135820632025e-05, "loss": 2.9866, "step": 221000 }, { "epoch": 11.939977410853547, "grad_norm": 0.7976572513580322, "learning_rate": 4.4119343258989e-05, "loss": 2.9896, "step": 222000 }, { "epoch": 11.99376109288442, "grad_norm": 0.802457869052887, "learning_rate": 4.382539869357971e-05, "loss": 2.9917, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.40500792546768455, "eval_loss": 3.3736560344696045, "eval_runtime": 154.2381, "eval_samples_per_second": 375.51, "eval_steps_per_second": 5.868, "step": 223116 }, { "epoch": 12.047544774915291, "grad_norm": 0.8179975748062134, "learning_rate": 4.353115988936621e-05, "loss": 2.9436, "step": 224000 }, { "epoch": 12.101328456946163, "grad_norm": 0.8451590538024902, "learning_rate": 4.323692108515271e-05, "loss": 2.9435, "step": 225000 }, { "epoch": 12.155112138977035, "grad_norm": 0.8380730748176575, "learning_rate": 4.2942682280939214e-05, "loss": 2.9481, "step": 226000 }, { "epoch": 12.208895821007907, "grad_norm": 0.8392196297645569, "learning_rate": 4.264873771552992e-05, "loss": 2.9472, "step": 227000 }, { "epoch": 12.262679503038777, "grad_norm": 0.8227624893188477, "learning_rate": 4.235449891131642e-05, "loss": 2.9494, "step": 228000 }, { "epoch": 12.316463185069649, "grad_norm": 0.824691653251648, "learning_rate": 4.206026010710293e-05, "loss": 2.9491, "step": 229000 }, { "epoch": 12.370246867100521, "grad_norm": 0.829526960849762, "learning_rate": 4.176631554169364e-05, "loss": 2.957, "step": 230000 }, { "epoch": 12.424030549131393, "grad_norm": 0.8544576168060303, "learning_rate": 4.1472076737480145e-05, "loss": 2.9542, "step": 231000 }, { "epoch": 12.477814231162265, "grad_norm": 0.8392364978790283, "learning_rate": 4.117813217207085e-05, "loss": 2.9559, "step": 232000 }, { "epoch": 12.531597913193137, "grad_norm": 0.8318558931350708, "learning_rate": 4.0883893367857353e-05, "loss": 2.9587, "step": 233000 }, { "epoch": 12.585381595224009, "grad_norm": 0.8154683709144592, "learning_rate": 4.0589654563643854e-05, "loss": 2.9603, "step": 234000 }, { "epoch": 12.63916527725488, "grad_norm": 0.8392585515975952, "learning_rate": 4.0295415759430355e-05, "loss": 2.9579, "step": 235000 }, { "epoch": 12.692948959285753, "grad_norm": 0.8337314128875732, "learning_rate": 4.0001176955216856e-05, "loss": 2.9643, "step": 236000 }, { "epoch": 12.746732641316624, "grad_norm": 0.8414183259010315, "learning_rate": 3.970752662861178e-05, "loss": 2.9609, "step": 237000 }, { "epoch": 12.800516323347496, "grad_norm": 0.8429349064826965, "learning_rate": 3.9413287824398284e-05, "loss": 2.9656, "step": 238000 }, { "epoch": 12.854300005378368, "grad_norm": 0.8262794613838196, "learning_rate": 3.9119049020184785e-05, "loss": 2.9668, "step": 239000 }, { "epoch": 12.90808368740924, "grad_norm": 0.8269763588905334, "learning_rate": 3.88251044547755e-05, "loss": 2.9689, "step": 240000 }, { "epoch": 12.961867369440112, "grad_norm": 0.8158543109893799, "learning_rate": 3.8530865650562e-05, "loss": 2.968, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4055080959085722, "eval_loss": 3.3828203678131104, "eval_runtime": 154.2308, "eval_samples_per_second": 375.528, "eval_steps_per_second": 5.868, "step": 241709 }, { "epoch": 13.015651051470984, "grad_norm": 0.8760387897491455, "learning_rate": 3.82366268463485e-05, "loss": 2.9507, "step": 242000 }, { "epoch": 13.069434733501856, "grad_norm": 0.8574272394180298, "learning_rate": 3.7942388042134995e-05, "loss": 2.9228, "step": 243000 }, { "epoch": 13.123218415532728, "grad_norm": 0.8364537358283997, "learning_rate": 3.7648443476725716e-05, "loss": 2.9249, "step": 244000 }, { "epoch": 13.1770020975636, "grad_norm": 0.8717691898345947, "learning_rate": 3.735420467251221e-05, "loss": 2.9258, "step": 245000 }, { "epoch": 13.230785779594472, "grad_norm": 0.8629365563392639, "learning_rate": 3.705996586829871e-05, "loss": 2.9303, "step": 246000 }, { "epoch": 13.284569461625344, "grad_norm": 0.8226146101951599, "learning_rate": 3.6766021302889425e-05, "loss": 2.9324, "step": 247000 }, { "epoch": 13.338353143656215, "grad_norm": 0.8641866445541382, "learning_rate": 3.6471782498675926e-05, "loss": 2.9361, "step": 248000 }, { "epoch": 13.392136825687086, "grad_norm": 0.8602815866470337, "learning_rate": 3.6177543694462427e-05, "loss": 2.9348, "step": 249000 }, { "epoch": 13.445920507717958, "grad_norm": 0.8341040015220642, "learning_rate": 3.588359912905314e-05, "loss": 2.9366, "step": 250000 }, { "epoch": 13.49970418974883, "grad_norm": 0.8433042764663696, "learning_rate": 3.558936032483964e-05, "loss": 2.9371, "step": 251000 }, { "epoch": 13.553487871779701, "grad_norm": 0.8445100784301758, "learning_rate": 3.529512152062614e-05, "loss": 2.937, "step": 252000 }, { "epoch": 13.607271553810573, "grad_norm": 0.8464850783348083, "learning_rate": 3.5001176955216856e-05, "loss": 2.939, "step": 253000 }, { "epoch": 13.661055235841445, "grad_norm": 0.8083788156509399, "learning_rate": 3.470693815100336e-05, "loss": 2.9434, "step": 254000 }, { "epoch": 13.714838917872317, "grad_norm": 0.8728957176208496, "learning_rate": 3.441269934678986e-05, "loss": 2.941, "step": 255000 }, { "epoch": 13.768622599903189, "grad_norm": 0.8241551518440247, "learning_rate": 3.411846054257636e-05, "loss": 2.9452, "step": 256000 }, { "epoch": 13.822406281934061, "grad_norm": 0.8384578227996826, "learning_rate": 3.382451597716707e-05, "loss": 2.9448, "step": 257000 }, { "epoch": 13.876189963964933, "grad_norm": 0.810941219329834, "learning_rate": 3.3530277172953574e-05, "loss": 2.9405, "step": 258000 }, { "epoch": 13.929973645995805, "grad_norm": 0.8232195973396301, "learning_rate": 3.323603836874007e-05, "loss": 2.9492, "step": 259000 }, { "epoch": 13.983757328026677, "grad_norm": 0.8487170338630676, "learning_rate": 3.294179956452657e-05, "loss": 2.9462, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.4059775641477828, "eval_loss": 3.392092227935791, "eval_runtime": 154.2386, "eval_samples_per_second": 375.509, "eval_steps_per_second": 5.868, "step": 260302 }, { "epoch": 14.037541010057549, "grad_norm": 0.8577378988265991, "learning_rate": 3.264785499911728e-05, "loss": 2.9154, "step": 261000 }, { "epoch": 14.09132469208842, "grad_norm": 0.868270993232727, "learning_rate": 3.2353910433708004e-05, "loss": 2.9065, "step": 262000 }, { "epoch": 14.145108374119292, "grad_norm": 0.8782519102096558, "learning_rate": 3.20596716294945e-05, "loss": 2.9078, "step": 263000 }, { "epoch": 14.198892056150164, "grad_norm": 0.8726826906204224, "learning_rate": 3.1765432825281e-05, "loss": 2.9074, "step": 264000 }, { "epoch": 14.252675738181036, "grad_norm": 0.8613258600234985, "learning_rate": 3.14711940210675e-05, "loss": 2.9139, "step": 265000 }, { "epoch": 14.306459420211908, "grad_norm": 0.8485667109489441, "learning_rate": 3.1177249455658214e-05, "loss": 2.9174, "step": 266000 }, { "epoch": 14.36024310224278, "grad_norm": 0.9009717702865601, "learning_rate": 3.0883010651444715e-05, "loss": 2.9158, "step": 267000 }, { "epoch": 14.414026784273652, "grad_norm": 0.8538597822189331, "learning_rate": 3.058936032483964e-05, "loss": 2.9185, "step": 268000 }, { "epoch": 14.467810466304524, "grad_norm": 0.856895387172699, "learning_rate": 3.029512152062614e-05, "loss": 2.9169, "step": 269000 }, { "epoch": 14.521594148335396, "grad_norm": 0.8506320714950562, "learning_rate": 3.0000882716412644e-05, "loss": 2.9192, "step": 270000 }, { "epoch": 14.575377830366268, "grad_norm": 0.8925907611846924, "learning_rate": 2.9706643912199145e-05, "loss": 2.918, "step": 271000 }, { "epoch": 14.629161512397138, "grad_norm": 0.8765429854393005, "learning_rate": 2.941269934678986e-05, "loss": 2.9245, "step": 272000 }, { "epoch": 14.68294519442801, "grad_norm": 0.8704999089241028, "learning_rate": 2.911875478138057e-05, "loss": 2.9222, "step": 273000 }, { "epoch": 14.736728876458882, "grad_norm": 0.8442783951759338, "learning_rate": 2.882451597716707e-05, "loss": 2.921, "step": 274000 }, { "epoch": 14.790512558489754, "grad_norm": 0.8700312376022339, "learning_rate": 2.8530277172953568e-05, "loss": 2.9234, "step": 275000 }, { "epoch": 14.844296240520626, "grad_norm": 0.8738580346107483, "learning_rate": 2.823603836874007e-05, "loss": 2.9259, "step": 276000 }, { "epoch": 14.898079922551497, "grad_norm": 0.8423880934715271, "learning_rate": 2.7941799564526573e-05, "loss": 2.9253, "step": 277000 }, { "epoch": 14.95186360458237, "grad_norm": 0.8413381576538086, "learning_rate": 2.7647560760313074e-05, "loss": 2.9308, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.40731603234036906, "eval_loss": 3.384235382080078, "eval_runtime": 153.8779, "eval_samples_per_second": 376.389, "eval_steps_per_second": 5.881, "step": 278895 }, { "epoch": 15.005647286613241, "grad_norm": 0.8703798651695251, "learning_rate": 2.7353616194903788e-05, "loss": 2.925, "step": 279000 }, { "epoch": 15.059430968644113, "grad_norm": 0.8850764036178589, "learning_rate": 2.705937739069029e-05, "loss": 2.8835, "step": 280000 }, { "epoch": 15.113214650674985, "grad_norm": 0.8852105140686035, "learning_rate": 2.6765138586476786e-05, "loss": 2.8864, "step": 281000 }, { "epoch": 15.166998332705857, "grad_norm": 0.9038397669792175, "learning_rate": 2.6470899782263287e-05, "loss": 2.8946, "step": 282000 }, { "epoch": 15.220782014736729, "grad_norm": 0.8719667792320251, "learning_rate": 2.6176955216853998e-05, "loss": 2.894, "step": 283000 }, { "epoch": 15.274565696767601, "grad_norm": 0.8941630125045776, "learning_rate": 2.5882716412640502e-05, "loss": 2.8992, "step": 284000 }, { "epoch": 15.328349378798473, "grad_norm": 0.8849285840988159, "learning_rate": 2.5588771847231213e-05, "loss": 2.8982, "step": 285000 }, { "epoch": 15.382133060829345, "grad_norm": 0.916803240776062, "learning_rate": 2.5294533043017717e-05, "loss": 2.8998, "step": 286000 }, { "epoch": 15.435916742860217, "grad_norm": 0.871529757976532, "learning_rate": 2.5000882716412644e-05, "loss": 2.9004, "step": 287000 }, { "epoch": 15.489700424891089, "grad_norm": 0.8877633213996887, "learning_rate": 2.470664391219914e-05, "loss": 2.9036, "step": 288000 }, { "epoch": 15.54348410692196, "grad_norm": 0.8539232015609741, "learning_rate": 2.4412405107985643e-05, "loss": 2.901, "step": 289000 }, { "epoch": 15.597267788952832, "grad_norm": 0.8996196985244751, "learning_rate": 2.4118460542576357e-05, "loss": 2.9012, "step": 290000 }, { "epoch": 15.651051470983704, "grad_norm": 0.9288415908813477, "learning_rate": 2.3824221738362857e-05, "loss": 2.9059, "step": 291000 }, { "epoch": 15.704835153014574, "grad_norm": 0.83979731798172, "learning_rate": 2.3529982934149355e-05, "loss": 2.9078, "step": 292000 }, { "epoch": 15.758618835045446, "grad_norm": 0.871395468711853, "learning_rate": 2.3235744129935856e-05, "loss": 2.905, "step": 293000 }, { "epoch": 15.812402517076318, "grad_norm": 0.8687715530395508, "learning_rate": 2.294179956452657e-05, "loss": 2.9092, "step": 294000 }, { "epoch": 15.86618619910719, "grad_norm": 0.8818358778953552, "learning_rate": 2.2647854999117284e-05, "loss": 2.9087, "step": 295000 }, { "epoch": 15.919969881138062, "grad_norm": 0.8488963842391968, "learning_rate": 2.2353910433707998e-05, "loss": 2.9098, "step": 296000 }, { "epoch": 15.973753563168934, "grad_norm": 0.8631708025932312, "learning_rate": 2.20596716294945e-05, "loss": 2.9096, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.4075240279998705, "eval_loss": 3.3800442218780518, "eval_runtime": 154.4151, "eval_samples_per_second": 375.08, "eval_steps_per_second": 5.861, "step": 297488 }, { "epoch": 16.027537245199806, "grad_norm": 0.9190370440483093, "learning_rate": 2.1765432825281e-05, "loss": 2.8902, "step": 298000 }, { "epoch": 16.081320927230678, "grad_norm": 0.9171285033226013, "learning_rate": 2.14711940210675e-05, "loss": 2.8722, "step": 299000 }, { "epoch": 16.13510460926155, "grad_norm": 0.8986324071884155, "learning_rate": 2.1176955216853998e-05, "loss": 2.8795, "step": 300000 }, { "epoch": 16.18888829129242, "grad_norm": 0.9201930165290833, "learning_rate": 2.08827164126405e-05, "loss": 2.8797, "step": 301000 }, { "epoch": 16.242671973323294, "grad_norm": 0.9046939611434937, "learning_rate": 2.0588771847231213e-05, "loss": 2.8811, "step": 302000 }, { "epoch": 16.296455655354166, "grad_norm": 0.9016453623771667, "learning_rate": 2.0294533043017714e-05, "loss": 2.8794, "step": 303000 }, { "epoch": 16.350239337385037, "grad_norm": 0.9240383505821228, "learning_rate": 2.0000294238804215e-05, "loss": 2.8855, "step": 304000 }, { "epoch": 16.40402301941591, "grad_norm": 0.9093482494354248, "learning_rate": 1.9706055434590716e-05, "loss": 2.8801, "step": 305000 }, { "epoch": 16.45780670144678, "grad_norm": 0.8959923386573792, "learning_rate": 1.941211086918143e-05, "loss": 2.8851, "step": 306000 }, { "epoch": 16.511590383477653, "grad_norm": 0.9291424751281738, "learning_rate": 1.9117872064967927e-05, "loss": 2.8823, "step": 307000 }, { "epoch": 16.565374065508525, "grad_norm": 0.9166994094848633, "learning_rate": 1.8823633260754428e-05, "loss": 2.8914, "step": 308000 }, { "epoch": 16.619157747539397, "grad_norm": 0.9259293675422668, "learning_rate": 1.852939445654093e-05, "loss": 2.8881, "step": 309000 }, { "epoch": 16.67294142957027, "grad_norm": 0.9145235419273376, "learning_rate": 1.8235449891131643e-05, "loss": 2.8876, "step": 310000 }, { "epoch": 16.72672511160114, "grad_norm": 0.926196813583374, "learning_rate": 1.7941211086918144e-05, "loss": 2.8875, "step": 311000 }, { "epoch": 16.780508793632013, "grad_norm": 0.8985670208930969, "learning_rate": 1.7647266521508858e-05, "loss": 2.8897, "step": 312000 }, { "epoch": 16.834292475662885, "grad_norm": 0.9112594723701477, "learning_rate": 1.735302771729536e-05, "loss": 2.891, "step": 313000 }, { "epoch": 16.888076157693757, "grad_norm": 0.9346410036087036, "learning_rate": 1.705908315188607e-05, "loss": 2.8916, "step": 314000 }, { "epoch": 16.94185983972463, "grad_norm": 0.9073229432106018, "learning_rate": 1.6764844347672574e-05, "loss": 2.8898, "step": 315000 }, { "epoch": 16.9956435217555, "grad_norm": 0.8861480951309204, "learning_rate": 1.647060554345907e-05, "loss": 2.889, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.4077118018591426, "eval_loss": 3.384974479675293, "eval_runtime": 154.1381, "eval_samples_per_second": 375.754, "eval_steps_per_second": 5.871, "step": 316081 }, { "epoch": 17.049427203786372, "grad_norm": 0.9185024499893188, "learning_rate": 1.6176660978049785e-05, "loss": 2.8642, "step": 317000 }, { "epoch": 17.103210885817244, "grad_norm": 0.8915971517562866, "learning_rate": 1.5882422173836286e-05, "loss": 2.8621, "step": 318000 }, { "epoch": 17.156994567848116, "grad_norm": 0.9077499508857727, "learning_rate": 1.5588183369622787e-05, "loss": 2.8628, "step": 319000 }, { "epoch": 17.210778249878988, "grad_norm": 0.9534841775894165, "learning_rate": 1.5294533043017714e-05, "loss": 2.8685, "step": 320000 }, { "epoch": 17.26456193190986, "grad_norm": 0.924517035484314, "learning_rate": 1.5000294238804214e-05, "loss": 2.8611, "step": 321000 }, { "epoch": 17.318345613940732, "grad_norm": 0.9032144546508789, "learning_rate": 1.4706055434590716e-05, "loss": 2.8672, "step": 322000 }, { "epoch": 17.372129295971604, "grad_norm": 0.9569965600967407, "learning_rate": 1.4411816630377215e-05, "loss": 2.8669, "step": 323000 }, { "epoch": 17.425912978002476, "grad_norm": 0.9252744913101196, "learning_rate": 1.4117577826163716e-05, "loss": 2.8681, "step": 324000 }, { "epoch": 17.479696660033348, "grad_norm": 0.9229059815406799, "learning_rate": 1.3823633260754429e-05, "loss": 2.8733, "step": 325000 }, { "epoch": 17.533480342064216, "grad_norm": 0.9757437705993652, "learning_rate": 1.3529394456540928e-05, "loss": 2.873, "step": 326000 }, { "epoch": 17.587264024095088, "grad_norm": 0.9332020878791809, "learning_rate": 1.323515565232743e-05, "loss": 2.8693, "step": 327000 }, { "epoch": 17.64104770612596, "grad_norm": 0.9381711483001709, "learning_rate": 1.294091684811393e-05, "loss": 2.8694, "step": 328000 }, { "epoch": 17.69483138815683, "grad_norm": 0.9215328693389893, "learning_rate": 1.2646972282704644e-05, "loss": 2.8712, "step": 329000 }, { "epoch": 17.748615070187704, "grad_norm": 0.9105529189109802, "learning_rate": 1.2353027717295358e-05, "loss": 2.873, "step": 330000 }, { "epoch": 17.802398752218576, "grad_norm": 0.9045984148979187, "learning_rate": 1.2058788913081857e-05, "loss": 2.8739, "step": 331000 }, { "epoch": 17.856182434249448, "grad_norm": 0.8981735110282898, "learning_rate": 1.1764550108868358e-05, "loss": 2.8732, "step": 332000 }, { "epoch": 17.90996611628032, "grad_norm": 0.9405637383460999, "learning_rate": 1.1470311304654859e-05, "loss": 2.8759, "step": 333000 }, { "epoch": 17.96374979831119, "grad_norm": 0.880014955997467, "learning_rate": 1.1176366739245573e-05, "loss": 2.8779, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.407625137001017, "eval_loss": 3.392023801803589, "eval_runtime": 154.3691, "eval_samples_per_second": 375.192, "eval_steps_per_second": 5.863, "step": 334674 }, { "epoch": 18.017533480342063, "grad_norm": 0.9213481545448303, "learning_rate": 1.0882127935032074e-05, "loss": 2.8646, "step": 335000 }, { "epoch": 18.071317162372935, "grad_norm": 0.926913321018219, "learning_rate": 1.0587889130818573e-05, "loss": 2.8498, "step": 336000 }, { "epoch": 18.125100844403807, "grad_norm": 0.953425943851471, "learning_rate": 1.0293650326605072e-05, "loss": 2.8487, "step": 337000 }, { "epoch": 18.17888452643468, "grad_norm": 0.9122514724731445, "learning_rate": 9.999705761195786e-06, "loss": 2.8491, "step": 338000 }, { "epoch": 18.23266820846555, "grad_norm": 0.9349797964096069, "learning_rate": 9.7057611957865e-06, "loss": 2.8526, "step": 339000 }, { "epoch": 18.286451890496423, "grad_norm": 0.9712046384811401, "learning_rate": 9.411522391573001e-06, "loss": 2.8535, "step": 340000 }, { "epoch": 18.340235572527295, "grad_norm": 0.9234364628791809, "learning_rate": 9.117577826163715e-06, "loss": 2.857, "step": 341000 }, { "epoch": 18.394019254558167, "grad_norm": 0.9454054832458496, "learning_rate": 8.823339021950216e-06, "loss": 2.8539, "step": 342000 }, { "epoch": 18.44780293658904, "grad_norm": 0.9207014441490173, "learning_rate": 8.529100217736715e-06, "loss": 2.8589, "step": 343000 }, { "epoch": 18.50158661861991, "grad_norm": 0.895506739616394, "learning_rate": 8.235155652327429e-06, "loss": 2.8582, "step": 344000 }, { "epoch": 18.555370300650782, "grad_norm": 0.9252230525016785, "learning_rate": 7.941211086918143e-06, "loss": 2.8572, "step": 345000 }, { "epoch": 18.609153982681654, "grad_norm": 0.954779863357544, "learning_rate": 7.646972282704644e-06, "loss": 2.8593, "step": 346000 }, { "epoch": 18.662937664712526, "grad_norm": 0.9090393781661987, "learning_rate": 7.352733478491143e-06, "loss": 2.8606, "step": 347000 }, { "epoch": 18.716721346743398, "grad_norm": 0.9491481184959412, "learning_rate": 7.058494674277644e-06, "loss": 2.8569, "step": 348000 }, { "epoch": 18.77050502877427, "grad_norm": 0.9368701577186584, "learning_rate": 6.764255870064144e-06, "loss": 2.857, "step": 349000 }, { "epoch": 18.824288710805142, "grad_norm": 0.9315699934959412, "learning_rate": 6.470017065850645e-06, "loss": 2.8571, "step": 350000 }, { "epoch": 18.878072392836014, "grad_norm": 0.9450963735580444, "learning_rate": 6.176366739245571e-06, "loss": 2.8616, "step": 351000 }, { "epoch": 18.931856074866886, "grad_norm": 0.958202064037323, "learning_rate": 5.882127935032072e-06, "loss": 2.8551, "step": 352000 }, { "epoch": 18.985639756897758, "grad_norm": 0.916106104850769, "learning_rate": 5.587889130818573e-06, "loss": 2.8585, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.4083989399745756, "eval_loss": 3.389803171157837, "eval_runtime": 154.6817, "eval_samples_per_second": 374.433, "eval_steps_per_second": 5.851, "step": 353267 }, { "epoch": 19.03942343892863, "grad_norm": 0.9335721731185913, "learning_rate": 5.293944565409286e-06, "loss": 2.8473, "step": 354000 }, { "epoch": 19.0932071209595, "grad_norm": 0.9438668489456177, "learning_rate": 4.999705761195786e-06, "loss": 2.8388, "step": 355000 }, { "epoch": 19.146990802990373, "grad_norm": 0.9449619054794312, "learning_rate": 4.705466956982287e-06, "loss": 2.842, "step": 356000 }, { "epoch": 19.200774485021245, "grad_norm": 0.9449966549873352, "learning_rate": 4.411522391573001e-06, "loss": 2.8438, "step": 357000 }, { "epoch": 19.254558167052117, "grad_norm": 0.9282692074775696, "learning_rate": 4.1172835873595005e-06, "loss": 2.8424, "step": 358000 }, { "epoch": 19.30834184908299, "grad_norm": 0.9687463641166687, "learning_rate": 3.823044783146001e-06, "loss": 2.8429, "step": 359000 }, { "epoch": 19.36212553111386, "grad_norm": 0.9648985862731934, "learning_rate": 3.528805978932502e-06, "loss": 2.8414, "step": 360000 }, { "epoch": 19.415909213144733, "grad_norm": 0.933380126953125, "learning_rate": 3.234567174719002e-06, "loss": 2.844, "step": 361000 }, { "epoch": 19.469692895175605, "grad_norm": 0.9169676303863525, "learning_rate": 2.940622609309716e-06, "loss": 2.8437, "step": 362000 }, { "epoch": 19.523476577206477, "grad_norm": 0.9346097707748413, "learning_rate": 2.6463838050962164e-06, "loss": 2.844, "step": 363000 }, { "epoch": 19.57726025923735, "grad_norm": 0.9195376634597778, "learning_rate": 2.35243923968693e-06, "loss": 2.8418, "step": 364000 }, { "epoch": 19.63104394126822, "grad_norm": 0.9312041401863098, "learning_rate": 2.0582004354734305e-06, "loss": 2.8455, "step": 365000 }, { "epoch": 19.684827623299093, "grad_norm": 0.9278233647346497, "learning_rate": 1.7639616312599305e-06, "loss": 2.8454, "step": 366000 }, { "epoch": 19.738611305329965, "grad_norm": 0.9816317558288574, "learning_rate": 1.4700170658506446e-06, "loss": 2.8422, "step": 367000 }, { "epoch": 19.792394987360836, "grad_norm": 0.9713099002838135, "learning_rate": 1.1760725004413582e-06, "loss": 2.8441, "step": 368000 }, { "epoch": 19.846178669391705, "grad_norm": 0.9396886825561523, "learning_rate": 8.818336962278584e-07, "loss": 2.844, "step": 369000 }, { "epoch": 19.899962351422577, "grad_norm": 0.9674928784370422, "learning_rate": 5.875948920143589e-07, "loss": 2.8451, "step": 370000 }, { "epoch": 19.95374603345345, "grad_norm": 0.9253866076469421, "learning_rate": 2.933560878008592e-07, "loss": 2.8469, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.4080815719204785, "eval_loss": 3.3971192836761475, "eval_runtime": 154.407, "eval_samples_per_second": 375.1, "eval_steps_per_second": 5.861, "step": 371860 }, { "epoch": 20.0, "step": 371860, "total_flos": 1.5667414205184e+18, "train_loss": 3.1547193204799435, "train_runtime": 81066.2548, "train_samples_per_second": 146.785, "train_steps_per_second": 4.587 } ], "logging_steps": 1000, "max_steps": 371860, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.5667414205184e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }