diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,8666 +1,21828 @@ { - "best_metric": 0.8209169054441261, - "best_model_checkpoint": "videomae-base-finetuned-scratch_1/checkpoint-5280", - "epoch": 35.02580971659919, + "best_metric": 0.7623873873873874, + "best_model_checkpoint": "videomae-base-finetuned-scratch_1/checkpoint-24360", + "epoch": 71.01232114467409, "eval_steps": 500, - "global_step": 11856, + "global_step": 30192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 5.349636554718018, - "learning_rate": 4.2158516020236085e-07, - "loss": 0.7141, + "grad_norm": 6.798513412475586, + "learning_rate": 1.6556291390728477e-07, + "loss": 0.6854, "step": 10 }, { "epoch": 0.0, - "grad_norm": 5.897022724151611, - "learning_rate": 8.431703204047217e-07, - "loss": 0.7044, + "grad_norm": 6.500337600708008, + "learning_rate": 3.3112582781456954e-07, + "loss": 0.7216, "step": 20 }, { "epoch": 0.0, - "grad_norm": 4.595437526702881, - "learning_rate": 1.2647554806070827e-06, - "loss": 0.7118, + "grad_norm": 4.219776630401611, + "learning_rate": 4.966887417218544e-07, + "loss": 0.6915, "step": 30 }, { "epoch": 0.0, - "grad_norm": 4.464962959289551, - "learning_rate": 1.6863406408094434e-06, - "loss": 0.7121, + "grad_norm": 4.801699161529541, + "learning_rate": 6.622516556291391e-07, + "loss": 0.7103, "step": 40 }, { "epoch": 0.0, - "grad_norm": 4.585201263427734, - "learning_rate": 2.1079258010118047e-06, - "loss": 0.7018, + "grad_norm": 4.168068885803223, + "learning_rate": 8.278145695364239e-07, + "loss": 0.7183, "step": 50 }, { - "epoch": 0.01, - "grad_norm": 4.25459098815918, - "learning_rate": 2.5295109612141654e-06, - "loss": 0.6952, + "epoch": 0.0, + "grad_norm": 4.032826900482178, + "learning_rate": 9.933774834437087e-07, + "loss": 0.6903, "step": 60 }, { - "epoch": 0.01, - "grad_norm": 3.792253017425537, - "learning_rate": 2.951096121416526e-06, - "loss": 0.6813, + "epoch": 0.0, + "grad_norm": 4.603478908538818, + "learning_rate": 1.1589403973509934e-06, + "loss": 0.7016, "step": 70 }, { - "epoch": 0.01, - "grad_norm": 3.621083974838257, - "learning_rate": 3.372681281618887e-06, - "loss": 0.6814, + "epoch": 0.0, + "grad_norm": 6.0669779777526855, + "learning_rate": 1.3245033112582782e-06, + "loss": 0.7131, "step": 80 }, { - "epoch": 0.01, - "grad_norm": 4.006009578704834, - "learning_rate": 3.794266441821248e-06, - "loss": 0.6896, + "epoch": 0.0, + "grad_norm": 3.940962314605713, + "learning_rate": 1.4900662251655629e-06, + "loss": 0.6874, "step": 90 }, { - "epoch": 0.01, - "grad_norm": 3.3775813579559326, - "learning_rate": 4.2158516020236095e-06, - "loss": 0.6803, + "epoch": 0.0, + "grad_norm": 3.132936716079712, + "learning_rate": 1.6556291390728478e-06, + "loss": 0.7025, "step": 100 }, { - "epoch": 0.01, - "grad_norm": 5.32543420791626, - "learning_rate": 4.63743676222597e-06, - "loss": 0.6793, + "epoch": 0.0, + "grad_norm": 4.358637809753418, + "learning_rate": 1.8211920529801325e-06, + "loss": 0.694, "step": 110 }, { - "epoch": 0.01, - "grad_norm": 4.859120845794678, - "learning_rate": 5.059021922428331e-06, - "loss": 0.6715, + "epoch": 0.0, + "grad_norm": 4.67786169052124, + "learning_rate": 1.9867549668874175e-06, + "loss": 0.69, "step": 120 }, { - "epoch": 0.01, - "grad_norm": 3.3114655017852783, - "learning_rate": 5.480607082630692e-06, - "loss": 0.705, + "epoch": 0.0, + "grad_norm": 5.850955486297607, + "learning_rate": 2.152317880794702e-06, + "loss": 0.6855, "step": 130 }, { - "epoch": 0.01, - "grad_norm": 4.681263446807861, - "learning_rate": 5.902192242833052e-06, - "loss": 0.7032, + "epoch": 0.0, + "grad_norm": 7.933328628540039, + "learning_rate": 2.317880794701987e-06, + "loss": 0.7019, "step": 140 }, { - "epoch": 0.01, - "grad_norm": 3.5089430809020996, - "learning_rate": 6.323777403035413e-06, - "loss": 0.6903, + "epoch": 0.0, + "grad_norm": 6.646462440490723, + "learning_rate": 2.4834437086092716e-06, + "loss": 0.7026, "step": 150 }, { "epoch": 0.01, - "grad_norm": 4.625184059143066, - "learning_rate": 6.745362563237774e-06, - "loss": 0.6885, + "grad_norm": 5.709299087524414, + "learning_rate": 2.6490066225165563e-06, + "loss": 0.677, "step": 160 }, { "epoch": 0.01, - "grad_norm": 3.5083889961242676, - "learning_rate": 7.166947723440136e-06, - "loss": 0.6671, + "grad_norm": 4.531544208526611, + "learning_rate": 2.8145695364238415e-06, + "loss": 0.7028, "step": 170 }, { - "epoch": 0.02, - "grad_norm": 6.028371334075928, - "learning_rate": 7.588532883642496e-06, - "loss": 0.6822, + "epoch": 0.01, + "grad_norm": 9.23356819152832, + "learning_rate": 2.9801324503311258e-06, + "loss": 0.667, "step": 180 }, { - "epoch": 0.02, - "grad_norm": 4.046874046325684, - "learning_rate": 8.010118043844857e-06, - "loss": 0.676, + "epoch": 0.01, + "grad_norm": 7.787632942199707, + "learning_rate": 3.145695364238411e-06, + "loss": 0.6753, "step": 190 }, { - "epoch": 0.02, - "grad_norm": 7.515069961547852, - "learning_rate": 8.431703204047219e-06, - "loss": 0.6649, + "epoch": 0.01, + "grad_norm": 4.996011257171631, + "learning_rate": 3.3112582781456956e-06, + "loss": 0.7245, "step": 200 }, { - "epoch": 0.02, - "grad_norm": 4.611435890197754, - "learning_rate": 8.85328836424958e-06, - "loss": 0.6726, + "epoch": 0.01, + "grad_norm": 6.756231784820557, + "learning_rate": 3.47682119205298e-06, + "loss": 0.663, "step": 210 }, { - "epoch": 0.02, - "grad_norm": 5.385730743408203, - "learning_rate": 9.27487352445194e-06, - "loss": 0.6771, + "epoch": 0.01, + "grad_norm": 4.762441635131836, + "learning_rate": 3.642384105960265e-06, + "loss": 0.6785, "step": 220 }, { - "epoch": 0.02, - "grad_norm": 4.537276268005371, - "learning_rate": 9.696458684654301e-06, - "loss": 0.6543, + "epoch": 0.01, + "grad_norm": 5.188967704772949, + "learning_rate": 3.8079470198675498e-06, + "loss": 0.6748, "step": 230 }, { - "epoch": 0.02, - "grad_norm": 4.257579803466797, - "learning_rate": 1.0118043844856662e-05, - "loss": 0.6602, + "epoch": 0.01, + "grad_norm": 7.416825771331787, + "learning_rate": 3.973509933774835e-06, + "loss": 0.6679, "step": 240 }, { - "epoch": 0.02, - "grad_norm": 5.346296787261963, - "learning_rate": 1.0539629005059022e-05, - "loss": 0.6293, + "epoch": 0.01, + "grad_norm": 7.885051727294922, + "learning_rate": 4.13907284768212e-06, + "loss": 0.6712, "step": 250 }, { - "epoch": 0.02, - "grad_norm": 8.624234199523926, - "learning_rate": 1.0961214165261384e-05, - "loss": 0.6735, + "epoch": 0.01, + "grad_norm": 3.602786064147949, + "learning_rate": 4.304635761589404e-06, + "loss": 0.6838, "step": 260 }, { - "epoch": 0.02, - "grad_norm": 5.386282920837402, - "learning_rate": 1.1382799325463744e-05, - "loss": 0.6612, + "epoch": 0.01, + "grad_norm": 9.783942222595215, + "learning_rate": 4.470198675496689e-06, + "loss": 0.6824, "step": 270 }, { - "epoch": 0.02, - "grad_norm": 9.212226867675781, - "learning_rate": 1.1804384485666105e-05, - "loss": 0.673, + "epoch": 0.01, + "grad_norm": 5.1008172035217285, + "learning_rate": 4.635761589403974e-06, + "loss": 0.661, "step": 280 }, { - "epoch": 0.02, - "grad_norm": 6.993302822113037, - "learning_rate": 1.2225969645868467e-05, - "loss": 0.6551, + "epoch": 0.01, + "grad_norm": 7.266839504241943, + "learning_rate": 4.801324503311259e-06, + "loss": 0.6225, "step": 290 }, { - "epoch": 0.03, - "grad_norm": 5.731637954711914, - "learning_rate": 1.2647554806070827e-05, - "loss": 0.6432, + "epoch": 0.01, + "grad_norm": 6.263260364532471, + "learning_rate": 4.966887417218543e-06, + "loss": 0.6457, "step": 300 }, { - "epoch": 0.03, - "grad_norm": 5.617886543273926, - "learning_rate": 1.3069139966273189e-05, - "loss": 0.6777, + "epoch": 0.01, + "grad_norm": 9.13078784942627, + "learning_rate": 5.1324503311258275e-06, + "loss": 0.639, "step": 310 }, { - "epoch": 0.03, - "grad_norm": 7.462168216705322, - "learning_rate": 1.3490725126475547e-05, - "loss": 0.6153, + "epoch": 0.01, + "grad_norm": 4.028023719787598, + "learning_rate": 5.298013245033113e-06, + "loss": 0.6535, "step": 320 }, { - "epoch": 0.03, - "grad_norm": 9.60923957824707, - "learning_rate": 1.391231028667791e-05, - "loss": 0.6811, - "step": 330 - }, - { - "epoch": 0.03, - "eval_accuracy": 0.6547277936962751, - "eval_loss": 0.6314294338226318, - "eval_runtime": 33.9302, - "eval_samples_per_second": 20.572, - "eval_steps_per_second": 1.739, + "epoch": 0.01, + "grad_norm": 5.480044364929199, + "learning_rate": 5.463576158940398e-06, + "loss": 0.6519, "step": 330 }, { - "epoch": 1.0, - "grad_norm": 9.263134002685547, - "learning_rate": 1.4333895446880271e-05, - "loss": 0.6244, + "epoch": 0.01, + "grad_norm": 5.179543972015381, + "learning_rate": 5.629139072847683e-06, + "loss": 0.6611, "step": 340 }, { - "epoch": 1.0, - "grad_norm": 8.611780166625977, - "learning_rate": 1.4755480607082632e-05, - "loss": 0.6358, + "epoch": 0.01, + "grad_norm": 3.5863168239593506, + "learning_rate": 5.794701986754967e-06, + "loss": 0.6115, "step": 350 }, { - "epoch": 1.0, - "grad_norm": 8.437411308288574, - "learning_rate": 1.5177065767284992e-05, - "loss": 0.6273, + "epoch": 0.01, + "grad_norm": 7.434300899505615, + "learning_rate": 5.9602649006622515e-06, + "loss": 0.6357, "step": 360 }, { - "epoch": 1.0, - "grad_norm": 6.759949207305908, - "learning_rate": 1.5598650927487355e-05, - "loss": 0.6529, + "epoch": 0.01, + "grad_norm": 4.279542922973633, + "learning_rate": 6.125827814569537e-06, + "loss": 0.6173, "step": 370 }, { - "epoch": 1.0, - "grad_norm": 15.71631908416748, - "learning_rate": 1.6020236087689714e-05, - "loss": 0.6066, + "epoch": 0.01, + "grad_norm": 7.049438953399658, + "learning_rate": 6.291390728476822e-06, + "loss": 0.6348, "step": 380 }, { - "epoch": 1.01, - "grad_norm": 17.849533081054688, - "learning_rate": 1.6441821247892076e-05, - "loss": 0.5587, + "epoch": 0.01, + "grad_norm": 7.15338659286499, + "learning_rate": 6.456953642384106e-06, + "loss": 0.5899, "step": 390 }, { - "epoch": 1.01, - "grad_norm": 4.601895809173584, - "learning_rate": 1.6863406408094438e-05, - "loss": 0.6334, + "epoch": 0.01, + "grad_norm": 10.467881202697754, + "learning_rate": 6.622516556291391e-06, + "loss": 0.6648, "step": 400 }, { - "epoch": 1.01, - "grad_norm": 6.037390232086182, - "learning_rate": 1.7284991568296797e-05, - "loss": 0.7324, + "epoch": 0.01, + "grad_norm": 12.209418296813965, + "learning_rate": 6.7880794701986755e-06, + "loss": 0.6523, "step": 410 }, { - "epoch": 1.01, - "grad_norm": 4.270590782165527, - "learning_rate": 1.770657672849916e-05, - "loss": 0.6552, + "epoch": 0.01, + "grad_norm": 18.373001098632812, + "learning_rate": 6.95364238410596e-06, + "loss": 0.5872, "step": 420 }, { - "epoch": 1.01, - "grad_norm": 10.071660995483398, - "learning_rate": 1.812816188870152e-05, - "loss": 0.6441, + "epoch": 0.01, + "eval_accuracy": 0.5518018018018018, + "eval_loss": 0.6883862614631653, + "eval_runtime": 46.4984, + "eval_samples_per_second": 19.097, + "eval_steps_per_second": 1.591, + "step": 420 + }, + { + "epoch": 1.0, + "grad_norm": 8.16763973236084, + "learning_rate": 7.119205298013246e-06, + "loss": 0.617, "step": 430 }, { - "epoch": 1.01, - "grad_norm": 7.655844211578369, - "learning_rate": 1.854974704890388e-05, - "loss": 0.6271, + "epoch": 1.0, + "grad_norm": 7.374216556549072, + "learning_rate": 7.28476821192053e-06, + "loss": 0.6017, "step": 440 }, { - "epoch": 1.01, - "grad_norm": 7.267734050750732, - "learning_rate": 1.897133220910624e-05, - "loss": 0.6048, + "epoch": 1.0, + "grad_norm": 8.875380516052246, + "learning_rate": 7.450331125827815e-06, + "loss": 0.5943, "step": 450 }, { - "epoch": 1.01, - "grad_norm": 6.196805953979492, - "learning_rate": 1.9392917369308603e-05, - "loss": 0.6371, + "epoch": 1.0, + "grad_norm": 6.8913116455078125, + "learning_rate": 7.6158940397350995e-06, + "loss": 0.6053, "step": 460 }, { - "epoch": 1.01, - "grad_norm": 5.247162342071533, - "learning_rate": 1.981450252951096e-05, - "loss": 0.5485, + "epoch": 1.0, + "grad_norm": 19.417211532592773, + "learning_rate": 7.781456953642384e-06, + "loss": 0.5664, "step": 470 }, { - "epoch": 1.01, - "grad_norm": 7.14028787612915, - "learning_rate": 2.0236087689713324e-05, - "loss": 0.6503, + "epoch": 1.0, + "grad_norm": 9.484718322753906, + "learning_rate": 7.94701986754967e-06, + "loss": 0.5937, "step": 480 }, { - "epoch": 1.01, - "grad_norm": 7.4398722648620605, - "learning_rate": 2.0657672849915685e-05, - "loss": 0.5997, + "epoch": 1.0, + "grad_norm": 11.553343772888184, + "learning_rate": 8.112582781456954e-06, + "loss": 0.5961, "step": 490 }, { - "epoch": 1.01, - "grad_norm": 4.917046070098877, - "learning_rate": 2.1079258010118044e-05, - "loss": 0.6271, + "epoch": 1.0, + "grad_norm": 8.792490005493164, + "learning_rate": 8.27814569536424e-06, + "loss": 0.5381, "step": 500 }, { - "epoch": 1.02, - "grad_norm": 7.6909565925598145, - "learning_rate": 2.1500843170320406e-05, - "loss": 0.5816, + "epoch": 1.0, + "grad_norm": 11.141890525817871, + "learning_rate": 8.443708609271524e-06, + "loss": 0.662, "step": 510 }, { - "epoch": 1.02, - "grad_norm": 5.249167442321777, - "learning_rate": 2.1922428330522768e-05, - "loss": 0.6534, + "epoch": 1.0, + "grad_norm": 10.944332122802734, + "learning_rate": 8.609271523178809e-06, + "loss": 0.5583, "step": 520 }, { - "epoch": 1.02, - "grad_norm": 5.298259735107422, - "learning_rate": 2.2344013490725127e-05, - "loss": 0.6713, + "epoch": 1.0, + "grad_norm": 6.739534378051758, + "learning_rate": 8.774834437086093e-06, + "loss": 0.6051, "step": 530 }, { - "epoch": 1.02, - "grad_norm": 8.135736465454102, - "learning_rate": 2.276559865092749e-05, - "loss": 0.5999, + "epoch": 1.0, + "grad_norm": 14.809786796569824, + "learning_rate": 8.940397350993377e-06, + "loss": 0.5843, "step": 540 }, { - "epoch": 1.02, - "grad_norm": 9.37923812866211, - "learning_rate": 2.318718381112985e-05, - "loss": 0.7861, + "epoch": 1.0, + "grad_norm": 9.712545394897461, + "learning_rate": 9.105960264900662e-06, + "loss": 0.6071, "step": 550 }, { - "epoch": 1.02, - "grad_norm": 4.831663131713867, - "learning_rate": 2.360876897133221e-05, - "loss": 0.605, + "epoch": 1.0, + "grad_norm": 10.756208419799805, + "learning_rate": 9.271523178807948e-06, + "loss": 0.518, "step": 560 }, { - "epoch": 1.02, - "grad_norm": 5.473945617675781, - "learning_rate": 2.403035413153457e-05, - "loss": 0.6066, + "epoch": 1.0, + "grad_norm": 12.263407707214355, + "learning_rate": 9.437086092715232e-06, + "loss": 0.5224, "step": 570 }, { - "epoch": 1.02, - "grad_norm": 5.188632488250732, - "learning_rate": 2.4451939291736933e-05, - "loss": 0.6189, + "epoch": 1.01, + "grad_norm": 12.013748168945312, + "learning_rate": 9.602649006622518e-06, + "loss": 0.563, "step": 580 }, { - "epoch": 1.02, - "grad_norm": 4.069911956787109, - "learning_rate": 2.487352445193929e-05, - "loss": 0.6705, + "epoch": 1.01, + "grad_norm": 10.52728271484375, + "learning_rate": 9.768211920529802e-06, + "loss": 0.584, "step": 590 }, { - "epoch": 1.02, - "grad_norm": 8.71951675415039, - "learning_rate": 2.5295109612141654e-05, - "loss": 0.6168, + "epoch": 1.01, + "grad_norm": 18.45766830444336, + "learning_rate": 9.933774834437086e-06, + "loss": 0.5911, "step": 600 }, { - "epoch": 1.02, - "grad_norm": 6.839282989501953, - "learning_rate": 2.5716694772344012e-05, - "loss": 0.6465, + "epoch": 1.01, + "grad_norm": 11.80704116821289, + "learning_rate": 1.0099337748344372e-05, + "loss": 0.5943, "step": 610 }, { - "epoch": 1.02, - "grad_norm": 5.155991077423096, - "learning_rate": 2.6138279932546377e-05, - "loss": 0.6007, + "epoch": 1.01, + "grad_norm": 5.923819541931152, + "learning_rate": 1.0264900662251655e-05, + "loss": 0.5752, "step": 620 }, { - "epoch": 1.03, - "grad_norm": 4.38847541809082, - "learning_rate": 2.6559865092748736e-05, - "loss": 0.6253, + "epoch": 1.01, + "grad_norm": 12.2268648147583, + "learning_rate": 1.0430463576158941e-05, + "loss": 0.5505, "step": 630 }, { - "epoch": 1.03, - "grad_norm": 6.741366386413574, - "learning_rate": 2.6981450252951095e-05, - "loss": 0.614, + "epoch": 1.01, + "grad_norm": 8.633209228515625, + "learning_rate": 1.0596026490066225e-05, + "loss": 0.4826, "step": 640 }, { - "epoch": 1.03, - "grad_norm": 6.325467586517334, - "learning_rate": 2.740303541315346e-05, - "loss": 0.6444, + "epoch": 1.01, + "grad_norm": 25.050275802612305, + "learning_rate": 1.076158940397351e-05, + "loss": 0.6089, "step": 650 }, { - "epoch": 1.03, - "grad_norm": 7.080298900604248, - "learning_rate": 2.782462057335582e-05, - "loss": 0.6706, - "step": 660 - }, - { - "epoch": 1.03, - "eval_accuracy": 0.6318051575931232, - "eval_loss": 0.635435938835144, - "eval_runtime": 36.8826, - "eval_samples_per_second": 18.925, - "eval_steps_per_second": 1.6, + "epoch": 1.01, + "grad_norm": 14.282061576843262, + "learning_rate": 1.0927152317880796e-05, + "loss": 0.5236, "step": 660 }, { - "epoch": 2.0, - "grad_norm": 8.330774307250977, - "learning_rate": 2.8246205733558177e-05, - "loss": 0.5594, + "epoch": 1.01, + "grad_norm": 4.38491678237915, + "learning_rate": 1.109271523178808e-05, + "loss": 0.5622, "step": 670 }, { - "epoch": 2.0, - "grad_norm": 5.559295654296875, - "learning_rate": 2.8667790893760543e-05, - "loss": 0.6125, + "epoch": 1.01, + "grad_norm": 5.651684761047363, + "learning_rate": 1.1258278145695366e-05, + "loss": 0.6044, "step": 680 }, { - "epoch": 2.0, - "grad_norm": 7.404927730560303, - "learning_rate": 2.90893760539629e-05, - "loss": 0.5856, + "epoch": 1.01, + "grad_norm": 6.315426826477051, + "learning_rate": 1.142384105960265e-05, + "loss": 0.6392, "step": 690 }, { - "epoch": 2.0, - "grad_norm": 13.15217113494873, - "learning_rate": 2.9510961214165263e-05, - "loss": 0.5817, + "epoch": 1.01, + "grad_norm": 5.555671215057373, + "learning_rate": 1.1589403973509934e-05, + "loss": 0.542, "step": 700 }, { - "epoch": 2.0, - "grad_norm": 8.084522247314453, - "learning_rate": 2.9932546374367625e-05, - "loss": 0.646, + "epoch": 1.01, + "grad_norm": 17.609895706176758, + "learning_rate": 1.1754966887417219e-05, + "loss": 0.5469, "step": 710 }, { - "epoch": 2.01, - "grad_norm": 10.898458480834961, - "learning_rate": 3.0354131534569984e-05, - "loss": 0.5814, + "epoch": 1.01, + "grad_norm": 5.622751235961914, + "learning_rate": 1.1920529801324503e-05, + "loss": 0.5576, "step": 720 }, { - "epoch": 2.01, - "grad_norm": 4.796360015869141, - "learning_rate": 3.0775716694772346e-05, - "loss": 0.563, + "epoch": 1.01, + "grad_norm": 16.457481384277344, + "learning_rate": 1.2086092715231789e-05, + "loss": 0.5037, "step": 730 }, { - "epoch": 2.01, - "grad_norm": 6.336984157562256, - "learning_rate": 3.119730185497471e-05, - "loss": 0.5837, + "epoch": 1.01, + "grad_norm": 12.223016738891602, + "learning_rate": 1.2251655629139073e-05, + "loss": 0.5333, "step": 740 }, { - "epoch": 2.01, - "grad_norm": 6.342469692230225, - "learning_rate": 3.161888701517707e-05, - "loss": 0.5755, + "epoch": 1.01, + "grad_norm": 12.266618728637695, + "learning_rate": 1.2417218543046358e-05, + "loss": 0.6454, "step": 750 }, { - "epoch": 2.01, - "grad_norm": 3.3841326236724854, - "learning_rate": 3.204047217537943e-05, - "loss": 0.6185, + "epoch": 1.01, + "grad_norm": 14.025192260742188, + "learning_rate": 1.2582781456953644e-05, + "loss": 0.6073, "step": 760 }, { - "epoch": 2.01, - "grad_norm": 3.3901569843292236, - "learning_rate": 3.2462057335581793e-05, - "loss": 0.6117, + "epoch": 1.01, + "grad_norm": 7.08126163482666, + "learning_rate": 1.274834437086093e-05, + "loss": 0.5818, "step": 770 }, { - "epoch": 2.01, - "grad_norm": 5.979400634765625, - "learning_rate": 3.288364249578415e-05, - "loss": 0.5974, + "epoch": 1.01, + "grad_norm": 5.972238063812256, + "learning_rate": 1.2913907284768212e-05, + "loss": 0.491, "step": 780 }, { - "epoch": 2.01, - "grad_norm": 8.773338317871094, - "learning_rate": 3.330522765598651e-05, - "loss": 0.6024, + "epoch": 1.01, + "grad_norm": 9.566168785095215, + "learning_rate": 1.3079470198675498e-05, + "loss": 0.6412, "step": 790 }, { - "epoch": 2.01, - "grad_norm": 7.4114298820495605, - "learning_rate": 3.3726812816188876e-05, - "loss": 0.5838, + "epoch": 1.01, + "grad_norm": 10.850152969360352, + "learning_rate": 1.3245033112582782e-05, + "loss": 0.5616, "step": 800 }, { - "epoch": 2.01, - "grad_norm": 8.659492492675781, - "learning_rate": 3.4148397976391235e-05, - "loss": 0.6818, + "epoch": 1.01, + "grad_norm": 5.276033878326416, + "learning_rate": 1.3410596026490067e-05, + "loss": 0.6066, "step": 810 }, { - "epoch": 2.01, - "grad_norm": 4.71567440032959, - "learning_rate": 3.456998313659359e-05, - "loss": 0.6604, + "epoch": 1.01, + "grad_norm": 6.786125659942627, + "learning_rate": 1.3576158940397351e-05, + "loss": 0.5489, "step": 820 }, { - "epoch": 2.01, - "grad_norm": 4.012688159942627, - "learning_rate": 3.499156829679596e-05, - "loss": 0.6041, + "epoch": 1.01, + "grad_norm": 7.395523548126221, + "learning_rate": 1.3741721854304637e-05, + "loss": 0.4285, "step": 830 }, { - "epoch": 2.02, - "grad_norm": 5.472045421600342, - "learning_rate": 3.541315345699832e-05, - "loss": 0.573, + "epoch": 1.01, + "grad_norm": 24.986873626708984, + "learning_rate": 1.390728476821192e-05, + "loss": 0.5358, + "step": 840 + }, + { + "epoch": 1.01, + "eval_accuracy": 0.6193693693693694, + "eval_loss": 0.6766347289085388, + "eval_runtime": 42.9716, + "eval_samples_per_second": 20.665, + "eval_steps_per_second": 1.722, "step": 840 }, { - "epoch": 2.02, - "grad_norm": 3.6241841316223145, - "learning_rate": 3.5834738617200676e-05, - "loss": 0.6385, + "epoch": 2.0, + "grad_norm": 13.267349243164062, + "learning_rate": 1.4072847682119206e-05, + "loss": 0.4912, "step": 850 }, { - "epoch": 2.02, - "grad_norm": 3.782322645187378, - "learning_rate": 3.625632377740304e-05, - "loss": 0.6425, + "epoch": 2.0, + "grad_norm": 9.027665138244629, + "learning_rate": 1.4238410596026492e-05, + "loss": 0.4733, "step": 860 }, { - "epoch": 2.02, - "grad_norm": 5.188979148864746, - "learning_rate": 3.66779089376054e-05, - "loss": 0.5915, + "epoch": 2.0, + "grad_norm": 8.086003303527832, + "learning_rate": 1.4403973509933774e-05, + "loss": 0.4839, "step": 870 }, { - "epoch": 2.02, - "grad_norm": 7.798283576965332, - "learning_rate": 3.709949409780776e-05, - "loss": 0.6863, + "epoch": 2.0, + "grad_norm": 9.574597358703613, + "learning_rate": 1.456953642384106e-05, + "loss": 0.5129, "step": 880 }, { - "epoch": 2.02, - "grad_norm": 4.644458293914795, - "learning_rate": 3.7521079258010123e-05, - "loss": 0.6003, + "epoch": 2.0, + "grad_norm": 9.984691619873047, + "learning_rate": 1.4735099337748346e-05, + "loss": 0.5445, "step": 890 }, { - "epoch": 2.02, - "grad_norm": 7.098191738128662, - "learning_rate": 3.794266441821248e-05, - "loss": 0.5639, + "epoch": 2.0, + "grad_norm": 12.852778434753418, + "learning_rate": 1.490066225165563e-05, + "loss": 0.4693, "step": 900 }, { - "epoch": 2.02, - "grad_norm": 5.175333499908447, - "learning_rate": 3.836424957841484e-05, + "epoch": 2.0, + "grad_norm": 13.385702133178711, + "learning_rate": 1.5066225165562913e-05, "loss": 0.5598, "step": 910 }, { - "epoch": 2.02, - "grad_norm": 7.205776691436768, - "learning_rate": 3.8785834738617206e-05, - "loss": 0.6401, + "epoch": 2.0, + "grad_norm": 14.717290878295898, + "learning_rate": 1.5231788079470199e-05, + "loss": 0.4971, "step": 920 }, { - "epoch": 2.02, - "grad_norm": 4.420816898345947, - "learning_rate": 3.9207419898819565e-05, - "loss": 0.6388, + "epoch": 2.0, + "grad_norm": 9.329611778259277, + "learning_rate": 1.5397350993377485e-05, + "loss": 0.5318, "step": 930 }, { - "epoch": 2.02, - "grad_norm": 3.2338483333587646, - "learning_rate": 3.962900505902192e-05, - "loss": 0.5893, + "epoch": 2.0, + "grad_norm": 9.604193687438965, + "learning_rate": 1.5562913907284768e-05, + "loss": 0.5795, "step": 940 }, { - "epoch": 2.02, - "grad_norm": 3.816267251968384, - "learning_rate": 4.005059021922429e-05, - "loss": 0.6419, + "epoch": 2.0, + "grad_norm": 5.933267593383789, + "learning_rate": 1.5728476821192054e-05, + "loss": 0.5661, "step": 950 }, { - "epoch": 2.03, - "grad_norm": 3.6634039878845215, - "learning_rate": 4.047217537942665e-05, - "loss": 0.659, + "epoch": 2.0, + "grad_norm": 11.780954360961914, + "learning_rate": 1.589403973509934e-05, + "loss": 0.4763, "step": 960 }, { - "epoch": 2.03, - "grad_norm": 2.776705503463745, - "learning_rate": 4.0893760539629006e-05, - "loss": 0.6155, + "epoch": 2.0, + "grad_norm": 5.923733234405518, + "learning_rate": 1.6059602649006622e-05, + "loss": 0.5337, "step": 970 }, { - "epoch": 2.03, - "grad_norm": 3.137341260910034, - "learning_rate": 4.131534569983137e-05, - "loss": 0.6158, + "epoch": 2.0, + "grad_norm": 12.70390796661377, + "learning_rate": 1.6225165562913908e-05, + "loss": 0.5132, "step": 980 }, { - "epoch": 2.03, - "grad_norm": 6.1107707023620605, - "learning_rate": 4.173693086003373e-05, - "loss": 0.6581, - "step": 990 - }, - { - "epoch": 2.03, - "eval_accuracy": 0.6375358166189111, - "eval_loss": 0.6327700018882751, - "eval_runtime": 33.7114, - "eval_samples_per_second": 20.705, - "eval_steps_per_second": 1.75, + "epoch": 2.0, + "grad_norm": 11.865839004516602, + "learning_rate": 1.6390728476821194e-05, + "loss": 0.5244, "step": 990 }, { - "epoch": 3.0, - "grad_norm": 3.0415008068084717, - "learning_rate": 4.215851602023609e-05, - "loss": 0.6034, + "epoch": 2.01, + "grad_norm": 10.646509170532227, + "learning_rate": 1.655629139072848e-05, + "loss": 0.4553, "step": 1000 }, { - "epoch": 3.0, - "grad_norm": 6.006179332733154, - "learning_rate": 4.2580101180438453e-05, - "loss": 0.4955, + "epoch": 2.01, + "grad_norm": 15.619583129882812, + "learning_rate": 1.6721854304635763e-05, + "loss": 0.423, "step": 1010 }, { - "epoch": 3.0, - "grad_norm": 6.862912654876709, - "learning_rate": 4.300168634064081e-05, - "loss": 0.4918, + "epoch": 2.01, + "grad_norm": 8.261073112487793, + "learning_rate": 1.688741721854305e-05, + "loss": 0.4211, "step": 1020 }, { - "epoch": 3.0, - "grad_norm": 11.918879508972168, - "learning_rate": 4.342327150084317e-05, - "loss": 0.5643, + "epoch": 2.01, + "grad_norm": 9.633855819702148, + "learning_rate": 1.705298013245033e-05, + "loss": 0.5283, "step": 1030 }, { - "epoch": 3.0, - "grad_norm": 8.16517448425293, - "learning_rate": 4.3844856661045536e-05, - "loss": 0.638, + "epoch": 2.01, + "grad_norm": 14.031935691833496, + "learning_rate": 1.7218543046357617e-05, + "loss": 0.5528, "step": 1040 }, { - "epoch": 3.01, - "grad_norm": 6.384884357452393, - "learning_rate": 4.4266441821247895e-05, - "loss": 0.5995, + "epoch": 2.01, + "grad_norm": 11.338902473449707, + "learning_rate": 1.73841059602649e-05, + "loss": 0.4548, "step": 1050 }, { - "epoch": 3.01, - "grad_norm": 5.2973127365112305, - "learning_rate": 4.468802698145025e-05, - "loss": 0.639, + "epoch": 2.01, + "grad_norm": 23.000455856323242, + "learning_rate": 1.7549668874172186e-05, + "loss": 0.5563, "step": 1060 }, { - "epoch": 3.01, - "grad_norm": 3.8324005603790283, - "learning_rate": 4.510961214165262e-05, - "loss": 0.572, + "epoch": 2.01, + "grad_norm": 9.367509841918945, + "learning_rate": 1.771523178807947e-05, + "loss": 0.4287, "step": 1070 }, { - "epoch": 3.01, - "grad_norm": 4.97128963470459, - "learning_rate": 4.553119730185498e-05, - "loss": 0.597, + "epoch": 2.01, + "grad_norm": 7.15519905090332, + "learning_rate": 1.7880794701986755e-05, + "loss": 0.5689, "step": 1080 }, { - "epoch": 3.01, - "grad_norm": 5.064922332763672, - "learning_rate": 4.5952782462057336e-05, - "loss": 0.5617, + "epoch": 2.01, + "grad_norm": 9.858805656433105, + "learning_rate": 1.804635761589404e-05, + "loss": 0.4932, "step": 1090 }, { - "epoch": 3.01, - "grad_norm": 3.734870672225952, - "learning_rate": 4.63743676222597e-05, - "loss": 0.5439, + "epoch": 2.01, + "grad_norm": 6.52086877822876, + "learning_rate": 1.8211920529801323e-05, + "loss": 0.5288, "step": 1100 }, { - "epoch": 3.01, - "grad_norm": 12.495979309082031, - "learning_rate": 4.679595278246206e-05, - "loss": 0.5659, + "epoch": 2.01, + "grad_norm": 13.988100051879883, + "learning_rate": 1.837748344370861e-05, + "loss": 0.4448, "step": 1110 }, { - "epoch": 3.01, - "grad_norm": 3.5089731216430664, - "learning_rate": 4.721753794266442e-05, - "loss": 0.517, + "epoch": 2.01, + "grad_norm": 13.741047859191895, + "learning_rate": 1.8543046357615895e-05, + "loss": 0.3917, "step": 1120 }, { - "epoch": 3.01, - "grad_norm": 5.993117332458496, - "learning_rate": 4.7639123102866784e-05, - "loss": 0.6202, + "epoch": 2.01, + "grad_norm": 7.654866695404053, + "learning_rate": 1.870860927152318e-05, + "loss": 0.4636, "step": 1130 }, { - "epoch": 3.01, - "grad_norm": 3.431788444519043, - "learning_rate": 4.806070826306914e-05, - "loss": 0.5767, + "epoch": 2.01, + "grad_norm": 9.594705581665039, + "learning_rate": 1.8874172185430464e-05, + "loss": 0.5084, "step": 1140 }, { - "epoch": 3.01, - "grad_norm": 5.00049352645874, - "learning_rate": 4.84822934232715e-05, - "loss": 0.5623, + "epoch": 2.01, + "grad_norm": 8.550768852233887, + "learning_rate": 1.903973509933775e-05, + "loss": 0.4168, "step": 1150 }, { - "epoch": 3.01, - "grad_norm": 5.571552276611328, - "learning_rate": 4.8903878583473866e-05, - "loss": 0.5445, + "epoch": 2.01, + "grad_norm": 10.367500305175781, + "learning_rate": 1.9205298013245036e-05, + "loss": 0.5148, "step": 1160 }, { - "epoch": 3.02, - "grad_norm": 3.7177913188934326, - "learning_rate": 4.9325463743676225e-05, - "loss": 0.5748, + "epoch": 2.01, + "grad_norm": 10.711953163146973, + "learning_rate": 1.9370860927152318e-05, + "loss": 0.536, "step": 1170 }, { - "epoch": 3.02, - "grad_norm": 4.8672194480896, - "learning_rate": 4.974704890387858e-05, - "loss": 0.6161, + "epoch": 2.01, + "grad_norm": 8.47671890258789, + "learning_rate": 1.9536423841059604e-05, + "loss": 0.496, "step": 1180 }, { - "epoch": 3.02, - "grad_norm": 4.830544948577881, - "learning_rate": 4.9981255857544517e-05, - "loss": 0.587, + "epoch": 2.01, + "grad_norm": 14.76279067993164, + "learning_rate": 1.970198675496689e-05, + "loss": 0.5253, "step": 1190 }, { - "epoch": 3.02, - "grad_norm": 5.276002407073975, - "learning_rate": 4.993439550140581e-05, - "loss": 0.5564, + "epoch": 2.01, + "grad_norm": 11.065993309020996, + "learning_rate": 1.9867549668874173e-05, + "loss": 0.4508, "step": 1200 }, { - "epoch": 3.02, - "grad_norm": 6.24776029586792, - "learning_rate": 4.988753514526711e-05, - "loss": 0.5164, + "epoch": 2.01, + "grad_norm": 10.93369197845459, + "learning_rate": 2.003311258278146e-05, + "loss": 0.5272, "step": 1210 }, { - "epoch": 3.02, - "grad_norm": 4.341228008270264, - "learning_rate": 4.98406747891284e-05, - "loss": 0.501, + "epoch": 2.01, + "grad_norm": 9.056780815124512, + "learning_rate": 2.0198675496688745e-05, + "loss": 0.5077, "step": 1220 }, { - "epoch": 3.02, - "grad_norm": 3.2242419719696045, - "learning_rate": 4.979381443298969e-05, - "loss": 0.5489, + "epoch": 2.01, + "grad_norm": 8.762649536132812, + "learning_rate": 2.0364238410596027e-05, + "loss": 0.385, "step": 1230 }, { - "epoch": 3.02, - "grad_norm": 4.790765285491943, - "learning_rate": 4.9746954076850985e-05, - "loss": 0.5591, + "epoch": 2.01, + "grad_norm": 8.538049697875977, + "learning_rate": 2.052980132450331e-05, + "loss": 0.6194, "step": 1240 }, { - "epoch": 3.02, - "grad_norm": 3.7129054069519043, - "learning_rate": 4.9700093720712284e-05, - "loss": 0.612, + "epoch": 2.01, + "grad_norm": 8.191787719726562, + "learning_rate": 2.0695364238410596e-05, + "loss": 0.455, "step": 1250 }, { - "epoch": 3.02, - "grad_norm": 7.660614013671875, - "learning_rate": 4.9653233364573576e-05, - "loss": 0.5194, + "epoch": 2.01, + "grad_norm": 25.214134216308594, + "learning_rate": 2.0860927152317882e-05, + "loss": 0.5339, + "step": 1260 + }, + { + "epoch": 2.01, + "eval_accuracy": 0.6430180180180181, + "eval_loss": 0.6294997334480286, + "eval_runtime": 42.0635, + "eval_samples_per_second": 21.111, + "eval_steps_per_second": 1.759, "step": 1260 }, { - "epoch": 3.02, - "grad_norm": 5.343811511993408, - "learning_rate": 4.960637300843486e-05, - "loss": 0.6074, + "epoch": 3.0, + "grad_norm": 9.5454683303833, + "learning_rate": 2.1026490066225165e-05, + "loss": 0.4965, "step": 1270 }, { - "epoch": 3.02, - "grad_norm": 4.8525590896606445, - "learning_rate": 4.955951265229616e-05, - "loss": 0.548, + "epoch": 3.0, + "grad_norm": 4.766613483428955, + "learning_rate": 2.119205298013245e-05, + "loss": 0.4325, "step": 1280 }, { - "epoch": 3.03, - "grad_norm": 4.513533592224121, - "learning_rate": 4.951265229615745e-05, - "loss": 0.5818, + "epoch": 3.0, + "grad_norm": 9.407756805419922, + "learning_rate": 2.1357615894039737e-05, + "loss": 0.4026, "step": 1290 }, { - "epoch": 3.03, - "grad_norm": 3.897047996520996, - "learning_rate": 4.9465791940018746e-05, - "loss": 0.5784, + "epoch": 3.0, + "grad_norm": 5.415344715118408, + "learning_rate": 2.152317880794702e-05, + "loss": 0.4614, "step": 1300 }, { - "epoch": 3.03, - "grad_norm": 3.294161796569824, - "learning_rate": 4.941893158388004e-05, - "loss": 0.5605, + "epoch": 3.0, + "grad_norm": 10.35544204711914, + "learning_rate": 2.1688741721854305e-05, + "loss": 0.6127, "step": 1310 }, { - "epoch": 3.03, - "grad_norm": 6.77754545211792, - "learning_rate": 4.937207122774134e-05, - "loss": 0.5614, - "step": 1320 - }, - { - "epoch": 3.03, - "eval_accuracy": 0.7134670487106017, - "eval_loss": 0.5506558418273926, - "eval_runtime": 34.5926, - "eval_samples_per_second": 20.178, - "eval_steps_per_second": 1.706, + "epoch": 3.0, + "grad_norm": 6.637629508972168, + "learning_rate": 2.185430463576159e-05, + "loss": 0.445, "step": 1320 }, { - "epoch": 4.0, - "grad_norm": 4.854128360748291, - "learning_rate": 4.932521087160263e-05, - "loss": 0.5509, + "epoch": 3.0, + "grad_norm": 16.520090103149414, + "learning_rate": 2.2019867549668874e-05, + "loss": 0.3947, "step": 1330 }, { - "epoch": 4.0, - "grad_norm": 6.327508449554443, - "learning_rate": 4.927835051546392e-05, - "loss": 0.5109, + "epoch": 3.0, + "grad_norm": 17.7099609375, + "learning_rate": 2.218543046357616e-05, + "loss": 0.4481, "step": 1340 }, { - "epoch": 4.0, - "grad_norm": 5.180001258850098, - "learning_rate": 4.9231490159325214e-05, - "loss": 0.5522, + "epoch": 3.0, + "grad_norm": 10.103479385375977, + "learning_rate": 2.2350993377483446e-05, + "loss": 0.4481, "step": 1350 }, { - "epoch": 4.0, - "grad_norm": 6.649283409118652, - "learning_rate": 4.9184629803186506e-05, - "loss": 0.4932, + "epoch": 3.0, + "grad_norm": 8.171016693115234, + "learning_rate": 2.2516556291390732e-05, + "loss": 0.4555, "step": 1360 }, { - "epoch": 4.0, - "grad_norm": 4.401059150695801, - "learning_rate": 4.91377694470478e-05, - "loss": 0.5446, + "epoch": 3.0, + "grad_norm": 8.391425132751465, + "learning_rate": 2.2682119205298014e-05, + "loss": 0.3572, "step": 1370 }, { - "epoch": 4.01, - "grad_norm": 4.1620635986328125, - "learning_rate": 4.909090909090909e-05, - "loss": 0.4964, + "epoch": 3.0, + "grad_norm": 8.662993431091309, + "learning_rate": 2.28476821192053e-05, + "loss": 0.433, "step": 1380 }, { - "epoch": 4.01, - "grad_norm": 5.955656051635742, - "learning_rate": 4.904404873477039e-05, - "loss": 0.4685, + "epoch": 3.0, + "grad_norm": 16.419416427612305, + "learning_rate": 2.3013245033112586e-05, + "loss": 0.3873, "step": 1390 }, { - "epoch": 4.01, - "grad_norm": 2.9708614349365234, - "learning_rate": 4.899718837863168e-05, - "loss": 0.4874, + "epoch": 3.0, + "grad_norm": 17.038654327392578, + "learning_rate": 2.317880794701987e-05, + "loss": 0.4813, "step": 1400 }, { - "epoch": 4.01, - "grad_norm": 4.681861877441406, - "learning_rate": 4.8950328022492975e-05, - "loss": 0.5001, + "epoch": 3.0, + "grad_norm": 16.546653747558594, + "learning_rate": 2.3344370860927155e-05, + "loss": 0.3604, "step": 1410 }, { - "epoch": 4.01, - "grad_norm": 10.97381591796875, - "learning_rate": 4.890346766635427e-05, - "loss": 0.4289, + "epoch": 3.01, + "grad_norm": 7.0249738693237305, + "learning_rate": 2.3509933774834437e-05, + "loss": 0.394, "step": 1420 }, { - "epoch": 4.01, - "grad_norm": 4.6500349044799805, - "learning_rate": 4.885660731021556e-05, - "loss": 0.4725, + "epoch": 3.01, + "grad_norm": 6.531501293182373, + "learning_rate": 2.3675496688741723e-05, + "loss": 0.5181, "step": 1430 }, { - "epoch": 4.01, - "grad_norm": 7.461334705352783, - "learning_rate": 4.880974695407685e-05, - "loss": 0.5846, + "epoch": 3.01, + "grad_norm": 10.783075332641602, + "learning_rate": 2.3841059602649006e-05, + "loss": 0.5868, "step": 1440 }, { - "epoch": 4.01, - "grad_norm": 4.3084940910339355, - "learning_rate": 4.8762886597938144e-05, - "loss": 0.5864, + "epoch": 3.01, + "grad_norm": 5.320065975189209, + "learning_rate": 2.4006622516556292e-05, + "loss": 0.3903, "step": 1450 }, { - "epoch": 4.01, - "grad_norm": 8.88397216796875, - "learning_rate": 4.8716026241799436e-05, - "loss": 0.5392, + "epoch": 3.01, + "grad_norm": 7.875553131103516, + "learning_rate": 2.4172185430463578e-05, + "loss": 0.4643, "step": 1460 }, { - "epoch": 4.01, - "grad_norm": 6.398833274841309, - "learning_rate": 4.8669165885660735e-05, - "loss": 0.5501, + "epoch": 3.01, + "grad_norm": 10.691715240478516, + "learning_rate": 2.433774834437086e-05, + "loss": 0.543, "step": 1470 }, { - "epoch": 4.01, - "grad_norm": 6.638328552246094, - "learning_rate": 4.862230552952203e-05, - "loss": 0.5494, + "epoch": 3.01, + "grad_norm": 13.551742553710938, + "learning_rate": 2.4503311258278147e-05, + "loss": 0.4733, "step": 1480 }, { - "epoch": 4.01, - "grad_norm": 5.006091594696045, - "learning_rate": 4.857544517338332e-05, - "loss": 0.6367, + "epoch": 3.01, + "grad_norm": 13.485529899597168, + "learning_rate": 2.4668874172185433e-05, + "loss": 0.4886, "step": 1490 }, { - "epoch": 4.02, - "grad_norm": 5.209989547729492, - "learning_rate": 4.852858481724461e-05, - "loss": 0.5773, + "epoch": 3.01, + "grad_norm": 9.053728103637695, + "learning_rate": 2.4834437086092715e-05, + "loss": 0.4446, "step": 1500 }, { - "epoch": 4.02, - "grad_norm": 5.720169544219971, - "learning_rate": 4.8481724461105905e-05, - "loss": 0.5022, + "epoch": 3.01, + "grad_norm": 7.607123851776123, + "learning_rate": 2.5e-05, + "loss": 0.386, "step": 1510 }, { - "epoch": 4.02, - "grad_norm": 6.709425449371338, - "learning_rate": 4.84348641049672e-05, - "loss": 0.4968, + "epoch": 3.01, + "grad_norm": 11.253607749938965, + "learning_rate": 2.5165562913907287e-05, + "loss": 0.4385, "step": 1520 }, { - "epoch": 4.02, - "grad_norm": 6.903029918670654, - "learning_rate": 4.838800374882849e-05, - "loss": 0.5384, + "epoch": 3.01, + "grad_norm": 9.173261642456055, + "learning_rate": 2.5331125827814573e-05, + "loss": 0.5017, "step": 1530 }, { - "epoch": 4.02, - "grad_norm": 3.307621479034424, - "learning_rate": 4.834114339268979e-05, - "loss": 0.5675, + "epoch": 3.01, + "grad_norm": 13.1712646484375, + "learning_rate": 2.549668874172186e-05, + "loss": 0.3919, "step": 1540 }, { - "epoch": 4.02, - "grad_norm": 6.119607925415039, - "learning_rate": 4.829428303655108e-05, - "loss": 0.535, + "epoch": 3.01, + "grad_norm": 6.91953706741333, + "learning_rate": 2.566225165562914e-05, + "loss": 0.384, "step": 1550 }, { - "epoch": 4.02, - "grad_norm": 2.2412047386169434, - "learning_rate": 4.824742268041237e-05, - "loss": 0.4636, + "epoch": 3.01, + "grad_norm": 10.297747611999512, + "learning_rate": 2.5827814569536424e-05, + "loss": 0.525, "step": 1560 }, { - "epoch": 4.02, - "grad_norm": 3.960838794708252, - "learning_rate": 4.8200562324273665e-05, - "loss": 0.4383, + "epoch": 3.01, + "grad_norm": 8.570240020751953, + "learning_rate": 2.599337748344371e-05, + "loss": 0.5393, "step": 1570 }, { - "epoch": 4.02, - "grad_norm": 3.922579765319824, - "learning_rate": 4.8153701968134964e-05, - "loss": 0.4583, + "epoch": 3.01, + "grad_norm": 6.507657527923584, + "learning_rate": 2.6158940397350996e-05, + "loss": 0.4595, "step": 1580 }, { - "epoch": 4.02, - "grad_norm": 6.935645580291748, - "learning_rate": 4.810684161199626e-05, - "loss": 0.4757, + "epoch": 3.01, + "grad_norm": 6.807774066925049, + "learning_rate": 2.632450331125828e-05, + "loss": 0.5503, "step": 1590 }, { - "epoch": 4.02, - "grad_norm": 6.558530807495117, - "learning_rate": 4.805998125585754e-05, - "loss": 0.5097, + "epoch": 3.01, + "grad_norm": 4.914486885070801, + "learning_rate": 2.6490066225165565e-05, + "loss": 0.527, "step": 1600 }, { - "epoch": 4.02, - "grad_norm": 4.9284186363220215, - "learning_rate": 4.801312089971884e-05, - "loss": 0.5218, + "epoch": 3.01, + "grad_norm": 8.518081665039062, + "learning_rate": 2.6655629139072848e-05, + "loss": 0.4838, "step": 1610 }, { - "epoch": 4.03, - "grad_norm": 8.183663368225098, - "learning_rate": 4.7966260543580134e-05, - "loss": 0.4462, + "epoch": 3.01, + "grad_norm": 7.288885116577148, + "learning_rate": 2.6821192052980134e-05, + "loss": 0.4403, "step": 1620 }, { - "epoch": 4.03, - "grad_norm": 6.276401996612549, - "learning_rate": 4.7919400187441426e-05, - "loss": 0.7016, + "epoch": 3.01, + "grad_norm": 4.160616874694824, + "learning_rate": 2.6986754966887416e-05, + "loss": 0.3552, "step": 1630 }, { - "epoch": 4.03, - "grad_norm": 4.575777530670166, - "learning_rate": 4.787253983130272e-05, - "loss": 0.4815, + "epoch": 3.01, + "grad_norm": 13.016753196716309, + "learning_rate": 2.7152317880794702e-05, + "loss": 0.4487, "step": 1640 }, { - "epoch": 4.03, - "grad_norm": 2.3570098876953125, - "learning_rate": 4.782567947516402e-05, - "loss": 0.438, - "step": 1650 - }, - { - "epoch": 4.03, - "eval_accuracy": 0.7263610315186246, - "eval_loss": 0.5439700484275818, - "eval_runtime": 33.443, - "eval_samples_per_second": 20.871, - "eval_steps_per_second": 1.764, + "epoch": 3.01, + "grad_norm": 20.624540328979492, + "learning_rate": 2.7317880794701988e-05, + "loss": 0.4864, "step": 1650 }, { - "epoch": 5.0, - "grad_norm": 7.316202163696289, - "learning_rate": 4.777881911902531e-05, - "loss": 0.3581, + "epoch": 3.01, + "grad_norm": 11.90951919555664, + "learning_rate": 2.7483443708609274e-05, + "loss": 0.6142, "step": 1660 }, { - "epoch": 5.0, - "grad_norm": 6.561455249786377, - "learning_rate": 4.77319587628866e-05, - "loss": 0.6762, + "epoch": 3.01, + "grad_norm": 10.347407341003418, + "learning_rate": 2.764900662251656e-05, + "loss": 0.5207, "step": 1670 }, { - "epoch": 5.0, - "grad_norm": 3.9346327781677246, - "learning_rate": 4.7685098406747894e-05, - "loss": 0.4135, + "epoch": 3.01, + "grad_norm": 10.494477272033691, + "learning_rate": 2.781456953642384e-05, + "loss": 0.4483, "step": 1680 }, { - "epoch": 5.0, - "grad_norm": 3.1179192066192627, - "learning_rate": 4.7638238050609187e-05, - "loss": 0.4336, + "epoch": 3.01, + "eval_accuracy": 0.5957207207207207, + "eval_loss": 0.732318103313446, + "eval_runtime": 42.3337, + "eval_samples_per_second": 20.976, + "eval_steps_per_second": 1.748, + "step": 1680 + }, + { + "epoch": 4.0, + "grad_norm": 12.671928405761719, + "learning_rate": 2.7980132450331125e-05, + "loss": 0.444, "step": 1690 }, { - "epoch": 5.0, - "grad_norm": 6.217395305633545, - "learning_rate": 4.759137769447048e-05, - "loss": 0.4875, + "epoch": 4.0, + "grad_norm": 6.501286506652832, + "learning_rate": 2.814569536423841e-05, + "loss": 0.4082, "step": 1700 }, { - "epoch": 5.01, - "grad_norm": 5.203670978546143, - "learning_rate": 4.754451733833177e-05, - "loss": 0.4241, + "epoch": 4.0, + "grad_norm": 5.892317771911621, + "learning_rate": 2.8311258278145697e-05, + "loss": 0.3455, "step": 1710 }, { - "epoch": 5.01, - "grad_norm": 6.5499420166015625, - "learning_rate": 4.749765698219307e-05, - "loss": 0.4823, + "epoch": 4.0, + "grad_norm": 34.05948257446289, + "learning_rate": 2.8476821192052983e-05, + "loss": 0.39, "step": 1720 }, { - "epoch": 5.01, - "grad_norm": 4.842621803283691, - "learning_rate": 4.745079662605436e-05, - "loss": 0.3871, + "epoch": 4.0, + "grad_norm": 6.998101234436035, + "learning_rate": 2.864238410596027e-05, + "loss": 0.3866, "step": 1730 }, { - "epoch": 5.01, - "grad_norm": 5.887164115905762, - "learning_rate": 4.7403936269915655e-05, - "loss": 0.4075, + "epoch": 4.0, + "grad_norm": 11.206581115722656, + "learning_rate": 2.880794701986755e-05, + "loss": 0.4302, "step": 1740 }, { - "epoch": 5.01, - "grad_norm": 7.392530918121338, - "learning_rate": 4.735707591377695e-05, - "loss": 0.4504, + "epoch": 4.0, + "grad_norm": 19.16914176940918, + "learning_rate": 2.8973509933774834e-05, + "loss": 0.5607, "step": 1750 }, { - "epoch": 5.01, - "grad_norm": 6.812521934509277, - "learning_rate": 4.731021555763824e-05, - "loss": 0.5216, + "epoch": 4.0, + "grad_norm": 14.35991096496582, + "learning_rate": 2.913907284768212e-05, + "loss": 0.5429, "step": 1760 }, { - "epoch": 5.01, - "grad_norm": 5.895148754119873, - "learning_rate": 4.726335520149953e-05, - "loss": 0.455, + "epoch": 4.0, + "grad_norm": 8.55944538116455, + "learning_rate": 2.9304635761589406e-05, + "loss": 0.3615, "step": 1770 }, { - "epoch": 5.01, - "grad_norm": 7.757108688354492, - "learning_rate": 4.7216494845360824e-05, - "loss": 0.488, + "epoch": 4.0, + "grad_norm": 9.817693710327148, + "learning_rate": 2.9470198675496692e-05, + "loss": 0.383, "step": 1780 }, { - "epoch": 5.01, - "grad_norm": 9.14256477355957, - "learning_rate": 4.7169634489222116e-05, - "loss": 0.5042, + "epoch": 4.0, + "grad_norm": 3.8304190635681152, + "learning_rate": 2.9635761589403975e-05, + "loss": 0.5135, "step": 1790 }, { - "epoch": 5.01, - "grad_norm": 5.17966890335083, - "learning_rate": 4.7122774133083416e-05, - "loss": 0.4534, + "epoch": 4.0, + "grad_norm": 3.691169023513794, + "learning_rate": 2.980132450331126e-05, + "loss": 0.4897, "step": 1800 }, { - "epoch": 5.01, - "grad_norm": 6.081084728240967, - "learning_rate": 4.707591377694471e-05, - "loss": 0.4221, + "epoch": 4.0, + "grad_norm": 9.651874542236328, + "learning_rate": 2.9966887417218544e-05, + "loss": 0.4799, "step": 1810 }, { - "epoch": 5.01, - "grad_norm": 3.0460875034332275, - "learning_rate": 4.7029053420806e-05, - "loss": 0.4783, + "epoch": 4.0, + "grad_norm": 6.509405136108398, + "learning_rate": 3.0132450331125826e-05, + "loss": 0.459, "step": 1820 }, { - "epoch": 5.02, - "grad_norm": 5.692960262298584, - "learning_rate": 4.698219306466729e-05, - "loss": 0.5559, + "epoch": 4.0, + "grad_norm": 7.874476909637451, + "learning_rate": 3.0298013245033112e-05, + "loss": 0.4221, "step": 1830 }, { - "epoch": 5.02, - "grad_norm": 5.824710845947266, - "learning_rate": 4.6935332708528585e-05, - "loss": 0.486, + "epoch": 4.01, + "grad_norm": 3.1665453910827637, + "learning_rate": 3.0463576158940398e-05, + "loss": 0.3891, "step": 1840 }, { - "epoch": 5.02, - "grad_norm": 8.523110389709473, - "learning_rate": 4.688847235238988e-05, - "loss": 0.4036, + "epoch": 4.01, + "grad_norm": 7.245667934417725, + "learning_rate": 3.062913907284769e-05, + "loss": 0.4019, "step": 1850 }, { - "epoch": 5.02, - "grad_norm": 13.942612648010254, - "learning_rate": 4.684161199625117e-05, - "loss": 0.4903, + "epoch": 4.01, + "grad_norm": 10.771611213684082, + "learning_rate": 3.079470198675497e-05, + "loss": 0.4505, "step": 1860 }, { - "epoch": 5.02, - "grad_norm": 6.322441577911377, - "learning_rate": 4.679475164011247e-05, - "loss": 0.4573, + "epoch": 4.01, + "grad_norm": 8.632268905639648, + "learning_rate": 3.096026490066225e-05, + "loss": 0.4706, "step": 1870 }, { - "epoch": 5.02, - "grad_norm": 3.1088461875915527, - "learning_rate": 4.674789128397376e-05, - "loss": 0.3915, + "epoch": 4.01, + "grad_norm": 7.70095682144165, + "learning_rate": 3.1125827814569535e-05, + "loss": 0.3641, "step": 1880 }, { - "epoch": 5.02, - "grad_norm": 7.44355583190918, - "learning_rate": 4.670103092783505e-05, - "loss": 0.4966, + "epoch": 4.01, + "grad_norm": 9.165621757507324, + "learning_rate": 3.1291390728476825e-05, + "loss": 0.4147, "step": 1890 }, { - "epoch": 5.02, - "grad_norm": 5.484576225280762, - "learning_rate": 4.6654170571696345e-05, - "loss": 0.4336, + "epoch": 4.01, + "grad_norm": 7.8730058670043945, + "learning_rate": 3.145695364238411e-05, + "loss": 0.4285, "step": 1900 }, { - "epoch": 5.02, - "grad_norm": 6.856348037719727, - "learning_rate": 4.6607310215557645e-05, - "loss": 0.5151, + "epoch": 4.01, + "grad_norm": 11.200624465942383, + "learning_rate": 3.162251655629139e-05, + "loss": 0.436, "step": 1910 }, { - "epoch": 5.02, - "grad_norm": 6.7076849937438965, - "learning_rate": 4.656044985941894e-05, - "loss": 0.4332, + "epoch": 4.01, + "grad_norm": 12.971415519714355, + "learning_rate": 3.178807947019868e-05, + "loss": 0.4843, "step": 1920 }, { - "epoch": 5.02, - "grad_norm": 6.39366340637207, - "learning_rate": 4.651358950328022e-05, - "loss": 0.4858, + "epoch": 4.01, + "grad_norm": 5.6903076171875, + "learning_rate": 3.195364238410596e-05, + "loss": 0.4054, "step": 1930 }, { - "epoch": 5.02, - "grad_norm": 7.002589225769043, - "learning_rate": 4.646672914714152e-05, - "loss": 0.4684, + "epoch": 4.01, + "grad_norm": 6.505043029785156, + "learning_rate": 3.2119205298013244e-05, + "loss": 0.4321, "step": 1940 }, { - "epoch": 5.03, - "grad_norm": 10.67752456665039, - "learning_rate": 4.6419868791002814e-05, - "loss": 0.3976, + "epoch": 4.01, + "grad_norm": 12.978421211242676, + "learning_rate": 3.228476821192053e-05, + "loss": 0.4137, "step": 1950 }, { - "epoch": 5.03, - "grad_norm": 12.617927551269531, - "learning_rate": 4.6373008434864106e-05, - "loss": 0.4468, + "epoch": 4.01, + "grad_norm": 6.857866287231445, + "learning_rate": 3.2450331125827816e-05, + "loss": 0.365, "step": 1960 }, { - "epoch": 5.03, - "grad_norm": 6.047290802001953, - "learning_rate": 4.63261480787254e-05, - "loss": 0.5431, + "epoch": 4.01, + "grad_norm": 5.074346542358398, + "learning_rate": 3.26158940397351e-05, + "loss": 0.5182, "step": 1970 }, { - "epoch": 5.03, - "grad_norm": 6.107527256011963, - "learning_rate": 4.62792877225867e-05, - "loss": 0.4569, - "step": 1980 - }, - { - "epoch": 5.03, - "eval_accuracy": 0.7277936962750716, - "eval_loss": 0.5531865954399109, - "eval_runtime": 34.546, - "eval_samples_per_second": 20.205, - "eval_steps_per_second": 1.708, + "epoch": 4.01, + "grad_norm": 7.229710102081299, + "learning_rate": 3.278145695364239e-05, + "loss": 0.4266, "step": 1980 }, { - "epoch": 6.0, - "grad_norm": 5.367207050323486, - "learning_rate": 4.623242736644799e-05, - "loss": 0.37, + "epoch": 4.01, + "grad_norm": 18.746814727783203, + "learning_rate": 3.294701986754967e-05, + "loss": 0.4848, "step": 1990 }, { - "epoch": 6.0, - "grad_norm": 5.771437644958496, - "learning_rate": 4.618556701030928e-05, - "loss": 0.3602, + "epoch": 4.01, + "grad_norm": 6.558658599853516, + "learning_rate": 3.311258278145696e-05, + "loss": 0.439, "step": 2000 }, { - "epoch": 6.0, - "grad_norm": 9.664875984191895, - "learning_rate": 4.6138706654170575e-05, - "loss": 0.3343, + "epoch": 4.01, + "grad_norm": 15.033697128295898, + "learning_rate": 3.3278145695364236e-05, + "loss": 0.368, "step": 2010 }, { - "epoch": 6.0, - "grad_norm": 6.788029193878174, - "learning_rate": 4.609184629803187e-05, - "loss": 0.4078, + "epoch": 4.01, + "grad_norm": 12.098657608032227, + "learning_rate": 3.3443708609271526e-05, + "loss": 0.3267, "step": 2020 }, { - "epoch": 6.0, - "grad_norm": 7.247564315795898, - "learning_rate": 4.604498594189316e-05, - "loss": 0.4155, + "epoch": 4.01, + "grad_norm": 5.617885112762451, + "learning_rate": 3.360927152317881e-05, + "loss": 0.334, "step": 2030 }, { - "epoch": 6.01, - "grad_norm": 4.327235698699951, - "learning_rate": 4.599812558575445e-05, - "loss": 0.3966, + "epoch": 4.01, + "grad_norm": 19.70214080810547, + "learning_rate": 3.37748344370861e-05, + "loss": 0.4613, "step": 2040 }, { - "epoch": 6.01, - "grad_norm": 7.491323947906494, - "learning_rate": 4.595126522961575e-05, - "loss": 0.349, + "epoch": 4.01, + "grad_norm": 6.696241855621338, + "learning_rate": 3.394039735099338e-05, + "loss": 0.3222, "step": 2050 }, { - "epoch": 6.01, - "grad_norm": 9.212530136108398, - "learning_rate": 4.590440487347704e-05, - "loss": 0.4265, + "epoch": 4.01, + "grad_norm": 9.129453659057617, + "learning_rate": 3.410596026490066e-05, + "loss": 0.3567, "step": 2060 }, { - "epoch": 6.01, - "grad_norm": 13.466367721557617, - "learning_rate": 4.5857544517338335e-05, - "loss": 0.4375, + "epoch": 4.01, + "grad_norm": 9.57669448852539, + "learning_rate": 3.4271523178807945e-05, + "loss": 0.469, "step": 2070 }, { - "epoch": 6.01, - "grad_norm": 7.694874286651611, - "learning_rate": 4.581068416119963e-05, - "loss": 0.3759, + "epoch": 4.01, + "grad_norm": 9.590263366699219, + "learning_rate": 3.4437086092715235e-05, + "loss": 0.3506, "step": 2080 }, { - "epoch": 6.01, - "grad_norm": 8.024492263793945, - "learning_rate": 4.576382380506092e-05, - "loss": 0.3936, + "epoch": 4.01, + "grad_norm": 6.603126049041748, + "learning_rate": 3.460264900662252e-05, + "loss": 0.4482, "step": 2090 }, { - "epoch": 6.01, - "grad_norm": 5.053406238555908, - "learning_rate": 4.571696344892221e-05, - "loss": 0.3063, + "epoch": 4.01, + "grad_norm": 31.09659767150879, + "learning_rate": 3.47682119205298e-05, + "loss": 0.4654, "step": 2100 }, { - "epoch": 6.01, - "grad_norm": 7.138169288635254, - "learning_rate": 4.5670103092783504e-05, - "loss": 0.4692, + "epoch": 4.01, + "eval_accuracy": 0.6486486486486487, + "eval_loss": 0.7019912600517273, + "eval_runtime": 42.8716, + "eval_samples_per_second": 20.713, + "eval_steps_per_second": 1.726, + "step": 2100 + }, + { + "epoch": 5.0, + "grad_norm": 5.189599990844727, + "learning_rate": 3.493377483443709e-05, + "loss": 0.3112, "step": 2110 }, { - "epoch": 6.01, - "grad_norm": 3.2747809886932373, - "learning_rate": 4.56232427366448e-05, - "loss": 0.3421, + "epoch": 5.0, + "grad_norm": 14.994132995605469, + "learning_rate": 3.509933774834437e-05, + "loss": 0.4296, "step": 2120 }, { - "epoch": 6.01, - "grad_norm": 5.716726303100586, - "learning_rate": 4.5576382380506096e-05, - "loss": 0.4158, + "epoch": 5.0, + "grad_norm": 5.689457893371582, + "learning_rate": 3.526490066225166e-05, + "loss": 0.2943, "step": 2130 }, { - "epoch": 6.01, - "grad_norm": 5.238852500915527, - "learning_rate": 4.552952202436739e-05, - "loss": 0.3768, + "epoch": 5.0, + "grad_norm": 5.469167232513428, + "learning_rate": 3.543046357615894e-05, + "loss": 0.324, "step": 2140 }, { - "epoch": 6.01, - "grad_norm": 9.766545295715332, - "learning_rate": 4.548266166822868e-05, - "loss": 0.3791, + "epoch": 5.0, + "grad_norm": 9.89396858215332, + "learning_rate": 3.5596026490066226e-05, + "loss": 0.3203, "step": 2150 }, { - "epoch": 6.02, - "grad_norm": 3.8693816661834717, - "learning_rate": 4.543580131208997e-05, - "loss": 0.334, + "epoch": 5.0, + "grad_norm": 15.105340957641602, + "learning_rate": 3.576158940397351e-05, + "loss": 0.4793, "step": 2160 }, { - "epoch": 6.02, - "grad_norm": 9.960877418518066, - "learning_rate": 4.538894095595127e-05, - "loss": 0.4487, + "epoch": 5.0, + "grad_norm": 15.274898529052734, + "learning_rate": 3.59271523178808e-05, + "loss": 0.3326, "step": 2170 }, { - "epoch": 6.02, - "grad_norm": 4.830340385437012, - "learning_rate": 4.534208059981256e-05, - "loss": 0.4052, + "epoch": 5.0, + "grad_norm": 6.226844310760498, + "learning_rate": 3.609271523178808e-05, + "loss": 0.4929, "step": 2180 }, { - "epoch": 6.02, - "grad_norm": 4.533899307250977, - "learning_rate": 4.529522024367385e-05, - "loss": 0.3292, + "epoch": 5.0, + "grad_norm": 5.726044654846191, + "learning_rate": 3.625827814569537e-05, + "loss": 0.3783, "step": 2190 }, { - "epoch": 6.02, - "grad_norm": 4.12489652633667, - "learning_rate": 4.524835988753515e-05, - "loss": 0.342, + "epoch": 5.0, + "grad_norm": 9.240283966064453, + "learning_rate": 3.6423841059602646e-05, + "loss": 0.4878, "step": 2200 }, { - "epoch": 6.02, - "grad_norm": 7.824032306671143, - "learning_rate": 4.520149953139644e-05, - "loss": 0.4713, + "epoch": 5.0, + "grad_norm": 7.1570587158203125, + "learning_rate": 3.6589403973509936e-05, + "loss": 0.3352, "step": 2210 }, { - "epoch": 6.02, - "grad_norm": 6.992900371551514, - "learning_rate": 4.5154639175257733e-05, - "loss": 0.4517, + "epoch": 5.0, + "grad_norm": 8.965234756469727, + "learning_rate": 3.675496688741722e-05, + "loss": 0.4149, "step": 2220 }, { - "epoch": 6.02, - "grad_norm": 2.9177560806274414, - "learning_rate": 4.5107778819119026e-05, - "loss": 0.346, + "epoch": 5.0, + "grad_norm": 4.667807579040527, + "learning_rate": 3.692052980132451e-05, + "loss": 0.2894, "step": 2230 }, { - "epoch": 6.02, - "grad_norm": 5.294494152069092, - "learning_rate": 4.5060918462980325e-05, - "loss": 0.4555, + "epoch": 5.0, + "grad_norm": 13.140071868896484, + "learning_rate": 3.708609271523179e-05, + "loss": 0.3437, "step": 2240 }, { - "epoch": 6.02, - "grad_norm": 3.4280591011047363, - "learning_rate": 4.501405810684162e-05, - "loss": 0.3906, + "epoch": 5.0, + "grad_norm": 6.775124549865723, + "learning_rate": 3.725165562913907e-05, + "loss": 0.2888, "step": 2250 }, { - "epoch": 6.02, - "grad_norm": 5.682974338531494, - "learning_rate": 4.49671977507029e-05, - "loss": 0.3501, + "epoch": 5.01, + "grad_norm": 4.836766719818115, + "learning_rate": 3.741721854304636e-05, + "loss": 0.3891, "step": 2260 }, { - "epoch": 6.02, - "grad_norm": 14.726093292236328, - "learning_rate": 4.49203373945642e-05, - "loss": 0.6106, + "epoch": 5.01, + "grad_norm": 13.269258499145508, + "learning_rate": 3.7582781456953645e-05, + "loss": 0.324, "step": 2270 }, { - "epoch": 6.03, - "grad_norm": 6.266767501831055, - "learning_rate": 4.4873477038425494e-05, - "loss": 0.4206, + "epoch": 5.01, + "grad_norm": 9.380875587463379, + "learning_rate": 3.774834437086093e-05, + "loss": 0.4917, "step": 2280 }, { - "epoch": 6.03, - "grad_norm": 5.161693572998047, - "learning_rate": 4.4826616682286786e-05, - "loss": 0.5073, + "epoch": 5.01, + "grad_norm": 6.058047294616699, + "learning_rate": 3.791390728476821e-05, + "loss": 0.4402, "step": 2290 }, { - "epoch": 6.03, - "grad_norm": 3.678534746170044, - "learning_rate": 4.477975632614808e-05, - "loss": 0.4903, + "epoch": 5.01, + "grad_norm": 10.489774703979492, + "learning_rate": 3.80794701986755e-05, + "loss": 0.4674, "step": 2300 }, { - "epoch": 6.03, - "grad_norm": 3.0561232566833496, - "learning_rate": 4.473289597000938e-05, - "loss": 0.3614, - "step": 2310 - }, - { - "epoch": 6.03, - "eval_accuracy": 0.7363896848137536, - "eval_loss": 0.5283002853393555, - "eval_runtime": 34.2167, - "eval_samples_per_second": 20.399, - "eval_steps_per_second": 1.724, + "epoch": 5.01, + "grad_norm": 6.385855674743652, + "learning_rate": 3.824503311258278e-05, + "loss": 0.3657, "step": 2310 }, { - "epoch": 7.0, - "grad_norm": 4.80795431137085, - "learning_rate": 4.468603561387067e-05, - "loss": 0.342, + "epoch": 5.01, + "grad_norm": 11.147357940673828, + "learning_rate": 3.841059602649007e-05, + "loss": 0.3963, "step": 2320 }, { - "epoch": 7.0, - "grad_norm": 5.0758748054504395, - "learning_rate": 4.463917525773196e-05, - "loss": 0.3799, + "epoch": 5.01, + "grad_norm": 8.97010612487793, + "learning_rate": 3.8576158940397354e-05, + "loss": 0.3673, "step": 2330 }, { - "epoch": 7.0, - "grad_norm": 7.944527626037598, - "learning_rate": 4.4592314901593255e-05, - "loss": 0.3295, + "epoch": 5.01, + "grad_norm": 13.971797943115234, + "learning_rate": 3.8741721854304637e-05, + "loss": 0.3182, "step": 2340 }, { - "epoch": 7.0, - "grad_norm": 8.439969062805176, - "learning_rate": 4.454545454545455e-05, - "loss": 0.4199, + "epoch": 5.01, + "grad_norm": 12.762605667114258, + "learning_rate": 3.890728476821192e-05, + "loss": 0.3467, "step": 2350 }, { - "epoch": 7.0, - "grad_norm": 3.974719762802124, - "learning_rate": 4.449859418931584e-05, - "loss": 0.3785, + "epoch": 5.01, + "grad_norm": 18.55424690246582, + "learning_rate": 3.907284768211921e-05, + "loss": 0.3063, "step": 2360 }, { - "epoch": 7.01, - "grad_norm": 6.712285995483398, - "learning_rate": 4.445173383317713e-05, - "loss": 0.3202, + "epoch": 5.01, + "grad_norm": 20.252593994140625, + "learning_rate": 3.923841059602649e-05, + "loss": 0.334, "step": 2370 }, { - "epoch": 7.01, - "grad_norm": 0.3726706802845001, - "learning_rate": 4.440487347703843e-05, - "loss": 0.1409, + "epoch": 5.01, + "grad_norm": 9.72482967376709, + "learning_rate": 3.940397350993378e-05, + "loss": 0.4933, "step": 2380 }, { - "epoch": 7.01, - "grad_norm": 9.558029174804688, - "learning_rate": 4.435801312089972e-05, - "loss": 0.4147, + "epoch": 5.01, + "grad_norm": 5.5373663902282715, + "learning_rate": 3.956953642384106e-05, + "loss": 0.4095, "step": 2390 }, { - "epoch": 7.01, - "grad_norm": 3.2590224742889404, - "learning_rate": 4.4311152764761015e-05, - "loss": 0.2469, + "epoch": 5.01, + "grad_norm": 5.389358043670654, + "learning_rate": 3.9735099337748346e-05, + "loss": 0.4269, "step": 2400 }, { - "epoch": 7.01, - "grad_norm": 8.232924461364746, - "learning_rate": 4.426429240862231e-05, - "loss": 0.229, + "epoch": 5.01, + "grad_norm": 9.32028865814209, + "learning_rate": 3.990066225165563e-05, + "loss": 0.2847, "step": 2410 }, { - "epoch": 7.01, - "grad_norm": 11.747199058532715, - "learning_rate": 4.42174320524836e-05, - "loss": 0.3219, + "epoch": 5.01, + "grad_norm": 10.698759078979492, + "learning_rate": 4.006622516556292e-05, + "loss": 0.3018, "step": 2420 }, { - "epoch": 7.01, - "grad_norm": 12.6619234085083, - "learning_rate": 4.417057169634489e-05, - "loss": 0.4654, + "epoch": 5.01, + "grad_norm": 9.361750602722168, + "learning_rate": 4.02317880794702e-05, + "loss": 0.3396, "step": 2430 }, { - "epoch": 7.01, - "grad_norm": 10.572305679321289, - "learning_rate": 4.4123711340206185e-05, - "loss": 0.4145, + "epoch": 5.01, + "grad_norm": 7.311129093170166, + "learning_rate": 4.039735099337749e-05, + "loss": 0.4454, "step": 2440 }, { - "epoch": 7.01, - "grad_norm": 7.648813247680664, - "learning_rate": 4.407685098406748e-05, - "loss": 0.3939, + "epoch": 5.01, + "grad_norm": 8.57088851928711, + "learning_rate": 4.056291390728477e-05, + "loss": 0.3589, "step": 2450 }, { - "epoch": 7.01, - "grad_norm": 7.876673698425293, - "learning_rate": 4.4029990627928776e-05, - "loss": 0.4002, + "epoch": 5.01, + "grad_norm": 5.872406482696533, + "learning_rate": 4.0728476821192055e-05, + "loss": 0.3914, "step": 2460 }, { - "epoch": 7.01, - "grad_norm": 2.9009244441986084, - "learning_rate": 4.398313027179007e-05, - "loss": 0.3665, + "epoch": 5.01, + "grad_norm": 3.2365334033966064, + "learning_rate": 4.089403973509934e-05, + "loss": 0.4405, "step": 2470 }, { - "epoch": 7.01, - "grad_norm": 4.1332597732543945, - "learning_rate": 4.393626991565136e-05, - "loss": 0.315, + "epoch": 5.01, + "grad_norm": 11.49345874786377, + "learning_rate": 4.105960264900662e-05, + "loss": 0.3505, "step": 2480 }, { - "epoch": 7.02, - "grad_norm": 8.535113334655762, - "learning_rate": 4.388940955951265e-05, - "loss": 0.3637, + "epoch": 5.01, + "grad_norm": 11.595232963562012, + "learning_rate": 4.122516556291391e-05, + "loss": 0.3939, "step": 2490 }, { - "epoch": 7.02, - "grad_norm": 6.837998390197754, - "learning_rate": 4.384254920337395e-05, - "loss": 0.2805, + "epoch": 5.01, + "grad_norm": 10.280562400817871, + "learning_rate": 4.139072847682119e-05, + "loss": 0.433, "step": 2500 }, { - "epoch": 7.02, - "grad_norm": 9.94541072845459, - "learning_rate": 4.379568884723524e-05, - "loss": 0.3765, + "epoch": 5.01, + "grad_norm": 8.561470031738281, + "learning_rate": 4.155629139072848e-05, + "loss": 0.3621, "step": 2510 }, { - "epoch": 7.02, - "grad_norm": 8.577096939086914, - "learning_rate": 4.374882849109653e-05, - "loss": 0.3092, + "epoch": 5.01, + "grad_norm": 11.474127769470215, + "learning_rate": 4.1721854304635764e-05, + "loss": 0.3897, + "step": 2520 + }, + { + "epoch": 5.01, + "eval_accuracy": 0.6497747747747747, + "eval_loss": 0.7635564208030701, + "eval_runtime": 44.4945, + "eval_samples_per_second": 19.958, + "eval_steps_per_second": 1.663, "step": 2520 }, { - "epoch": 7.02, - "grad_norm": 4.8793253898620605, - "learning_rate": 4.370196813495783e-05, - "loss": 0.2715, + "epoch": 6.0, + "grad_norm": 10.609094619750977, + "learning_rate": 4.1887417218543047e-05, + "loss": 0.3992, "step": 2530 }, { - "epoch": 7.02, - "grad_norm": 6.965157508850098, - "learning_rate": 4.365510777881912e-05, - "loss": 0.4944, + "epoch": 6.0, + "grad_norm": 3.468280076980591, + "learning_rate": 4.205298013245033e-05, + "loss": 0.3226, "step": 2540 }, { - "epoch": 7.02, - "grad_norm": 2.5626721382141113, - "learning_rate": 4.3608247422680414e-05, - "loss": 0.311, + "epoch": 6.0, + "grad_norm": 12.370600700378418, + "learning_rate": 4.221854304635762e-05, + "loss": 0.3053, "step": 2550 }, { - "epoch": 7.02, - "grad_norm": 5.375543594360352, - "learning_rate": 4.3561387066541706e-05, - "loss": 0.4302, + "epoch": 6.0, + "grad_norm": 9.363292694091797, + "learning_rate": 4.23841059602649e-05, + "loss": 0.4458, "step": 2560 }, { - "epoch": 7.02, - "grad_norm": 8.762489318847656, - "learning_rate": 4.3514526710403005e-05, - "loss": 0.2687, + "epoch": 6.0, + "grad_norm": 7.07127571105957, + "learning_rate": 4.254966887417219e-05, + "loss": 0.317, "step": 2570 }, { - "epoch": 7.02, - "grad_norm": 6.056619167327881, - "learning_rate": 4.34676663542643e-05, - "loss": 0.3575, + "epoch": 6.0, + "grad_norm": 10.81989574432373, + "learning_rate": 4.271523178807947e-05, + "loss": 0.4011, "step": 2580 }, { - "epoch": 7.02, - "grad_norm": 5.398069858551025, - "learning_rate": 4.342080599812558e-05, - "loss": 0.2445, + "epoch": 6.0, + "grad_norm": 11.262990951538086, + "learning_rate": 4.288079470198676e-05, + "loss": 0.2711, "step": 2590 }, { - "epoch": 7.02, - "grad_norm": 6.936893939971924, - "learning_rate": 4.337394564198688e-05, - "loss": 0.3377, + "epoch": 6.0, + "grad_norm": 24.4318790435791, + "learning_rate": 4.304635761589404e-05, + "loss": 0.362, "step": 2600 }, { - "epoch": 7.03, - "grad_norm": 6.378572463989258, - "learning_rate": 4.3327085285848174e-05, - "loss": 0.3935, + "epoch": 6.0, + "grad_norm": 15.196390151977539, + "learning_rate": 4.321192052980133e-05, + "loss": 0.3314, "step": 2610 }, { - "epoch": 7.03, - "grad_norm": 15.028035163879395, - "learning_rate": 4.328022492970947e-05, - "loss": 0.238, + "epoch": 6.0, + "grad_norm": 10.590712547302246, + "learning_rate": 4.337748344370861e-05, + "loss": 0.3502, "step": 2620 }, { - "epoch": 7.03, - "grad_norm": 4.001184463500977, - "learning_rate": 4.323336457357076e-05, - "loss": 0.3672, + "epoch": 6.0, + "grad_norm": 9.361169815063477, + "learning_rate": 4.35430463576159e-05, + "loss": 0.3697, "step": 2630 }, { - "epoch": 7.03, - "grad_norm": 0.5959993600845337, - "learning_rate": 4.318650421743206e-05, - "loss": 0.3514, - "step": 2640 - }, - { - "epoch": 7.03, - "eval_accuracy": 0.7406876790830945, - "eval_loss": 0.6699703335762024, - "eval_runtime": 33.8438, - "eval_samples_per_second": 20.624, - "eval_steps_per_second": 1.743, + "epoch": 6.0, + "grad_norm": 11.022934913635254, + "learning_rate": 4.370860927152318e-05, + "loss": 0.3217, "step": 2640 }, { - "epoch": 8.0, - "grad_norm": 6.682882308959961, - "learning_rate": 4.313964386129335e-05, - "loss": 0.3878, + "epoch": 6.0, + "grad_norm": 9.60159683227539, + "learning_rate": 4.3874172185430465e-05, + "loss": 0.368, "step": 2650 }, { - "epoch": 8.0, - "grad_norm": 8.971117973327637, - "learning_rate": 4.309278350515464e-05, - "loss": 0.3186, + "epoch": 6.0, + "grad_norm": 15.481955528259277, + "learning_rate": 4.403973509933775e-05, + "loss": 0.3612, "step": 2660 }, { - "epoch": 8.0, - "grad_norm": 7.282140731811523, - "learning_rate": 4.3045923149015935e-05, - "loss": 0.2842, + "epoch": 6.0, + "grad_norm": 15.360217094421387, + "learning_rate": 4.420529801324503e-05, + "loss": 0.2848, "step": 2670 }, { - "epoch": 8.0, - "grad_norm": 10.455528259277344, - "learning_rate": 4.299906279287723e-05, - "loss": 0.3175, + "epoch": 6.01, + "grad_norm": 10.333962440490723, + "learning_rate": 4.437086092715232e-05, + "loss": 0.2794, "step": 2680 }, { - "epoch": 8.0, - "grad_norm": 8.218666076660156, - "learning_rate": 4.295220243673852e-05, - "loss": 0.3408, + "epoch": 6.01, + "grad_norm": 16.07330894470215, + "learning_rate": 4.45364238410596e-05, + "loss": 0.4981, "step": 2690 }, { - "epoch": 8.01, - "grad_norm": 8.701309204101562, - "learning_rate": 4.290534208059981e-05, - "loss": 0.2705, + "epoch": 6.01, + "grad_norm": 3.810781955718994, + "learning_rate": 4.470198675496689e-05, + "loss": 0.3905, "step": 2700 }, { - "epoch": 8.01, - "grad_norm": 7.51228666305542, - "learning_rate": 4.285848172446111e-05, - "loss": 0.3086, + "epoch": 6.01, + "grad_norm": 7.821160793304443, + "learning_rate": 4.4867549668874174e-05, + "loss": 0.3131, "step": 2710 }, { - "epoch": 8.01, - "grad_norm": 1.298295497894287, - "learning_rate": 4.2811621368322403e-05, - "loss": 0.2298, + "epoch": 6.01, + "grad_norm": 14.027177810668945, + "learning_rate": 4.5033112582781463e-05, + "loss": 0.3232, "step": 2720 }, { - "epoch": 8.01, - "grad_norm": 9.289319038391113, - "learning_rate": 4.2764761012183696e-05, - "loss": 0.2307, + "epoch": 6.01, + "grad_norm": 17.835529327392578, + "learning_rate": 4.519867549668874e-05, + "loss": 0.4435, "step": 2730 }, { - "epoch": 8.01, - "grad_norm": 10.216160774230957, - "learning_rate": 4.271790065604499e-05, - "loss": 0.2715, + "epoch": 6.01, + "grad_norm": 7.726228713989258, + "learning_rate": 4.536423841059603e-05, + "loss": 0.2627, "step": 2740 }, { - "epoch": 8.01, - "grad_norm": 0.6326087117195129, - "learning_rate": 4.267104029990628e-05, - "loss": 0.224, + "epoch": 6.01, + "grad_norm": 13.054713249206543, + "learning_rate": 4.552980132450331e-05, + "loss": 0.3224, "step": 2750 }, { - "epoch": 8.01, - "grad_norm": 0.4399392306804657, - "learning_rate": 4.262417994376757e-05, - "loss": 0.2329, + "epoch": 6.01, + "grad_norm": 8.975753784179688, + "learning_rate": 4.56953642384106e-05, + "loss": 0.3713, "step": 2760 }, { - "epoch": 8.01, - "grad_norm": 18.67351722717285, - "learning_rate": 4.2577319587628865e-05, - "loss": 0.3202, + "epoch": 6.01, + "grad_norm": 15.151286125183105, + "learning_rate": 4.586092715231788e-05, + "loss": 0.494, "step": 2770 }, { - "epoch": 8.01, - "grad_norm": 11.18542194366455, - "learning_rate": 4.253045923149016e-05, - "loss": 0.3705, + "epoch": 6.01, + "grad_norm": 6.243801116943359, + "learning_rate": 4.602649006622517e-05, + "loss": 0.3353, "step": 2780 }, { - "epoch": 8.01, - "grad_norm": 1.94206702709198, - "learning_rate": 4.2483598875351456e-05, - "loss": 0.4143, + "epoch": 6.01, + "grad_norm": 8.108068466186523, + "learning_rate": 4.6192052980132455e-05, + "loss": 0.4029, "step": 2790 }, { - "epoch": 8.01, - "grad_norm": 6.226709365844727, - "learning_rate": 4.243673851921275e-05, - "loss": 0.31, + "epoch": 6.01, + "grad_norm": 13.38729190826416, + "learning_rate": 4.635761589403974e-05, + "loss": 0.3122, "step": 2800 }, { - "epoch": 8.01, - "grad_norm": 14.290252685546875, - "learning_rate": 4.238987816307404e-05, - "loss": 0.2697, + "epoch": 6.01, + "grad_norm": 9.62626838684082, + "learning_rate": 4.652317880794702e-05, + "loss": 0.3107, "step": 2810 }, { - "epoch": 8.02, - "grad_norm": 14.262323379516602, - "learning_rate": 4.234301780693533e-05, - "loss": 0.3874, + "epoch": 6.01, + "grad_norm": 8.30498218536377, + "learning_rate": 4.668874172185431e-05, + "loss": 0.3055, "step": 2820 }, { - "epoch": 8.02, - "grad_norm": 9.656635284423828, - "learning_rate": 4.229615745079663e-05, - "loss": 0.2919, + "epoch": 6.01, + "grad_norm": 8.951045989990234, + "learning_rate": 4.685430463576159e-05, + "loss": 0.3859, "step": 2830 }, { - "epoch": 8.02, - "grad_norm": 3.8038530349731445, - "learning_rate": 4.224929709465792e-05, - "loss": 0.3939, + "epoch": 6.01, + "grad_norm": 5.538322448730469, + "learning_rate": 4.7019867549668875e-05, + "loss": 0.3647, "step": 2840 }, { - "epoch": 8.02, - "grad_norm": 3.729642152786255, - "learning_rate": 4.220243673851921e-05, - "loss": 0.2423, + "epoch": 6.01, + "grad_norm": 6.897611618041992, + "learning_rate": 4.7185430463576164e-05, + "loss": 0.3203, "step": 2850 }, { - "epoch": 8.02, - "grad_norm": 4.79857873916626, - "learning_rate": 4.215557638238051e-05, - "loss": 0.3128, + "epoch": 6.01, + "grad_norm": 5.254495620727539, + "learning_rate": 4.735099337748345e-05, + "loss": 0.4124, "step": 2860 }, { - "epoch": 8.02, - "grad_norm": 6.748260498046875, - "learning_rate": 4.21087160262418e-05, - "loss": 0.3235, + "epoch": 6.01, + "grad_norm": 8.259984016418457, + "learning_rate": 4.751655629139073e-05, + "loss": 0.4248, "step": 2870 }, { - "epoch": 8.02, - "grad_norm": 10.759824752807617, - "learning_rate": 4.2061855670103094e-05, - "loss": 0.285, + "epoch": 6.01, + "grad_norm": 12.430957794189453, + "learning_rate": 4.768211920529801e-05, + "loss": 0.2464, "step": 2880 }, { - "epoch": 8.02, - "grad_norm": 6.050924777984619, - "learning_rate": 4.2014995313964386e-05, - "loss": 0.3038, + "epoch": 6.01, + "grad_norm": 9.76378345489502, + "learning_rate": 4.78476821192053e-05, + "loss": 0.4417, "step": 2890 }, { - "epoch": 8.02, - "grad_norm": 3.649509906768799, - "learning_rate": 4.1968134957825685e-05, - "loss": 0.2936, + "epoch": 6.01, + "grad_norm": 10.517653465270996, + "learning_rate": 4.8013245033112584e-05, + "loss": 0.4046, "step": 2900 }, { - "epoch": 8.02, - "grad_norm": 6.332077980041504, - "learning_rate": 4.192127460168698e-05, - "loss": 0.3153, + "epoch": 6.01, + "grad_norm": 13.035500526428223, + "learning_rate": 4.8178807947019873e-05, + "loss": 0.3439, "step": 2910 }, { - "epoch": 8.02, - "grad_norm": 3.2886719703674316, - "learning_rate": 4.187441424554826e-05, - "loss": 0.2492, + "epoch": 6.01, + "grad_norm": 7.6101603507995605, + "learning_rate": 4.8344370860927156e-05, + "loss": 0.3431, "step": 2920 }, { - "epoch": 8.02, - "grad_norm": 11.733613014221191, - "learning_rate": 4.182755388940956e-05, - "loss": 0.2669, + "epoch": 6.01, + "grad_norm": 14.4600830078125, + "learning_rate": 4.850993377483444e-05, + "loss": 0.3835, "step": 2930 }, { - "epoch": 8.03, - "grad_norm": 3.867345094680786, - "learning_rate": 4.1780693533270855e-05, - "loss": 0.3001, + "epoch": 6.01, + "grad_norm": 19.38602066040039, + "learning_rate": 4.867549668874172e-05, + "loss": 0.3386, "step": 2940 }, { - "epoch": 8.03, - "grad_norm": 7.281511306762695, - "learning_rate": 4.173383317713215e-05, - "loss": 0.3297, - "step": 2950 + "epoch": 6.01, + "eval_accuracy": 0.661036036036036, + "eval_loss": 0.8876528143882751, + "eval_runtime": 43.379, + "eval_samples_per_second": 20.471, + "eval_steps_per_second": 1.706, + "step": 2940 }, { - "epoch": 8.03, - "grad_norm": 10.021977424621582, - "learning_rate": 4.168697282099344e-05, - "loss": 0.2637, - "step": 2960 + "epoch": 7.0, + "grad_norm": 11.12556266784668, + "learning_rate": 4.884105960264901e-05, + "loss": 0.3609, + "step": 2950 }, { - "epoch": 8.03, - "grad_norm": 7.1175971031188965, - "learning_rate": 4.164011246485474e-05, - "loss": 0.2796, - "step": 2970 + "epoch": 7.0, + "grad_norm": 9.327975273132324, + "learning_rate": 4.900662251655629e-05, + "loss": 0.1874, + "step": 2960 }, { - "epoch": 8.03, - "eval_accuracy": 0.7177650429799427, - "eval_loss": 0.7974393367767334, - "eval_runtime": 34.0885, - "eval_samples_per_second": 20.476, - "eval_steps_per_second": 1.731, + "epoch": 7.0, + "grad_norm": 2.0008111000061035, + "learning_rate": 4.917218543046358e-05, + "loss": 0.2868, "step": 2970 }, { - "epoch": 9.0, - "grad_norm": 7.273170471191406, - "learning_rate": 4.159325210871603e-05, - "loss": 0.2985, + "epoch": 7.0, + "grad_norm": 9.07735824584961, + "learning_rate": 4.9337748344370865e-05, + "loss": 0.3089, "step": 2980 }, { - "epoch": 9.0, - "grad_norm": 12.718881607055664, - "learning_rate": 4.154639175257732e-05, - "loss": 0.2799, + "epoch": 7.0, + "grad_norm": 3.0386064052581787, + "learning_rate": 4.950331125827815e-05, + "loss": 0.272, "step": 2990 }, { - "epoch": 9.0, - "grad_norm": 12.384184837341309, - "learning_rate": 4.1499531396438615e-05, - "loss": 0.2361, + "epoch": 7.0, + "grad_norm": 5.1751532554626465, + "learning_rate": 4.966887417218543e-05, + "loss": 0.3166, "step": 3000 }, { - "epoch": 9.0, - "grad_norm": 4.5740156173706055, - "learning_rate": 4.145267104029991e-05, - "loss": 0.1935, + "epoch": 7.0, + "grad_norm": 4.185882568359375, + "learning_rate": 4.983443708609272e-05, + "loss": 0.3783, "step": 3010 }, { - "epoch": 9.0, - "grad_norm": 4.967354774475098, - "learning_rate": 4.14058106841612e-05, - "loss": 0.2195, + "epoch": 7.0, + "grad_norm": 8.280274391174316, + "learning_rate": 5e-05, + "loss": 0.3463, "step": 3020 }, { - "epoch": 9.01, - "grad_norm": 2.65854811668396, - "learning_rate": 4.135895032802249e-05, - "loss": 0.1492, + "epoch": 7.0, + "grad_norm": 15.407179832458496, + "learning_rate": 4.99815987045488e-05, + "loss": 0.3766, "step": 3030 }, { - "epoch": 9.01, - "grad_norm": 22.392955780029297, - "learning_rate": 4.131208997188379e-05, - "loss": 0.3631, + "epoch": 7.0, + "grad_norm": 12.469446182250977, + "learning_rate": 4.99631974090976e-05, + "loss": 0.4571, "step": 3040 }, { - "epoch": 9.01, - "grad_norm": 8.278757095336914, - "learning_rate": 4.1265229615745084e-05, - "loss": 0.201, + "epoch": 7.0, + "grad_norm": 12.229589462280273, + "learning_rate": 4.99447961136464e-05, + "loss": 0.3185, "step": 3050 }, { - "epoch": 9.01, - "grad_norm": 10.572164535522461, - "learning_rate": 4.1218369259606376e-05, - "loss": 0.285, + "epoch": 7.0, + "grad_norm": 10.47951889038086, + "learning_rate": 4.99263948181952e-05, + "loss": 0.4482, "step": 3060 }, { - "epoch": 9.01, - "grad_norm": 7.999050617218018, - "learning_rate": 4.117150890346767e-05, - "loss": 0.202, + "epoch": 7.0, + "grad_norm": 3.1830942630767822, + "learning_rate": 4.9907993522744e-05, + "loss": 0.2548, "step": 3070 }, { - "epoch": 9.01, - "grad_norm": 0.6098045110702515, - "learning_rate": 4.112464854732897e-05, - "loss": 0.3136, + "epoch": 7.0, + "grad_norm": 11.870902061462402, + "learning_rate": 4.98895922272928e-05, + "loss": 0.2811, "step": 3080 }, { - "epoch": 9.01, - "grad_norm": 13.311070442199707, - "learning_rate": 4.107778819119025e-05, - "loss": 0.2286, + "epoch": 7.0, + "grad_norm": 12.750068664550781, + "learning_rate": 4.98711909318416e-05, + "loss": 0.2607, "step": 3090 }, { - "epoch": 9.01, - "grad_norm": 14.218550682067871, - "learning_rate": 4.1030927835051545e-05, - "loss": 0.1248, + "epoch": 7.01, + "grad_norm": 10.34190559387207, + "learning_rate": 4.985278963639041e-05, + "loss": 0.3298, "step": 3100 }, { - "epoch": 9.01, - "grad_norm": 19.886507034301758, - "learning_rate": 4.098406747891284e-05, - "loss": 0.2682, + "epoch": 7.01, + "grad_norm": 6.752932548522949, + "learning_rate": 4.9834388340939203e-05, + "loss": 0.3155, "step": 3110 }, { - "epoch": 9.01, - "grad_norm": 15.638948440551758, - "learning_rate": 4.093720712277414e-05, - "loss": 0.2751, + "epoch": 7.01, + "grad_norm": 11.394509315490723, + "learning_rate": 4.9815987045488004e-05, + "loss": 0.3237, "step": 3120 }, { - "epoch": 9.01, - "grad_norm": 16.060192108154297, - "learning_rate": 4.089034676663543e-05, - "loss": 0.1631, + "epoch": 7.01, + "grad_norm": 24.788921356201172, + "learning_rate": 4.9797585750036804e-05, + "loss": 0.3304, "step": 3130 }, { - "epoch": 9.01, - "grad_norm": 16.14665412902832, - "learning_rate": 4.084348641049672e-05, - "loss": 0.21, + "epoch": 7.01, + "grad_norm": 8.760883331298828, + "learning_rate": 4.9779184454585604e-05, + "loss": 0.3403, "step": 3140 }, { - "epoch": 9.02, - "grad_norm": 12.38498592376709, - "learning_rate": 4.079662605435802e-05, - "loss": 0.2409, + "epoch": 7.01, + "grad_norm": 11.140789031982422, + "learning_rate": 4.9760783159134404e-05, + "loss": 0.3818, "step": 3150 }, { - "epoch": 9.02, - "grad_norm": 21.296875, - "learning_rate": 4.074976569821931e-05, - "loss": 0.3526, + "epoch": 7.01, + "grad_norm": 3.8733274936676025, + "learning_rate": 4.9742381863683204e-05, + "loss": 0.3388, "step": 3160 }, { - "epoch": 9.02, - "grad_norm": 12.729432106018066, - "learning_rate": 4.07029053420806e-05, - "loss": 0.3236, + "epoch": 7.01, + "grad_norm": 5.976226806640625, + "learning_rate": 4.9723980568232004e-05, + "loss": 0.3663, "step": 3170 }, { - "epoch": 9.02, - "grad_norm": 5.113853931427002, - "learning_rate": 4.065604498594189e-05, - "loss": 0.116, + "epoch": 7.01, + "grad_norm": 6.937375068664551, + "learning_rate": 4.9705579272780804e-05, + "loss": 0.3696, "step": 3180 }, { - "epoch": 9.02, - "grad_norm": 15.853752136230469, - "learning_rate": 4.060918462980319e-05, - "loss": 0.3694, + "epoch": 7.01, + "grad_norm": 9.472258567810059, + "learning_rate": 4.9687177977329604e-05, + "loss": 0.3954, "step": 3190 }, { - "epoch": 9.02, - "grad_norm": 3.868764877319336, - "learning_rate": 4.056232427366448e-05, - "loss": 0.1406, + "epoch": 7.01, + "grad_norm": 10.504880905151367, + "learning_rate": 4.9668776681878404e-05, + "loss": 0.3662, "step": 3200 }, { - "epoch": 9.02, - "grad_norm": 3.905461072921753, - "learning_rate": 4.0515463917525774e-05, - "loss": 0.2263, + "epoch": 7.01, + "grad_norm": 2.160273313522339, + "learning_rate": 4.9650375386427205e-05, + "loss": 0.306, "step": 3210 }, { - "epoch": 9.02, - "grad_norm": 9.867708206176758, - "learning_rate": 4.0468603561387067e-05, - "loss": 0.241, + "epoch": 7.01, + "grad_norm": 10.063016891479492, + "learning_rate": 4.9631974090976005e-05, + "loss": 0.4123, "step": 3220 }, { - "epoch": 9.02, - "grad_norm": 7.449389457702637, - "learning_rate": 4.0421743205248366e-05, - "loss": 0.3362, + "epoch": 7.01, + "grad_norm": 8.959742546081543, + "learning_rate": 4.961357279552481e-05, + "loss": 0.4683, "step": 3230 }, { - "epoch": 9.02, - "grad_norm": 6.9868292808532715, - "learning_rate": 4.037488284910966e-05, - "loss": 0.2709, + "epoch": 7.01, + "grad_norm": 11.134523391723633, + "learning_rate": 4.9595171500073605e-05, + "loss": 0.3067, "step": 3240 }, { - "epoch": 9.02, - "grad_norm": 9.967850685119629, - "learning_rate": 4.0328022492970944e-05, - "loss": 0.3062, + "epoch": 7.01, + "grad_norm": 6.577158451080322, + "learning_rate": 4.9576770204622405e-05, + "loss": 0.3519, "step": 3250 }, { - "epoch": 9.02, - "grad_norm": 6.262727737426758, - "learning_rate": 4.028116213683224e-05, - "loss": 0.2394, + "epoch": 7.01, + "grad_norm": 6.515697479248047, + "learning_rate": 4.955836890917121e-05, + "loss": 0.3656, "step": 3260 }, { - "epoch": 9.03, - "grad_norm": 12.531270980834961, - "learning_rate": 4.0234301780693535e-05, - "loss": 0.3621, + "epoch": 7.01, + "grad_norm": 8.94619083404541, + "learning_rate": 4.9539967613720005e-05, + "loss": 0.3805, "step": 3270 }, { - "epoch": 9.03, - "grad_norm": 8.992213249206543, - "learning_rate": 4.018744142455483e-05, - "loss": 0.1948, + "epoch": 7.01, + "grad_norm": 6.963715076446533, + "learning_rate": 4.9521566318268805e-05, + "loss": 0.3076, "step": 3280 }, { - "epoch": 9.03, - "grad_norm": 6.379027843475342, - "learning_rate": 4.014058106841612e-05, - "loss": 0.1911, + "epoch": 7.01, + "grad_norm": 13.25515365600586, + "learning_rate": 4.950316502281761e-05, + "loss": 0.4114, "step": 3290 }, { - "epoch": 9.03, - "grad_norm": 23.431751251220703, - "learning_rate": 4.009372071227742e-05, - "loss": 0.236, - "step": 3300 - }, - { - "epoch": 9.03, - "eval_accuracy": 0.7722063037249284, - "eval_loss": 0.6850898265838623, - "eval_runtime": 33.99, - "eval_samples_per_second": 20.535, - "eval_steps_per_second": 1.736, + "epoch": 7.01, + "grad_norm": 7.593176364898682, + "learning_rate": 4.9484763727366406e-05, + "loss": 0.3189, "step": 3300 }, { - "epoch": 10.0, - "grad_norm": 14.146215438842773, - "learning_rate": 4.004686035613871e-05, - "loss": 0.2476, + "epoch": 7.01, + "grad_norm": 5.747020244598389, + "learning_rate": 4.9466362431915206e-05, + "loss": 0.2708, "step": 3310 }, { - "epoch": 10.0, - "grad_norm": 6.308194160461426, - "learning_rate": 4e-05, - "loss": 0.1314, + "epoch": 7.01, + "grad_norm": 9.585000038146973, + "learning_rate": 4.944796113646401e-05, + "loss": 0.3637, "step": 3320 }, { - "epoch": 10.0, - "grad_norm": 12.289512634277344, - "learning_rate": 3.9953139643861296e-05, - "loss": 0.1203, + "epoch": 7.01, + "grad_norm": 13.932437896728516, + "learning_rate": 4.9429559841012806e-05, + "loss": 0.3064, "step": 3330 }, { - "epoch": 10.0, - "grad_norm": 15.842235565185547, - "learning_rate": 3.990627928772259e-05, - "loss": 0.2797, + "epoch": 7.01, + "grad_norm": 12.580643653869629, + "learning_rate": 4.9411158545561606e-05, + "loss": 0.3294, "step": 3340 }, { - "epoch": 10.0, - "grad_norm": 1.5130534172058105, - "learning_rate": 3.985941893158388e-05, - "loss": 0.1283, + "epoch": 7.01, + "grad_norm": 7.36806583404541, + "learning_rate": 4.939275725011041e-05, + "loss": 0.2394, "step": 3350 }, { - "epoch": 10.01, - "grad_norm": 11.302055358886719, - "learning_rate": 3.981255857544517e-05, - "loss": 0.3701, + "epoch": 7.01, + "grad_norm": 34.076446533203125, + "learning_rate": 4.937435595465921e-05, + "loss": 0.3601, "step": 3360 }, { - "epoch": 10.01, - "grad_norm": 15.355897903442383, - "learning_rate": 3.976569821930647e-05, - "loss": 0.4371, + "epoch": 7.01, + "eval_accuracy": 0.6486486486486487, + "eval_loss": 0.8790870308876038, + "eval_runtime": 41.5269, + "eval_samples_per_second": 21.384, + "eval_steps_per_second": 1.782, + "step": 3360 + }, + { + "epoch": 8.0, + "grad_norm": 14.441483497619629, + "learning_rate": 4.9355954659208006e-05, + "loss": 0.2921, "step": 3370 }, { - "epoch": 10.01, - "grad_norm": 12.716096878051758, - "learning_rate": 3.9718837863167764e-05, - "loss": 0.2433, + "epoch": 8.0, + "grad_norm": 9.203411102294922, + "learning_rate": 4.933755336375681e-05, + "loss": 0.2638, "step": 3380 }, { - "epoch": 10.01, - "grad_norm": 3.5532474517822266, - "learning_rate": 3.9671977507029056e-05, - "loss": 0.0855, + "epoch": 8.0, + "grad_norm": 0.4057348668575287, + "learning_rate": 4.9319152068305613e-05, + "loss": 0.2193, "step": 3390 }, { - "epoch": 10.01, - "grad_norm": 1.783897876739502, - "learning_rate": 3.962511715089035e-05, - "loss": 0.1193, + "epoch": 8.0, + "grad_norm": 13.04542350769043, + "learning_rate": 4.930075077285441e-05, + "loss": 0.446, "step": 3400 }, { - "epoch": 10.01, - "grad_norm": 8.46875286102295, - "learning_rate": 3.957825679475165e-05, - "loss": 0.2303, + "epoch": 8.0, + "grad_norm": 9.762777328491211, + "learning_rate": 4.9282349477403214e-05, + "loss": 0.3489, "step": 3410 }, { - "epoch": 10.01, - "grad_norm": 16.5199031829834, - "learning_rate": 3.953139643861293e-05, - "loss": 0.254, + "epoch": 8.0, + "grad_norm": 8.025638580322266, + "learning_rate": 4.9263948181952014e-05, + "loss": 0.2376, "step": 3420 }, { - "epoch": 10.01, - "grad_norm": 0.3168381452560425, - "learning_rate": 3.9484536082474226e-05, - "loss": 0.1953, + "epoch": 8.0, + "grad_norm": 5.531264781951904, + "learning_rate": 4.924554688650081e-05, + "loss": 0.3041, "step": 3430 }, { - "epoch": 10.01, - "grad_norm": 16.903932571411133, - "learning_rate": 3.943767572633552e-05, - "loss": 0.35, + "epoch": 8.0, + "grad_norm": 7.828507900238037, + "learning_rate": 4.9227145591049614e-05, + "loss": 0.3604, "step": 3440 }, { - "epoch": 10.01, - "grad_norm": 8.773260116577148, - "learning_rate": 3.939081537019682e-05, - "loss": 0.2684, + "epoch": 8.0, + "grad_norm": 8.740315437316895, + "learning_rate": 4.9208744295598414e-05, + "loss": 0.3292, "step": 3450 }, { - "epoch": 10.01, - "grad_norm": 12.934499740600586, - "learning_rate": 3.934395501405811e-05, - "loss": 0.2493, + "epoch": 8.0, + "grad_norm": 12.079410552978516, + "learning_rate": 4.9190343000147214e-05, + "loss": 0.204, "step": 3460 }, { - "epoch": 10.01, - "grad_norm": 11.468120574951172, - "learning_rate": 3.92970946579194e-05, - "loss": 0.154, + "epoch": 8.0, + "grad_norm": 0.8909263610839844, + "learning_rate": 4.9171941704696014e-05, + "loss": 0.2667, "step": 3470 }, { - "epoch": 10.02, - "grad_norm": 0.31763386726379395, - "learning_rate": 3.92502343017807e-05, - "loss": 0.1635, + "epoch": 8.0, + "grad_norm": 15.108928680419922, + "learning_rate": 4.9153540409244814e-05, + "loss": 0.3396, "step": 3480 }, { - "epoch": 10.02, - "grad_norm": 0.5305169224739075, - "learning_rate": 3.920337394564199e-05, - "loss": 0.1666, + "epoch": 8.0, + "grad_norm": 13.09268569946289, + "learning_rate": 4.9135139113793615e-05, + "loss": 0.2719, "step": 3490 }, { - "epoch": 10.02, - "grad_norm": 23.478593826293945, - "learning_rate": 3.915651358950328e-05, - "loss": 0.1499, + "epoch": 8.0, + "grad_norm": 12.801443099975586, + "learning_rate": 4.9116737818342415e-05, + "loss": 0.2951, "step": 3500 }, { - "epoch": 10.02, - "grad_norm": 0.9787814021110535, - "learning_rate": 3.910965323336457e-05, - "loss": 0.3277, + "epoch": 8.0, + "grad_norm": 15.295145034790039, + "learning_rate": 4.9098336522891215e-05, + "loss": 0.2027, "step": 3510 }, { - "epoch": 10.02, - "grad_norm": 10.264555931091309, - "learning_rate": 3.906279287722587e-05, - "loss": 0.2312, + "epoch": 8.01, + "grad_norm": 10.882307052612305, + "learning_rate": 4.9079935227440015e-05, + "loss": 0.459, "step": 3520 }, { - "epoch": 10.02, - "grad_norm": 1.865168571472168, - "learning_rate": 3.901593252108716e-05, - "loss": 0.2381, + "epoch": 8.01, + "grad_norm": 12.717984199523926, + "learning_rate": 4.9061533931988815e-05, + "loss": 0.3292, "step": 3530 }, { - "epoch": 10.02, - "grad_norm": 13.4818754196167, - "learning_rate": 3.8969072164948455e-05, - "loss": 0.263, + "epoch": 8.01, + "grad_norm": 1.0784927606582642, + "learning_rate": 4.9043132636537615e-05, + "loss": 0.2172, "step": 3540 }, { - "epoch": 10.02, - "grad_norm": 11.58323860168457, - "learning_rate": 3.892221180880975e-05, - "loss": 0.2925, + "epoch": 8.01, + "grad_norm": 1.1845206022262573, + "learning_rate": 4.9024731341086415e-05, + "loss": 0.2034, "step": 3550 }, { - "epoch": 10.02, - "grad_norm": 6.375723838806152, - "learning_rate": 3.8875351452671046e-05, - "loss": 0.2063, + "epoch": 8.01, + "grad_norm": 15.95457935333252, + "learning_rate": 4.9006330045635215e-05, + "loss": 0.3306, "step": 3560 }, { - "epoch": 10.02, - "grad_norm": 6.340160846710205, - "learning_rate": 3.882849109653234e-05, - "loss": 0.179, + "epoch": 8.01, + "grad_norm": 11.536294937133789, + "learning_rate": 4.8987928750184016e-05, + "loss": 0.3716, "step": 3570 }, { - "epoch": 10.02, - "grad_norm": 1.2288185358047485, - "learning_rate": 3.8781630740393624e-05, - "loss": 0.1231, + "epoch": 8.01, + "grad_norm": 10.536565780639648, + "learning_rate": 4.8969527454732816e-05, + "loss": 0.2894, "step": 3580 }, { - "epoch": 10.02, - "grad_norm": 4.222806453704834, - "learning_rate": 3.873477038425492e-05, - "loss": 0.198, + "epoch": 8.01, + "grad_norm": 15.887523651123047, + "learning_rate": 4.8951126159281616e-05, + "loss": 0.324, "step": 3590 }, { - "epoch": 10.03, - "grad_norm": 12.188648223876953, - "learning_rate": 3.8687910028116215e-05, - "loss": 0.0827, + "epoch": 8.01, + "grad_norm": 8.051523208618164, + "learning_rate": 4.8932724863830416e-05, + "loss": 0.2115, "step": 3600 }, { - "epoch": 10.03, - "grad_norm": 16.154399871826172, - "learning_rate": 3.864104967197751e-05, - "loss": 0.2194, + "epoch": 8.01, + "grad_norm": 12.734569549560547, + "learning_rate": 4.8914323568379216e-05, + "loss": 0.261, "step": 3610 }, { - "epoch": 10.03, - "grad_norm": 25.385496139526367, - "learning_rate": 3.85941893158388e-05, - "loss": 0.2496, + "epoch": 8.01, + "grad_norm": 21.070165634155273, + "learning_rate": 4.8895922272928016e-05, + "loss": 0.26, "step": 3620 }, { - "epoch": 10.03, - "grad_norm": 0.44989219307899475, - "learning_rate": 3.85473289597001e-05, - "loss": 0.2066, - "step": 3630 - }, - { - "epoch": 10.03, - "eval_accuracy": 0.7707736389684814, - "eval_loss": 0.7625312805175781, - "eval_runtime": 34.4357, - "eval_samples_per_second": 20.27, - "eval_steps_per_second": 1.713, + "epoch": 8.01, + "grad_norm": 14.227327346801758, + "learning_rate": 4.8877520977476816e-05, + "loss": 0.37, "step": 3630 }, { - "epoch": 11.0, - "grad_norm": 15.097335815429688, - "learning_rate": 3.850046860356139e-05, - "loss": 0.1422, + "epoch": 8.01, + "grad_norm": 0.5862560868263245, + "learning_rate": 4.8859119682025616e-05, + "loss": 0.2208, "step": 3640 }, { - "epoch": 11.0, - "grad_norm": 8.706366539001465, - "learning_rate": 3.8453608247422684e-05, - "loss": 0.1621, + "epoch": 8.01, + "grad_norm": 7.712880611419678, + "learning_rate": 4.8840718386574416e-05, + "loss": 0.3196, "step": 3650 }, { - "epoch": 11.0, - "grad_norm": 6.461037635803223, - "learning_rate": 3.8406747891283976e-05, - "loss": 0.2252, + "epoch": 8.01, + "grad_norm": 8.337738037109375, + "learning_rate": 4.8822317091123217e-05, + "loss": 0.3664, "step": 3660 }, { - "epoch": 11.0, - "grad_norm": 18.602127075195312, - "learning_rate": 3.835988753514527e-05, - "loss": 0.3387, + "epoch": 8.01, + "grad_norm": 8.144108772277832, + "learning_rate": 4.880391579567202e-05, + "loss": 0.395, "step": 3670 }, { - "epoch": 11.0, - "grad_norm": 13.997271537780762, - "learning_rate": 3.831302717900656e-05, - "loss": 0.2751, + "epoch": 8.01, + "grad_norm": 5.335818290710449, + "learning_rate": 4.878551450022082e-05, + "loss": 0.208, "step": 3680 }, { - "epoch": 11.01, - "grad_norm": 6.304378986358643, - "learning_rate": 3.826616682286785e-05, - "loss": 0.0713, + "epoch": 8.01, + "grad_norm": 1.657988429069519, + "learning_rate": 4.876711320476962e-05, + "loss": 0.2321, "step": 3690 }, { - "epoch": 11.01, - "grad_norm": 14.0118408203125, - "learning_rate": 3.821930646672915e-05, - "loss": 0.1648, + "epoch": 8.01, + "grad_norm": 11.252223014831543, + "learning_rate": 4.874871190931842e-05, + "loss": 0.3122, "step": 3700 }, { - "epoch": 11.01, - "grad_norm": 22.292020797729492, - "learning_rate": 3.8172446110590444e-05, - "loss": 0.187, + "epoch": 8.01, + "grad_norm": 15.258499145507812, + "learning_rate": 4.873031061386722e-05, + "loss": 0.4487, "step": 3710 }, { - "epoch": 11.01, - "grad_norm": 3.986797332763672, - "learning_rate": 3.8125585754451737e-05, - "loss": 0.244, + "epoch": 8.01, + "grad_norm": 13.362289428710938, + "learning_rate": 4.8711909318416024e-05, + "loss": 0.4194, "step": 3720 }, { - "epoch": 11.01, - "grad_norm": 14.314979553222656, - "learning_rate": 3.807872539831303e-05, - "loss": 0.2893, + "epoch": 8.01, + "grad_norm": 10.974387168884277, + "learning_rate": 4.869350802296482e-05, + "loss": 0.3243, "step": 3730 }, { - "epoch": 11.01, - "grad_norm": 5.869822978973389, - "learning_rate": 3.803186504217433e-05, - "loss": 0.2231, + "epoch": 8.01, + "grad_norm": 7.41167688369751, + "learning_rate": 4.867510672751362e-05, + "loss": 0.3533, "step": 3740 }, { - "epoch": 11.01, - "grad_norm": 2.076101303100586, - "learning_rate": 3.7985004686035613e-05, - "loss": 0.2073, + "epoch": 8.01, + "grad_norm": 5.849924087524414, + "learning_rate": 4.8656705432062424e-05, + "loss": 0.3544, "step": 3750 }, { - "epoch": 11.01, - "grad_norm": 0.970169186592102, - "learning_rate": 3.7938144329896906e-05, - "loss": 0.0698, + "epoch": 8.01, + "grad_norm": 11.34356689453125, + "learning_rate": 4.863830413661122e-05, + "loss": 0.3534, "step": 3760 }, { - "epoch": 11.01, - "grad_norm": 6.208909034729004, - "learning_rate": 3.7891283973758205e-05, - "loss": 0.1289, + "epoch": 8.01, + "grad_norm": 16.076383590698242, + "learning_rate": 4.861990284116002e-05, + "loss": 0.2324, "step": 3770 }, { - "epoch": 11.01, - "grad_norm": 17.37769317626953, - "learning_rate": 3.78444236176195e-05, - "loss": 0.1264, + "epoch": 8.01, + "grad_norm": 7.834190845489502, + "learning_rate": 4.8601501545708825e-05, + "loss": 0.3401, "step": 3780 }, { - "epoch": 11.01, - "grad_norm": 4.121638298034668, - "learning_rate": 3.779756326148079e-05, - "loss": 0.2226, + "epoch": 8.01, + "eval_accuracy": 0.6632882882882883, + "eval_loss": 0.740277111530304, + "eval_runtime": 42.3365, + "eval_samples_per_second": 20.975, + "eval_steps_per_second": 1.748, + "step": 3780 + }, + { + "epoch": 9.0, + "grad_norm": 10.141741752624512, + "learning_rate": 4.858310025025762e-05, + "loss": 0.2218, "step": 3790 }, { - "epoch": 11.01, - "grad_norm": 0.10288643091917038, - "learning_rate": 3.775070290534208e-05, - "loss": 0.2066, + "epoch": 9.0, + "grad_norm": 14.957062721252441, + "learning_rate": 4.856469895480642e-05, + "loss": 0.2532, "step": 3800 }, { - "epoch": 11.02, - "grad_norm": 7.0988240242004395, - "learning_rate": 3.770384254920338e-05, - "loss": 0.2174, + "epoch": 9.0, + "grad_norm": 6.903181552886963, + "learning_rate": 4.8546297659355225e-05, + "loss": 0.2675, "step": 3810 }, { - "epoch": 11.02, - "grad_norm": 10.114404678344727, - "learning_rate": 3.765698219306467e-05, - "loss": 0.3379, + "epoch": 9.0, + "grad_norm": 8.146132469177246, + "learning_rate": 4.852789636390402e-05, + "loss": 0.278, "step": 3820 }, { - "epoch": 11.02, - "grad_norm": 4.689489364624023, - "learning_rate": 3.761012183692596e-05, - "loss": 0.1015, + "epoch": 9.0, + "grad_norm": 16.355253219604492, + "learning_rate": 4.850949506845282e-05, + "loss": 0.2215, "step": 3830 }, { - "epoch": 11.02, - "grad_norm": 16.416460037231445, - "learning_rate": 3.756326148078725e-05, - "loss": 0.1955, + "epoch": 9.0, + "grad_norm": 4.766119003295898, + "learning_rate": 4.8491093773001625e-05, + "loss": 0.2977, "step": 3840 }, { - "epoch": 11.02, - "grad_norm": 23.091764450073242, - "learning_rate": 3.751640112464855e-05, - "loss": 0.3026, + "epoch": 9.0, + "grad_norm": 17.544775009155273, + "learning_rate": 4.8472692477550426e-05, + "loss": 0.2754, "step": 3850 }, { - "epoch": 11.02, - "grad_norm": 8.67167854309082, - "learning_rate": 3.746954076850984e-05, - "loss": 0.197, + "epoch": 9.0, + "grad_norm": 3.9578094482421875, + "learning_rate": 4.845429118209922e-05, + "loss": 0.2533, "step": 3860 }, { - "epoch": 11.02, - "grad_norm": 6.001295566558838, - "learning_rate": 3.7422680412371135e-05, - "loss": 0.2166, + "epoch": 9.0, + "grad_norm": 5.8706231117248535, + "learning_rate": 4.8435889886648026e-05, + "loss": 0.1435, "step": 3870 }, { - "epoch": 11.02, - "grad_norm": 7.23835563659668, - "learning_rate": 3.737582005623243e-05, - "loss": 0.3391, + "epoch": 9.0, + "grad_norm": 5.420246124267578, + "learning_rate": 4.8417488591196826e-05, + "loss": 0.2288, "step": 3880 }, { - "epoch": 11.02, - "grad_norm": 5.40344762802124, - "learning_rate": 3.7328959700093726e-05, - "loss": 0.1672, + "epoch": 9.0, + "grad_norm": 23.27185821533203, + "learning_rate": 4.839908729574562e-05, + "loss": 0.4029, "step": 3890 }, { - "epoch": 11.02, - "grad_norm": 6.540796756744385, - "learning_rate": 3.728209934395502e-05, - "loss": 0.0776, + "epoch": 9.0, + "grad_norm": 9.662599563598633, + "learning_rate": 4.8380686000294426e-05, + "loss": 0.4065, "step": 3900 }, { - "epoch": 11.02, - "grad_norm": 0.964383065700531, - "learning_rate": 3.7235238987816304e-05, - "loss": 0.1802, + "epoch": 9.0, + "grad_norm": 7.802966117858887, + "learning_rate": 4.8362284704843226e-05, + "loss": 0.2416, "step": 3910 }, { - "epoch": 11.02, - "grad_norm": 0.4158385396003723, - "learning_rate": 3.71883786316776e-05, - "loss": 0.0695, + "epoch": 9.0, + "grad_norm": 10.698823928833008, + "learning_rate": 4.834388340939202e-05, + "loss": 0.2022, "step": 3920 }, { - "epoch": 11.03, - "grad_norm": 17.69159698486328, - "learning_rate": 3.7141518275538895e-05, - "loss": 0.2484, + "epoch": 9.0, + "grad_norm": 7.557618141174316, + "learning_rate": 4.8325482113940826e-05, + "loss": 0.3592, "step": 3930 }, { - "epoch": 11.03, - "grad_norm": 13.761899948120117, - "learning_rate": 3.709465791940019e-05, - "loss": 0.2431, + "epoch": 9.01, + "grad_norm": 14.383673667907715, + "learning_rate": 4.8307080818489627e-05, + "loss": 0.1569, "step": 3940 }, { - "epoch": 11.03, - "grad_norm": 0.3444725275039673, - "learning_rate": 3.704779756326148e-05, - "loss": 0.2122, + "epoch": 9.01, + "grad_norm": 14.874628067016602, + "learning_rate": 4.828867952303842e-05, + "loss": 0.3816, "step": 3950 }, { - "epoch": 11.03, - "grad_norm": 3.0498321056365967, - "learning_rate": 3.700093720712278e-05, - "loss": 0.2831, - "step": 3960 - }, - { - "epoch": 11.03, - "eval_accuracy": 0.7707736389684814, - "eval_loss": 0.79632568359375, - "eval_runtime": 33.5427, - "eval_samples_per_second": 20.809, - "eval_steps_per_second": 1.759, + "epoch": 9.01, + "grad_norm": 8.525997161865234, + "learning_rate": 4.827027822758722e-05, + "loss": 0.2609, "step": 3960 }, { - "epoch": 12.0, - "grad_norm": 0.13173483312129974, - "learning_rate": 3.695407685098407e-05, - "loss": 0.0471, + "epoch": 9.01, + "grad_norm": 11.662755966186523, + "learning_rate": 4.825187693213603e-05, + "loss": 0.2357, "step": 3970 }, { - "epoch": 12.0, - "grad_norm": 18.985239028930664, - "learning_rate": 3.6907216494845364e-05, - "loss": 0.0531, + "epoch": 9.01, + "grad_norm": 13.592899322509766, + "learning_rate": 4.823347563668483e-05, + "loss": 0.2936, "step": 3980 }, { - "epoch": 12.0, - "grad_norm": 9.530562400817871, - "learning_rate": 3.6860356138706656e-05, - "loss": 0.2334, + "epoch": 9.01, + "grad_norm": 7.432744026184082, + "learning_rate": 4.821507434123362e-05, + "loss": 0.2854, "step": 3990 }, { - "epoch": 12.0, - "grad_norm": 4.545018672943115, - "learning_rate": 3.681349578256795e-05, - "loss": 0.0164, + "epoch": 9.01, + "grad_norm": 12.174053192138672, + "learning_rate": 4.819667304578243e-05, + "loss": 0.3413, "step": 4000 }, { - "epoch": 12.0, - "grad_norm": 11.210469245910645, - "learning_rate": 3.676663542642924e-05, - "loss": 0.2759, + "epoch": 9.01, + "grad_norm": 20.847490310668945, + "learning_rate": 4.817827175033123e-05, + "loss": 0.1914, "step": 4010 }, { - "epoch": 12.01, - "grad_norm": 15.611169815063477, - "learning_rate": 3.671977507029053e-05, - "loss": 0.1851, + "epoch": 9.01, + "grad_norm": 19.58496856689453, + "learning_rate": 4.815987045488002e-05, + "loss": 0.2485, "step": 4020 }, { - "epoch": 12.01, - "grad_norm": 12.470063209533691, - "learning_rate": 3.667291471415183e-05, - "loss": 0.2275, + "epoch": 9.01, + "grad_norm": 11.753061294555664, + "learning_rate": 4.814146915942883e-05, + "loss": 0.4117, "step": 4030 }, { - "epoch": 12.01, - "grad_norm": 9.493141174316406, - "learning_rate": 3.6626054358013124e-05, - "loss": 0.1243, + "epoch": 9.01, + "grad_norm": 7.240084648132324, + "learning_rate": 4.812306786397763e-05, + "loss": 0.2516, "step": 4040 }, { - "epoch": 12.01, - "grad_norm": 0.25132325291633606, - "learning_rate": 3.657919400187442e-05, - "loss": 0.0885, + "epoch": 9.01, + "grad_norm": 8.860321044921875, + "learning_rate": 4.810466656852642e-05, + "loss": 0.3589, "step": 4050 }, { - "epoch": 12.01, - "grad_norm": 28.46393394470215, - "learning_rate": 3.653233364573571e-05, - "loss": 0.261, + "epoch": 9.01, + "grad_norm": 9.940979957580566, + "learning_rate": 4.808626527307523e-05, + "loss": 0.2426, "step": 4060 }, { - "epoch": 12.01, - "grad_norm": 21.371795654296875, - "learning_rate": 3.648547328959701e-05, - "loss": 0.1305, + "epoch": 9.01, + "grad_norm": 5.785098552703857, + "learning_rate": 4.806786397762403e-05, + "loss": 0.2779, "step": 4070 }, { - "epoch": 12.01, - "grad_norm": 0.9401485323905945, - "learning_rate": 3.6438612933458294e-05, - "loss": 0.1798, + "epoch": 9.01, + "grad_norm": 13.360100746154785, + "learning_rate": 4.804946268217283e-05, + "loss": 0.18, "step": 4080 }, { - "epoch": 12.01, - "grad_norm": 23.651334762573242, - "learning_rate": 3.6391752577319586e-05, - "loss": 0.1997, + "epoch": 9.01, + "grad_norm": 13.76761531829834, + "learning_rate": 4.803106138672163e-05, + "loss": 0.2789, "step": 4090 }, { - "epoch": 12.01, - "grad_norm": 0.2696411609649658, - "learning_rate": 3.6344892221180885e-05, - "loss": 0.1279, + "epoch": 9.01, + "grad_norm": 20.905683517456055, + "learning_rate": 4.801266009127043e-05, + "loss": 0.2769, "step": 4100 }, { - "epoch": 12.01, - "grad_norm": 26.360898971557617, - "learning_rate": 3.629803186504218e-05, - "loss": 0.2128, + "epoch": 9.01, + "grad_norm": 11.389440536499023, + "learning_rate": 4.799425879581923e-05, + "loss": 0.1383, "step": 4110 }, { - "epoch": 12.01, - "grad_norm": 5.624836444854736, - "learning_rate": 3.625117150890347e-05, - "loss": 0.2319, + "epoch": 9.01, + "grad_norm": 22.627151489257812, + "learning_rate": 4.797585750036803e-05, + "loss": 0.3232, "step": 4120 }, { - "epoch": 12.01, - "grad_norm": 9.12028980255127, - "learning_rate": 3.620431115276476e-05, - "loss": 0.1358, + "epoch": 9.01, + "grad_norm": 1.108852505683899, + "learning_rate": 4.795745620491683e-05, + "loss": 0.2509, "step": 4130 }, { - "epoch": 12.02, - "grad_norm": 11.657383918762207, - "learning_rate": 3.615745079662606e-05, - "loss": 0.0597, + "epoch": 9.01, + "grad_norm": 11.086101531982422, + "learning_rate": 4.793905490946563e-05, + "loss": 0.2939, "step": 4140 }, { - "epoch": 12.02, - "grad_norm": 9.352867126464844, - "learning_rate": 3.6110590440487353e-05, - "loss": 0.1819, + "epoch": 9.01, + "grad_norm": 21.736812591552734, + "learning_rate": 4.792065361401443e-05, + "loss": 0.3108, "step": 4150 }, { - "epoch": 12.02, - "grad_norm": 33.10483932495117, - "learning_rate": 3.606373008434864e-05, - "loss": 0.2577, + "epoch": 9.01, + "grad_norm": 12.673864364624023, + "learning_rate": 4.790225231856323e-05, + "loss": 0.2923, "step": 4160 }, { - "epoch": 12.02, - "grad_norm": 7.759758472442627, - "learning_rate": 3.601686972820993e-05, - "loss": 0.1209, + "epoch": 9.01, + "grad_norm": 23.429868698120117, + "learning_rate": 4.788385102311203e-05, + "loss": 0.3268, "step": 4170 }, { - "epoch": 12.02, - "grad_norm": 1.226152777671814, - "learning_rate": 3.597000937207123e-05, - "loss": 0.2366, + "epoch": 9.01, + "grad_norm": 20.748750686645508, + "learning_rate": 4.786544972766083e-05, + "loss": 0.3612, "step": 4180 }, { - "epoch": 12.02, - "grad_norm": 4.4371185302734375, - "learning_rate": 3.592314901593252e-05, - "loss": 0.1248, + "epoch": 9.01, + "grad_norm": 9.398347854614258, + "learning_rate": 4.784704843220963e-05, + "loss": 0.3637, "step": 4190 }, { - "epoch": 12.02, - "grad_norm": 29.921714782714844, - "learning_rate": 3.5876288659793815e-05, - "loss": 0.3106, + "epoch": 9.01, + "grad_norm": 7.497073650360107, + "learning_rate": 4.782864713675843e-05, + "loss": 0.3113, + "step": 4200 + }, + { + "epoch": 9.01, + "eval_accuracy": 0.6959459459459459, + "eval_loss": 0.7315611839294434, + "eval_runtime": 41.8788, + "eval_samples_per_second": 21.204, + "eval_steps_per_second": 1.767, "step": 4200 }, { - "epoch": 12.02, - "grad_norm": 12.740315437316895, - "learning_rate": 3.582942830365511e-05, - "loss": 0.1804, + "epoch": 10.0, + "grad_norm": 2.589111566543579, + "learning_rate": 4.781024584130723e-05, + "loss": 0.3071, "step": 4210 }, { - "epoch": 12.02, - "grad_norm": 23.39232635498047, - "learning_rate": 3.5782567947516406e-05, - "loss": 0.134, + "epoch": 10.0, + "grad_norm": 2.229191541671753, + "learning_rate": 4.779184454585603e-05, + "loss": 0.1844, "step": 4220 }, { - "epoch": 12.02, - "grad_norm": 5.175128936767578, - "learning_rate": 3.57357075913777e-05, - "loss": 0.2728, + "epoch": 10.0, + "grad_norm": 5.470048904418945, + "learning_rate": 4.777344325040483e-05, + "loss": 0.3186, "step": 4230 }, { - "epoch": 12.02, - "grad_norm": 18.326560974121094, - "learning_rate": 3.5688847235238984e-05, - "loss": 0.1816, + "epoch": 10.0, + "grad_norm": 2.975252389907837, + "learning_rate": 4.775504195495363e-05, + "loss": 0.2843, "step": 4240 }, { - "epoch": 12.02, - "grad_norm": 0.42216312885284424, - "learning_rate": 3.5641986879100283e-05, - "loss": 0.0879, + "epoch": 10.0, + "grad_norm": 27.58317756652832, + "learning_rate": 4.773664065950243e-05, + "loss": 0.2342, "step": 4250 }, { - "epoch": 12.03, - "grad_norm": 26.220693588256836, - "learning_rate": 3.5595126522961576e-05, - "loss": 0.094, + "epoch": 10.0, + "grad_norm": 10.897406578063965, + "learning_rate": 4.771823936405123e-05, + "loss": 0.3133, "step": 4260 }, { - "epoch": 12.03, - "grad_norm": 2.731444835662842, - "learning_rate": 3.554826616682287e-05, - "loss": 0.0832, + "epoch": 10.0, + "grad_norm": 18.671857833862305, + "learning_rate": 4.769983806860003e-05, + "loss": 0.247, "step": 4270 }, { - "epoch": 12.03, - "grad_norm": 0.9680123925209045, - "learning_rate": 3.550140581068416e-05, - "loss": 0.0811, + "epoch": 10.0, + "grad_norm": 3.1605987548828125, + "learning_rate": 4.768143677314883e-05, + "loss": 0.2691, "step": 4280 }, { - "epoch": 12.03, - "grad_norm": 69.98845672607422, - "learning_rate": 3.545454545454546e-05, - "loss": 0.1903, - "step": 4290 - }, - { - "epoch": 12.03, - "eval_accuracy": 0.7722063037249284, - "eval_loss": 1.034330129623413, - "eval_runtime": 34.5588, - "eval_samples_per_second": 20.197, - "eval_steps_per_second": 1.707, + "epoch": 10.0, + "grad_norm": 15.147821426391602, + "learning_rate": 4.766303547769763e-05, + "loss": 0.1998, "step": 4290 }, { - "epoch": 13.0, - "grad_norm": 25.59967041015625, - "learning_rate": 3.540768509840675e-05, - "loss": 0.0995, + "epoch": 10.0, + "grad_norm": 9.628533363342285, + "learning_rate": 4.764463418224643e-05, + "loss": 0.4104, "step": 4300 }, { - "epoch": 13.0, - "grad_norm": 24.92293357849121, - "learning_rate": 3.5360824742268044e-05, - "loss": 0.3308, + "epoch": 10.0, + "grad_norm": 3.3812103271484375, + "learning_rate": 4.762623288679523e-05, + "loss": 0.3426, "step": 4310 }, { - "epoch": 13.0, - "grad_norm": 30.706470489501953, - "learning_rate": 3.5313964386129336e-05, - "loss": 0.1114, + "epoch": 10.0, + "grad_norm": 13.017431259155273, + "learning_rate": 4.760783159134403e-05, + "loss": 0.2946, "step": 4320 }, { - "epoch": 13.0, - "grad_norm": 1.1878864765167236, - "learning_rate": 3.526710402999063e-05, - "loss": 0.0588, + "epoch": 10.0, + "grad_norm": 12.399896621704102, + "learning_rate": 4.758943029589283e-05, + "loss": 0.1878, "step": 4330 }, { - "epoch": 13.0, - "grad_norm": 31.14521026611328, - "learning_rate": 3.522024367385192e-05, - "loss": 0.1238, + "epoch": 10.0, + "grad_norm": 21.525562286376953, + "learning_rate": 4.757102900044164e-05, + "loss": 0.2973, "step": 4340 }, { - "epoch": 13.01, - "grad_norm": 5.118488311767578, - "learning_rate": 3.517338331771321e-05, - "loss": 0.1793, + "epoch": 10.0, + "grad_norm": 3.3385937213897705, + "learning_rate": 4.755262770499043e-05, + "loss": 0.1394, "step": 4350 }, { - "epoch": 13.01, - "grad_norm": 30.64320182800293, - "learning_rate": 3.512652296157451e-05, - "loss": 0.1733, + "epoch": 10.01, + "grad_norm": 1.1139613389968872, + "learning_rate": 4.753422640953923e-05, + "loss": 0.22, "step": 4360 }, { - "epoch": 13.01, - "grad_norm": 0.029796045273542404, - "learning_rate": 3.5079662605435805e-05, - "loss": 0.2219, + "epoch": 10.01, + "grad_norm": 11.690268516540527, + "learning_rate": 4.751582511408804e-05, + "loss": 0.2399, "step": 4370 }, { - "epoch": 13.01, - "grad_norm": 12.55972671508789, - "learning_rate": 3.50328022492971e-05, - "loss": 0.1043, + "epoch": 10.01, + "grad_norm": 17.56683921813965, + "learning_rate": 4.749742381863683e-05, + "loss": 0.4817, "step": 4380 }, { - "epoch": 13.01, - "grad_norm": 6.064866065979004, - "learning_rate": 3.498594189315839e-05, - "loss": 0.1679, + "epoch": 10.01, + "grad_norm": 6.885412693023682, + "learning_rate": 4.747902252318563e-05, + "loss": 0.3016, "step": 4390 }, { - "epoch": 13.01, - "grad_norm": 0.1219923123717308, - "learning_rate": 3.493908153701969e-05, - "loss": 0.1371, + "epoch": 10.01, + "grad_norm": 16.683002471923828, + "learning_rate": 4.746062122773444e-05, + "loss": 0.2179, "step": 4400 }, { - "epoch": 13.01, - "grad_norm": 7.214325904846191, - "learning_rate": 3.4892221180880974e-05, - "loss": 0.05, + "epoch": 10.01, + "grad_norm": 21.049219131469727, + "learning_rate": 4.744221993228323e-05, + "loss": 0.2264, "step": 4410 }, { - "epoch": 13.01, - "grad_norm": 54.42354965209961, - "learning_rate": 3.4845360824742266e-05, - "loss": 0.1519, + "epoch": 10.01, + "grad_norm": 0.32361775636672974, + "learning_rate": 4.742381863683203e-05, + "loss": 0.2258, "step": 4420 }, { - "epoch": 13.01, - "grad_norm": 0.27510225772857666, - "learning_rate": 3.4798500468603565e-05, - "loss": 0.1608, + "epoch": 10.01, + "grad_norm": 0.522939920425415, + "learning_rate": 4.740541734138084e-05, + "loss": 0.4103, "step": 4430 }, { - "epoch": 13.01, - "grad_norm": 0.35241690278053284, - "learning_rate": 3.475164011246486e-05, - "loss": 0.1467, + "epoch": 10.01, + "grad_norm": 5.143355846405029, + "learning_rate": 4.738701604592963e-05, + "loss": 0.2144, "step": 4440 }, { - "epoch": 13.01, - "grad_norm": 17.937379837036133, - "learning_rate": 3.470477975632615e-05, - "loss": 0.1619, + "epoch": 10.01, + "grad_norm": 12.447662353515625, + "learning_rate": 4.736861475047843e-05, + "loss": 0.3608, "step": 4450 }, { - "epoch": 13.01, - "grad_norm": 13.159626960754395, - "learning_rate": 3.465791940018744e-05, - "loss": 0.1981, + "epoch": 10.01, + "grad_norm": 10.097562789916992, + "learning_rate": 4.735021345502724e-05, + "loss": 0.2905, "step": 4460 }, { - "epoch": 13.02, - "grad_norm": 19.368940353393555, - "learning_rate": 3.461105904404874e-05, - "loss": 0.3174, + "epoch": 10.01, + "grad_norm": 7.1078362464904785, + "learning_rate": 4.733181215957604e-05, + "loss": 0.2133, "step": 4470 }, { - "epoch": 13.02, - "grad_norm": 0.7653603553771973, - "learning_rate": 3.4564198687910034e-05, - "loss": 0.2421, + "epoch": 10.01, + "grad_norm": 11.503559112548828, + "learning_rate": 4.731341086412483e-05, + "loss": 0.1682, "step": 4480 }, { - "epoch": 13.02, - "grad_norm": 2.392252206802368, - "learning_rate": 3.451733833177132e-05, - "loss": 0.248, + "epoch": 10.01, + "grad_norm": 17.54301643371582, + "learning_rate": 4.729500956867364e-05, + "loss": 0.2728, "step": 4490 }, { - "epoch": 13.02, - "grad_norm": 1.3327248096466064, - "learning_rate": 3.447047797563261e-05, - "loss": 0.0903, + "epoch": 10.01, + "grad_norm": 0.5605434775352478, + "learning_rate": 4.727660827322244e-05, + "loss": 0.2068, "step": 4500 }, { - "epoch": 13.02, - "grad_norm": 8.378661155700684, - "learning_rate": 3.442361761949391e-05, - "loss": 0.1196, + "epoch": 10.01, + "grad_norm": 29.69890022277832, + "learning_rate": 4.725820697777123e-05, + "loss": 0.2811, "step": 4510 }, { - "epoch": 13.02, - "grad_norm": 17.592124938964844, - "learning_rate": 3.43767572633552e-05, - "loss": 0.1543, + "epoch": 10.01, + "grad_norm": 2.86535906791687, + "learning_rate": 4.723980568232004e-05, + "loss": 0.1564, "step": 4520 }, { - "epoch": 13.02, - "grad_norm": 22.954425811767578, - "learning_rate": 3.4329896907216495e-05, - "loss": 0.1726, + "epoch": 10.01, + "grad_norm": 16.985363006591797, + "learning_rate": 4.722140438686884e-05, + "loss": 0.3927, "step": 4530 }, { - "epoch": 13.02, - "grad_norm": 11.798051834106445, - "learning_rate": 3.428303655107779e-05, - "loss": 0.3255, + "epoch": 10.01, + "grad_norm": 11.91352367401123, + "learning_rate": 4.7203003091417633e-05, + "loss": 0.232, "step": 4540 }, { - "epoch": 13.02, - "grad_norm": 20.77471923828125, - "learning_rate": 3.423617619493909e-05, - "loss": 0.2043, + "epoch": 10.01, + "grad_norm": 4.534491539001465, + "learning_rate": 4.718460179596644e-05, + "loss": 0.2632, "step": 4550 }, { - "epoch": 13.02, - "grad_norm": 5.149378299713135, - "learning_rate": 3.418931583880038e-05, - "loss": 0.0589, + "epoch": 10.01, + "grad_norm": 5.70693302154541, + "learning_rate": 4.716620050051524e-05, + "loss": 0.309, "step": 4560 }, { - "epoch": 13.02, - "grad_norm": 8.951346397399902, - "learning_rate": 3.4142455482661665e-05, - "loss": 0.207, + "epoch": 10.01, + "grad_norm": 9.484837532043457, + "learning_rate": 4.7147799205064034e-05, + "loss": 0.2051, "step": 4570 }, { - "epoch": 13.02, - "grad_norm": 0.9169898629188538, - "learning_rate": 3.4095595126522964e-05, - "loss": 0.1265, + "epoch": 10.01, + "grad_norm": 2.0406031608581543, + "learning_rate": 4.712939790961284e-05, + "loss": 0.2524, "step": 4580 }, { - "epoch": 13.03, - "grad_norm": 13.90592098236084, - "learning_rate": 3.4048734770384256e-05, - "loss": 0.2189, + "epoch": 10.01, + "grad_norm": 11.688041687011719, + "learning_rate": 4.711099661416164e-05, + "loss": 0.408, "step": 4590 }, { - "epoch": 13.03, - "grad_norm": 23.327816009521484, - "learning_rate": 3.400187441424555e-05, - "loss": 0.184, + "epoch": 10.01, + "grad_norm": 6.8777899742126465, + "learning_rate": 4.709259531871044e-05, + "loss": 0.2687, "step": 4600 }, { - "epoch": 13.03, - "grad_norm": 14.074636459350586, - "learning_rate": 3.395501405810684e-05, - "loss": 0.1103, + "epoch": 10.01, + "grad_norm": 13.090034484863281, + "learning_rate": 4.707419402325924e-05, + "loss": 0.1895, "step": 4610 }, { - "epoch": 13.03, - "grad_norm": 9.187418937683105, - "learning_rate": 3.390815370196814e-05, - "loss": 0.1169, + "epoch": 10.01, + "grad_norm": 1.0532723665237427, + "learning_rate": 4.705579272780804e-05, + "loss": 0.2096, "step": 4620 }, { - "epoch": 13.03, - "eval_accuracy": 0.7865329512893983, - "eval_loss": 0.8527703285217285, - "eval_runtime": 34.2781, - "eval_samples_per_second": 20.363, - "eval_steps_per_second": 1.721, + "epoch": 10.01, + "eval_accuracy": 0.6981981981981982, + "eval_loss": 0.9519428610801697, + "eval_runtime": 42.0352, + "eval_samples_per_second": 21.125, + "eval_steps_per_second": 1.76, "step": 4620 }, { - "epoch": 14.0, - "grad_norm": 11.900031089782715, - "learning_rate": 3.386129334582943e-05, - "loss": 0.074, + "epoch": 11.0, + "grad_norm": 10.391329765319824, + "learning_rate": 4.703739143235684e-05, + "loss": 0.2347, "step": 4630 }, { - "epoch": 14.0, - "grad_norm": 11.011795043945312, - "learning_rate": 3.3814432989690724e-05, - "loss": 0.126, + "epoch": 11.0, + "grad_norm": 10.35151481628418, + "learning_rate": 4.701899013690564e-05, + "loss": 0.3039, "step": 4640 }, { - "epoch": 14.0, - "grad_norm": 23.668609619140625, - "learning_rate": 3.376757263355202e-05, - "loss": 0.0747, + "epoch": 11.0, + "grad_norm": 15.39012622833252, + "learning_rate": 4.700058884145444e-05, + "loss": 0.1841, "step": 4650 }, { - "epoch": 14.0, - "grad_norm": 0.17650875449180603, - "learning_rate": 3.372071227741331e-05, - "loss": 0.0751, + "epoch": 11.0, + "grad_norm": 1.5340383052825928, + "learning_rate": 4.698218754600324e-05, + "loss": 0.1564, "step": 4660 }, { - "epoch": 14.0, - "grad_norm": 5.125329971313477, - "learning_rate": 3.36738519212746e-05, - "loss": 0.2435, + "epoch": 11.0, + "grad_norm": 8.856043815612793, + "learning_rate": 4.696378625055204e-05, + "loss": 0.1238, "step": 4670 }, { - "epoch": 14.01, - "grad_norm": 21.372135162353516, - "learning_rate": 3.3626991565135894e-05, - "loss": 0.1759, + "epoch": 11.0, + "grad_norm": 6.075847148895264, + "learning_rate": 4.694538495510084e-05, + "loss": 0.15, "step": 4680 }, { - "epoch": 14.01, - "grad_norm": 0.13896770775318146, - "learning_rate": 3.358013120899719e-05, - "loss": 0.0661, + "epoch": 11.0, + "grad_norm": 0.534511923789978, + "learning_rate": 4.692698365964964e-05, + "loss": 0.2028, "step": 4690 }, { - "epoch": 14.01, - "grad_norm": 0.020779293030500412, - "learning_rate": 3.3533270852858485e-05, - "loss": 0.1733, + "epoch": 11.0, + "grad_norm": 19.36056137084961, + "learning_rate": 4.690858236419844e-05, + "loss": 0.2122, "step": 4700 }, { - "epoch": 14.01, - "grad_norm": 14.163589477539062, - "learning_rate": 3.348641049671978e-05, - "loss": 0.1317, + "epoch": 11.0, + "grad_norm": 0.39027050137519836, + "learning_rate": 4.689018106874724e-05, + "loss": 0.2455, "step": 4710 }, { - "epoch": 14.01, - "grad_norm": 1.3063100576400757, - "learning_rate": 3.343955014058107e-05, - "loss": 0.1116, + "epoch": 11.0, + "grad_norm": 16.614545822143555, + "learning_rate": 4.687177977329604e-05, + "loss": 0.291, "step": 4720 }, { - "epoch": 14.01, - "grad_norm": 2.3075990676879883, - "learning_rate": 3.339268978444237e-05, - "loss": 0.0674, + "epoch": 11.0, + "grad_norm": 7.031955242156982, + "learning_rate": 4.685337847784484e-05, + "loss": 0.1445, "step": 4730 }, { - "epoch": 14.01, - "grad_norm": 10.278874397277832, - "learning_rate": 3.3345829428303654e-05, - "loss": 0.2285, + "epoch": 11.0, + "grad_norm": 13.988171577453613, + "learning_rate": 4.683497718239364e-05, + "loss": 0.1988, "step": 4740 }, { - "epoch": 14.01, - "grad_norm": 28.758888244628906, - "learning_rate": 3.3298969072164947e-05, - "loss": 0.0825, + "epoch": 11.0, + "grad_norm": 11.631166458129883, + "learning_rate": 4.681657588694244e-05, + "loss": 0.2773, "step": 4750 }, { - "epoch": 14.01, - "grad_norm": 10.479151725769043, - "learning_rate": 3.3252108716026246e-05, - "loss": 0.1451, + "epoch": 11.0, + "grad_norm": 7.512355327606201, + "learning_rate": 4.679817459149124e-05, + "loss": 0.1132, "step": 4760 }, { - "epoch": 14.01, - "grad_norm": 0.03889832645654678, - "learning_rate": 3.320524835988754e-05, - "loss": 0.0653, + "epoch": 11.0, + "grad_norm": 0.6635879874229431, + "learning_rate": 4.677977329604004e-05, + "loss": 0.1799, "step": 4770 }, { - "epoch": 14.01, - "grad_norm": 1.4351927042007446, - "learning_rate": 3.315838800374883e-05, - "loss": 0.1306, + "epoch": 11.01, + "grad_norm": 59.61662292480469, + "learning_rate": 4.676137200058884e-05, + "loss": 0.2375, "step": 4780 }, { - "epoch": 14.01, - "grad_norm": 28.180889129638672, - "learning_rate": 3.311152764761012e-05, - "loss": 0.2123, + "epoch": 11.01, + "grad_norm": 9.080729484558105, + "learning_rate": 4.674297070513764e-05, + "loss": 0.1571, "step": 4790 }, { - "epoch": 14.02, - "grad_norm": 17.614906311035156, - "learning_rate": 3.306466729147142e-05, - "loss": 0.0832, + "epoch": 11.01, + "grad_norm": 11.073897361755371, + "learning_rate": 4.672456940968644e-05, + "loss": 0.2831, "step": 4800 }, { - "epoch": 14.02, - "grad_norm": 0.06731338798999786, - "learning_rate": 3.3017806935332714e-05, - "loss": 0.0583, + "epoch": 11.01, + "grad_norm": 0.20718024671077728, + "learning_rate": 4.6706168114235243e-05, + "loss": 0.247, "step": 4810 }, { - "epoch": 14.02, - "grad_norm": 1.4568157196044922, - "learning_rate": 3.2970946579194e-05, - "loss": 0.2127, + "epoch": 11.01, + "grad_norm": 6.692288875579834, + "learning_rate": 4.6687766818784044e-05, + "loss": 0.1577, "step": 4820 }, { - "epoch": 14.02, - "grad_norm": 7.513918876647949, - "learning_rate": 3.292408622305529e-05, - "loss": 0.1357, + "epoch": 11.01, + "grad_norm": 21.0302791595459, + "learning_rate": 4.666936552333285e-05, + "loss": 0.2697, "step": 4830 }, { - "epoch": 14.02, - "grad_norm": 0.8259130120277405, - "learning_rate": 3.287722586691659e-05, - "loss": 0.04, + "epoch": 11.01, + "grad_norm": 18.954795837402344, + "learning_rate": 4.6650964227881644e-05, + "loss": 0.2863, "step": 4840 }, { - "epoch": 14.02, - "grad_norm": 0.1765613704919815, - "learning_rate": 3.283036551077788e-05, - "loss": 0.086, + "epoch": 11.01, + "grad_norm": 0.38008683919906616, + "learning_rate": 4.6632562932430444e-05, + "loss": 0.2369, "step": 4850 }, { - "epoch": 14.02, - "grad_norm": 15.376742362976074, - "learning_rate": 3.2783505154639176e-05, - "loss": 0.2212, + "epoch": 11.01, + "grad_norm": 7.475613594055176, + "learning_rate": 4.661416163697925e-05, + "loss": 0.3686, "step": 4860 }, { - "epoch": 14.02, - "grad_norm": 0.4169343411922455, - "learning_rate": 3.273664479850047e-05, - "loss": 0.0478, + "epoch": 11.01, + "grad_norm": 8.689560890197754, + "learning_rate": 4.6595760341528044e-05, + "loss": 0.2117, "step": 4870 }, { - "epoch": 14.02, - "grad_norm": 43.11495590209961, - "learning_rate": 3.268978444236177e-05, - "loss": 0.096, + "epoch": 11.01, + "grad_norm": 9.824551582336426, + "learning_rate": 4.6577359046076844e-05, + "loss": 0.2091, "step": 4880 }, { - "epoch": 14.02, - "grad_norm": 15.717057228088379, - "learning_rate": 3.264292408622306e-05, - "loss": 0.154, + "epoch": 11.01, + "grad_norm": 0.5613351464271545, + "learning_rate": 4.655895775062565e-05, + "loss": 0.1785, "step": 4890 }, { - "epoch": 14.02, - "grad_norm": 24.132003784179688, - "learning_rate": 3.2596063730084345e-05, - "loss": 0.1402, + "epoch": 11.01, + "grad_norm": 15.86489486694336, + "learning_rate": 4.6540556455174444e-05, + "loss": 0.2719, "step": 4900 }, { - "epoch": 14.02, - "grad_norm": 9.180221557617188, - "learning_rate": 3.2549203373945644e-05, - "loss": 0.1438, + "epoch": 11.01, + "grad_norm": 13.821952819824219, + "learning_rate": 4.6522155159723245e-05, + "loss": 0.1034, "step": 4910 }, { - "epoch": 14.03, - "grad_norm": 17.912752151489258, - "learning_rate": 3.2502343017806936e-05, - "loss": 0.3277, + "epoch": 11.01, + "grad_norm": 22.148283004760742, + "learning_rate": 4.650375386427205e-05, + "loss": 0.3639, "step": 4920 }, { - "epoch": 14.03, - "grad_norm": 23.05970573425293, - "learning_rate": 3.245548266166823e-05, - "loss": 0.1899, + "epoch": 11.01, + "grad_norm": 10.938385963439941, + "learning_rate": 4.6485352568820845e-05, + "loss": 0.3123, "step": 4930 }, { - "epoch": 14.03, - "grad_norm": 5.295262813568115, - "learning_rate": 3.240862230552952e-05, - "loss": 0.2962, + "epoch": 11.01, + "grad_norm": 1.883072853088379, + "learning_rate": 4.6466951273369645e-05, + "loss": 0.3987, "step": 4940 }, { - "epoch": 14.03, - "grad_norm": 0.08439239114522934, - "learning_rate": 3.236176194939082e-05, - "loss": 0.3502, - "step": 4950 - }, - { - "epoch": 14.03, - "eval_accuracy": 0.7965616045845272, - "eval_loss": 0.9265322089195251, - "eval_runtime": 33.6194, - "eval_samples_per_second": 20.762, - "eval_steps_per_second": 1.755, + "epoch": 11.01, + "grad_norm": 1.4499850273132324, + "learning_rate": 4.644854997791845e-05, + "loss": 0.1556, "step": 4950 }, { - "epoch": 15.0, - "grad_norm": 27.462636947631836, - "learning_rate": 3.231490159325211e-05, - "loss": 0.1507, + "epoch": 11.01, + "grad_norm": 11.371675491333008, + "learning_rate": 4.643014868246725e-05, + "loss": 0.2313, "step": 4960 }, { - "epoch": 15.0, - "grad_norm": 0.03183213621377945, - "learning_rate": 3.2268041237113405e-05, - "loss": 0.0413, + "epoch": 11.01, + "grad_norm": 9.29699993133545, + "learning_rate": 4.6411747387016045e-05, + "loss": 0.1583, "step": 4970 }, { - "epoch": 15.0, - "grad_norm": 24.36420440673828, - "learning_rate": 3.22211808809747e-05, - "loss": 0.1499, + "epoch": 11.01, + "grad_norm": 18.879003524780273, + "learning_rate": 4.639334609156485e-05, + "loss": 0.3131, "step": 4980 }, { - "epoch": 15.0, - "grad_norm": 2.348003387451172, - "learning_rate": 3.217432052483599e-05, - "loss": 0.2091, + "epoch": 11.01, + "grad_norm": 3.0703296661376953, + "learning_rate": 4.637494479611365e-05, + "loss": 0.3215, "step": 4990 }, { - "epoch": 15.0, - "grad_norm": 14.074554443359375, - "learning_rate": 3.212746016869728e-05, - "loss": 0.1966, + "epoch": 11.01, + "grad_norm": 9.388489723205566, + "learning_rate": 4.6356543500662446e-05, + "loss": 0.2506, "step": 5000 }, { - "epoch": 15.01, - "grad_norm": 13.325398445129395, - "learning_rate": 3.2080599812558574e-05, - "loss": 0.0982, + "epoch": 11.01, + "grad_norm": 15.042057991027832, + "learning_rate": 4.6338142205211246e-05, + "loss": 0.3032, "step": 5010 }, { - "epoch": 15.01, - "grad_norm": 0.06735904514789581, - "learning_rate": 3.203373945641987e-05, - "loss": 0.0819, + "epoch": 11.01, + "grad_norm": 14.531025886535645, + "learning_rate": 4.631974090976005e-05, + "loss": 0.2766, "step": 5020 }, { - "epoch": 15.01, - "grad_norm": 7.472801685333252, - "learning_rate": 3.1986879100281165e-05, - "loss": 0.1468, + "epoch": 11.01, + "grad_norm": 13.1823148727417, + "learning_rate": 4.6301339614308846e-05, + "loss": 0.1502, "step": 5030 }, { - "epoch": 15.01, - "grad_norm": 4.233068943023682, - "learning_rate": 3.194001874414246e-05, - "loss": 0.03, + "epoch": 11.01, + "grad_norm": 12.649144172668457, + "learning_rate": 4.6282938318857646e-05, + "loss": 0.1537, "step": 5040 }, { - "epoch": 15.01, - "grad_norm": 9.645890235900879, - "learning_rate": 3.189315838800375e-05, - "loss": 0.1579, + "epoch": 11.01, + "eval_accuracy": 0.7015765765765766, + "eval_loss": 0.9116391539573669, + "eval_runtime": 41.456, + "eval_samples_per_second": 21.42, + "eval_steps_per_second": 1.785, + "step": 5040 + }, + { + "epoch": 12.0, + "grad_norm": 4.675528526306152, + "learning_rate": 4.626453702340645e-05, + "loss": 0.1384, "step": 5050 }, { - "epoch": 15.01, - "grad_norm": 18.024099349975586, - "learning_rate": 3.184629803186505e-05, - "loss": 0.1401, + "epoch": 12.0, + "grad_norm": 15.369380950927734, + "learning_rate": 4.6246135727955246e-05, + "loss": 0.1938, "step": 5060 }, { - "epoch": 15.01, - "grad_norm": 38.932167053222656, - "learning_rate": 3.1799437675726335e-05, - "loss": 0.1337, + "epoch": 12.0, + "grad_norm": 9.795681953430176, + "learning_rate": 4.6227734432504046e-05, + "loss": 0.1583, "step": 5070 }, { - "epoch": 15.01, - "grad_norm": 0.6836027503013611, - "learning_rate": 3.175257731958763e-05, - "loss": 0.1067, + "epoch": 12.0, + "grad_norm": 16.371030807495117, + "learning_rate": 4.620933313705285e-05, + "loss": 0.2502, "step": 5080 }, { - "epoch": 15.01, - "grad_norm": 13.472026824951172, - "learning_rate": 3.1705716963448926e-05, - "loss": 0.137, + "epoch": 12.0, + "grad_norm": 17.098554611206055, + "learning_rate": 4.6190931841601653e-05, + "loss": 0.1198, "step": 5090 }, { - "epoch": 15.01, - "grad_norm": 49.68353271484375, - "learning_rate": 3.165885660731022e-05, - "loss": 0.2201, + "epoch": 12.0, + "grad_norm": 1.2527225017547607, + "learning_rate": 4.617253054615045e-05, + "loss": 0.0858, "step": 5100 }, { - "epoch": 15.01, - "grad_norm": 0.10768305510282516, - "learning_rate": 3.161199625117151e-05, - "loss": 0.0503, + "epoch": 12.0, + "grad_norm": 5.850952625274658, + "learning_rate": 4.6154129250699254e-05, + "loss": 0.2589, "step": 5110 }, { - "epoch": 15.01, - "grad_norm": 0.02888442948460579, - "learning_rate": 3.15651358950328e-05, - "loss": 0.2246, + "epoch": 12.0, + "grad_norm": 3.29551100730896, + "learning_rate": 4.6135727955248054e-05, + "loss": 0.1703, "step": 5120 }, { - "epoch": 15.02, - "grad_norm": 11.260727882385254, - "learning_rate": 3.15182755388941e-05, - "loss": 0.1655, + "epoch": 12.0, + "grad_norm": 13.772591590881348, + "learning_rate": 4.611732665979685e-05, + "loss": 0.3824, "step": 5130 }, { - "epoch": 15.02, - "grad_norm": 0.16832049190998077, - "learning_rate": 3.1471415182755394e-05, - "loss": 0.2107, + "epoch": 12.0, + "grad_norm": 14.904064178466797, + "learning_rate": 4.6098925364345654e-05, + "loss": 0.1953, "step": 5140 }, { - "epoch": 15.02, - "grad_norm": 0.5747199058532715, - "learning_rate": 3.142455482661668e-05, - "loss": 0.1302, + "epoch": 12.0, + "grad_norm": 21.28123664855957, + "learning_rate": 4.6080524068894454e-05, + "loss": 0.2385, "step": 5150 }, { - "epoch": 15.02, - "grad_norm": 6.570047378540039, - "learning_rate": 3.137769447047797e-05, - "loss": 0.1102, + "epoch": 12.0, + "grad_norm": 0.9189083576202393, + "learning_rate": 4.606212277344325e-05, + "loss": 0.1396, "step": 5160 }, { - "epoch": 15.02, - "grad_norm": 0.02458810992538929, - "learning_rate": 3.133083411433927e-05, - "loss": 0.0711, + "epoch": 12.0, + "grad_norm": 12.280217170715332, + "learning_rate": 4.6043721477992054e-05, + "loss": 0.1751, "step": 5170 }, { - "epoch": 15.02, - "grad_norm": 28.233722686767578, - "learning_rate": 3.1283973758200564e-05, - "loss": 0.1243, + "epoch": 12.0, + "grad_norm": 7.960886478424072, + "learning_rate": 4.6025320182540854e-05, + "loss": 0.0838, "step": 5180 }, { - "epoch": 15.02, - "grad_norm": 34.339900970458984, - "learning_rate": 3.1237113402061856e-05, - "loss": 0.2122, + "epoch": 12.0, + "grad_norm": 3.011474370956421, + "learning_rate": 4.600691888708965e-05, + "loss": 0.1764, "step": 5190 }, { - "epoch": 15.02, - "grad_norm": 0.042488761246204376, - "learning_rate": 3.119025304592315e-05, - "loss": 0.0696, + "epoch": 12.01, + "grad_norm": 19.604820251464844, + "learning_rate": 4.5988517591638455e-05, + "loss": 0.2461, "step": 5200 }, { - "epoch": 15.02, - "grad_norm": 10.747623443603516, - "learning_rate": 3.114339268978445e-05, - "loss": 0.0881, + "epoch": 12.01, + "grad_norm": 12.652880668640137, + "learning_rate": 4.5970116296187255e-05, + "loss": 0.2306, "step": 5210 }, { - "epoch": 15.02, - "grad_norm": 0.14769670367240906, - "learning_rate": 3.109653233364574e-05, - "loss": 0.2615, + "epoch": 12.01, + "grad_norm": 18.40096092224121, + "learning_rate": 4.5951715000736055e-05, + "loss": 0.1667, "step": 5220 }, { - "epoch": 15.02, - "grad_norm": 6.627384662628174, - "learning_rate": 3.104967197750703e-05, - "loss": 0.105, + "epoch": 12.01, + "grad_norm": 13.00020980834961, + "learning_rate": 4.5933313705284855e-05, + "loss": 0.2536, "step": 5230 }, { - "epoch": 15.02, - "grad_norm": 0.06998647749423981, - "learning_rate": 3.1002811621368324e-05, - "loss": 0.1129, + "epoch": 12.01, + "grad_norm": 14.974309921264648, + "learning_rate": 4.5914912409833655e-05, + "loss": 0.2561, "step": 5240 }, { - "epoch": 15.03, - "grad_norm": 14.90441608428955, - "learning_rate": 3.0955951265229617e-05, - "loss": 0.1303, + "epoch": 12.01, + "grad_norm": 11.146673202514648, + "learning_rate": 4.5896511114382455e-05, + "loss": 0.2038, "step": 5250 }, { - "epoch": 15.03, - "grad_norm": 0.6457139849662781, - "learning_rate": 3.090909090909091e-05, - "loss": 0.0176, + "epoch": 12.01, + "grad_norm": 8.248503684997559, + "learning_rate": 4.5878109818931255e-05, + "loss": 0.2505, "step": 5260 }, { - "epoch": 15.03, - "grad_norm": 28.25990867614746, - "learning_rate": 3.08622305529522e-05, - "loss": 0.1868, + "epoch": 12.01, + "grad_norm": 11.573139190673828, + "learning_rate": 4.5859708523480055e-05, + "loss": 0.1565, "step": 5270 }, { - "epoch": 15.03, - "grad_norm": 18.335277557373047, - "learning_rate": 3.08153701968135e-05, - "loss": 0.1728, + "epoch": 12.01, + "grad_norm": 19.135541915893555, + "learning_rate": 4.5841307228028856e-05, + "loss": 0.2578, "step": 5280 }, { - "epoch": 15.03, - "eval_accuracy": 0.8209169054441261, - "eval_loss": 0.8522208333015442, - "eval_runtime": 33.7106, - "eval_samples_per_second": 20.706, - "eval_steps_per_second": 1.75, - "step": 5280 - }, - { - "epoch": 16.0, - "grad_norm": 0.026595309376716614, - "learning_rate": 3.076850984067479e-05, - "loss": 0.0047, + "epoch": 12.01, + "grad_norm": 12.795113563537598, + "learning_rate": 4.5822905932577656e-05, + "loss": 0.2591, "step": 5290 }, { - "epoch": 16.0, - "grad_norm": 16.674278259277344, - "learning_rate": 3.0721649484536085e-05, - "loss": 0.1096, + "epoch": 12.01, + "grad_norm": 16.305692672729492, + "learning_rate": 4.5804504637126456e-05, + "loss": 0.163, "step": 5300 }, { - "epoch": 16.0, - "grad_norm": 1.7340277433395386, - "learning_rate": 3.067478912839738e-05, - "loss": 0.1344, + "epoch": 12.01, + "grad_norm": 6.531643390655518, + "learning_rate": 4.5786103341675256e-05, + "loss": 0.2605, "step": 5310 }, { - "epoch": 16.0, - "grad_norm": 0.22693070769309998, - "learning_rate": 3.062792877225867e-05, - "loss": 0.1013, + "epoch": 12.01, + "grad_norm": 13.617148399353027, + "learning_rate": 4.5767702046224056e-05, + "loss": 0.2031, "step": 5320 }, { - "epoch": 16.0, - "grad_norm": 0.07706239074468613, - "learning_rate": 3.058106841611996e-05, - "loss": 0.1392, + "epoch": 12.01, + "grad_norm": 5.859495162963867, + "learning_rate": 4.5749300750772856e-05, + "loss": 0.1609, "step": 5330 }, { - "epoch": 16.01, - "grad_norm": 18.134891510009766, - "learning_rate": 3.0534208059981254e-05, - "loss": 0.06, + "epoch": 12.01, + "grad_norm": 0.675979733467102, + "learning_rate": 4.5730899455321656e-05, + "loss": 0.2702, "step": 5340 }, { - "epoch": 16.01, - "grad_norm": 1.382936716079712, - "learning_rate": 3.048734770384255e-05, - "loss": 0.1596, + "epoch": 12.01, + "grad_norm": 16.337562561035156, + "learning_rate": 4.5712498159870456e-05, + "loss": 0.1415, "step": 5350 }, { - "epoch": 16.01, - "grad_norm": 20.703399658203125, - "learning_rate": 3.0440487347703846e-05, - "loss": 0.0695, + "epoch": 12.01, + "grad_norm": 4.218993186950684, + "learning_rate": 4.5694096864419257e-05, + "loss": 0.2667, "step": 5360 }, { - "epoch": 16.01, - "grad_norm": 1.053179144859314, - "learning_rate": 3.0393626991565138e-05, - "loss": 0.1434, + "epoch": 12.01, + "grad_norm": 20.66876983642578, + "learning_rate": 4.567569556896806e-05, + "loss": 0.2478, "step": 5370 }, { - "epoch": 16.01, - "grad_norm": 0.017900103703141212, - "learning_rate": 3.0346766635426434e-05, - "loss": 0.0591, + "epoch": 12.01, + "grad_norm": 8.135565757751465, + "learning_rate": 4.565729427351686e-05, + "loss": 0.2205, "step": 5380 }, { - "epoch": 16.01, - "grad_norm": 0.056633152067661285, - "learning_rate": 3.0299906279287726e-05, - "loss": 0.1733, + "epoch": 12.01, + "grad_norm": 9.30663776397705, + "learning_rate": 4.563889297806566e-05, + "loss": 0.328, "step": 5390 }, { - "epoch": 16.01, - "grad_norm": 0.11656715720891953, - "learning_rate": 3.0253045923149015e-05, - "loss": 0.1159, + "epoch": 12.01, + "grad_norm": 16.911775588989258, + "learning_rate": 4.562049168261446e-05, + "loss": 0.2238, "step": 5400 }, { - "epoch": 16.01, - "grad_norm": 2.425400972366333, - "learning_rate": 3.020618556701031e-05, - "loss": 0.1666, + "epoch": 12.01, + "grad_norm": 4.587623119354248, + "learning_rate": 4.560209038716326e-05, + "loss": 0.0763, "step": 5410 }, { - "epoch": 16.01, - "grad_norm": 4.859916687011719, - "learning_rate": 3.0159325210871603e-05, - "loss": 0.0466, + "epoch": 12.01, + "grad_norm": 17.55312728881836, + "learning_rate": 4.558368909171206e-05, + "loss": 0.2904, "step": 5420 }, { - "epoch": 16.01, - "grad_norm": 29.941722869873047, - "learning_rate": 3.01124648547329e-05, - "loss": 0.1313, + "epoch": 12.01, + "grad_norm": 11.453413009643555, + "learning_rate": 4.556528779626086e-05, + "loss": 0.3298, "step": 5430 }, { - "epoch": 16.01, - "grad_norm": 1.4672462940216064, - "learning_rate": 3.006560449859419e-05, - "loss": 0.066, + "epoch": 12.01, + "grad_norm": 3.5977048873901367, + "learning_rate": 4.554688650080966e-05, + "loss": 0.331, "step": 5440 }, { - "epoch": 16.01, - "grad_norm": 20.58746910095215, - "learning_rate": 3.0018744142455487e-05, - "loss": 0.0488, + "epoch": 12.01, + "grad_norm": 8.736082077026367, + "learning_rate": 4.5528485205358464e-05, + "loss": 0.1551, "step": 5450 }, { - "epoch": 16.02, - "grad_norm": 19.143203735351562, - "learning_rate": 2.997188378631678e-05, - "loss": 0.1403, + "epoch": 12.01, + "grad_norm": 3.3765244483947754, + "learning_rate": 4.551008390990726e-05, + "loss": 0.1113, "step": 5460 }, { - "epoch": 16.02, - "grad_norm": 21.062692642211914, - "learning_rate": 2.9925023430178075e-05, - "loss": 0.0366, + "epoch": 12.01, + "eval_accuracy": 0.6970720720720721, + "eval_loss": 1.00467848777771, + "eval_runtime": 41.2332, + "eval_samples_per_second": 21.536, + "eval_steps_per_second": 1.795, + "step": 5460 + }, + { + "epoch": 13.0, + "grad_norm": 23.630643844604492, + "learning_rate": 4.549168261445606e-05, + "loss": 0.083, "step": 5470 }, { - "epoch": 16.02, - "grad_norm": 1.4022785425186157, - "learning_rate": 2.987816307403936e-05, - "loss": 0.0344, + "epoch": 13.0, + "grad_norm": 2.3666303157806396, + "learning_rate": 4.5473281319004865e-05, + "loss": 0.1436, "step": 5480 }, { - "epoch": 16.02, - "grad_norm": 0.7755998373031616, - "learning_rate": 2.9831302717900656e-05, - "loss": 0.1005, + "epoch": 13.0, + "grad_norm": 22.300064086914062, + "learning_rate": 4.545488002355366e-05, + "loss": 0.1631, "step": 5490 }, { - "epoch": 16.02, - "grad_norm": 0.02092009223997593, - "learning_rate": 2.9784442361761948e-05, - "loss": 0.1796, + "epoch": 13.0, + "grad_norm": 14.8043212890625, + "learning_rate": 4.543647872810246e-05, + "loss": 0.1613, "step": 5500 }, { - "epoch": 16.02, - "grad_norm": 27.31386375427246, - "learning_rate": 2.9737582005623244e-05, - "loss": 0.1263, + "epoch": 13.0, + "grad_norm": 0.10374309122562408, + "learning_rate": 4.5418077432651265e-05, + "loss": 0.1761, "step": 5510 }, { - "epoch": 16.02, - "grad_norm": 3.5492141246795654, - "learning_rate": 2.9690721649484536e-05, - "loss": 0.2096, + "epoch": 13.0, + "grad_norm": 8.43531322479248, + "learning_rate": 4.539967613720006e-05, + "loss": 0.1712, "step": 5520 }, { - "epoch": 16.02, - "grad_norm": 23.727418899536133, - "learning_rate": 2.9643861293345832e-05, - "loss": 0.1176, + "epoch": 13.0, + "grad_norm": 8.424771308898926, + "learning_rate": 4.538127484174886e-05, + "loss": 0.2752, "step": 5530 }, { - "epoch": 16.02, - "grad_norm": 2.065145492553711, - "learning_rate": 2.9597000937207124e-05, - "loss": 0.0873, + "epoch": 13.0, + "grad_norm": 12.37260913848877, + "learning_rate": 4.5362873546297665e-05, + "loss": 0.2324, "step": 5540 }, { - "epoch": 16.02, - "grad_norm": 1.5445265769958496, - "learning_rate": 2.955014058106842e-05, - "loss": 0.1622, + "epoch": 13.0, + "grad_norm": 9.709940910339355, + "learning_rate": 4.534447225084646e-05, + "loss": 0.2579, "step": 5550 }, { - "epoch": 16.02, - "grad_norm": 3.4327638149261475, - "learning_rate": 2.9503280224929712e-05, - "loss": 0.1375, + "epoch": 13.0, + "grad_norm": 20.865863800048828, + "learning_rate": 4.532607095539526e-05, + "loss": 0.3222, "step": 5560 }, { - "epoch": 16.02, - "grad_norm": 0.018016502261161804, - "learning_rate": 2.9456419868791e-05, - "loss": 0.0712, + "epoch": 13.0, + "grad_norm": 12.145430564880371, + "learning_rate": 4.5307669659944066e-05, + "loss": 0.1261, "step": 5570 }, { - "epoch": 16.03, - "grad_norm": 11.649871826171875, - "learning_rate": 2.9409559512652297e-05, - "loss": 0.1606, + "epoch": 13.0, + "grad_norm": 7.941616058349609, + "learning_rate": 4.5289268364492866e-05, + "loss": 0.1429, "step": 5580 }, { - "epoch": 16.03, - "grad_norm": 17.827857971191406, - "learning_rate": 2.936269915651359e-05, - "loss": 0.1209, + "epoch": 13.0, + "grad_norm": 4.826683521270752, + "learning_rate": 4.527086706904166e-05, + "loss": 0.2266, "step": 5590 }, { - "epoch": 16.03, - "grad_norm": 0.01631985232234001, - "learning_rate": 2.9315838800374885e-05, - "loss": 0.0295, + "epoch": 13.0, + "grad_norm": 19.701143264770508, + "learning_rate": 4.5252465773590466e-05, + "loss": 0.2436, "step": 5600 }, { - "epoch": 16.03, - "grad_norm": 48.93087387084961, - "learning_rate": 2.9268978444236177e-05, - "loss": 0.0542, - "step": 5610 - }, - { - "epoch": 16.03, - "eval_accuracy": 0.8051575931232091, - "eval_loss": 1.0106927156448364, - "eval_runtime": 33.4281, - "eval_samples_per_second": 20.881, - "eval_steps_per_second": 1.765, + "epoch": 13.0, + "grad_norm": 5.625741958618164, + "learning_rate": 4.5234064478139266e-05, + "loss": 0.1171, "step": 5610 }, { - "epoch": 17.0, - "grad_norm": 3.7330472469329834, - "learning_rate": 2.9222118088097473e-05, - "loss": 0.1299, + "epoch": 13.01, + "grad_norm": 2.8478872776031494, + "learning_rate": 4.521566318268806e-05, + "loss": 0.3495, "step": 5620 }, { - "epoch": 17.0, - "grad_norm": 1.1770386695861816, - "learning_rate": 2.9175257731958765e-05, - "loss": 0.1494, + "epoch": 13.01, + "grad_norm": 7.616860866546631, + "learning_rate": 4.5197261887236866e-05, + "loss": 0.177, "step": 5630 }, { - "epoch": 17.0, - "grad_norm": 6.161425590515137, - "learning_rate": 2.912839737582006e-05, - "loss": 0.0633, + "epoch": 13.01, + "grad_norm": 35.53705978393555, + "learning_rate": 4.5178860591785667e-05, + "loss": 0.1834, "step": 5640 }, { - "epoch": 17.0, - "grad_norm": 0.5936900973320007, - "learning_rate": 2.908153701968135e-05, - "loss": 0.0759, + "epoch": 13.01, + "grad_norm": 0.2759915888309479, + "learning_rate": 4.516045929633446e-05, + "loss": 0.0909, "step": 5650 }, { - "epoch": 17.0, - "grad_norm": 0.04893672466278076, - "learning_rate": 2.9034676663542642e-05, - "loss": 0.0792, + "epoch": 13.01, + "grad_norm": 5.422901630401611, + "learning_rate": 4.514205800088327e-05, + "loss": 0.0757, "step": 5660 }, { - "epoch": 17.01, - "grad_norm": 6.019880294799805, - "learning_rate": 2.8987816307403938e-05, - "loss": 0.1079, + "epoch": 13.01, + "grad_norm": 1.9962157011032104, + "learning_rate": 4.512365670543207e-05, + "loss": 0.0337, "step": 5670 }, { - "epoch": 17.01, - "grad_norm": 45.10232162475586, - "learning_rate": 2.894095595126523e-05, - "loss": 0.0538, + "epoch": 13.01, + "grad_norm": 1.336719274520874, + "learning_rate": 4.510525540998086e-05, + "loss": 0.1988, "step": 5680 }, { - "epoch": 17.01, - "grad_norm": 52.10890197753906, - "learning_rate": 2.8894095595126526e-05, - "loss": 0.0725, + "epoch": 13.01, + "grad_norm": 19.26902961730957, + "learning_rate": 4.508685411452967e-05, + "loss": 0.2792, "step": 5690 }, { - "epoch": 17.01, - "grad_norm": 0.03488897159695625, - "learning_rate": 2.8847235238987818e-05, - "loss": 0.1523, + "epoch": 13.01, + "grad_norm": 0.8596954941749573, + "learning_rate": 4.506845281907847e-05, + "loss": 0.192, "step": 5700 }, { - "epoch": 17.01, - "grad_norm": 0.07146196067333221, - "learning_rate": 2.8800374882849114e-05, - "loss": 0.0272, + "epoch": 13.01, + "grad_norm": 2.8119475841522217, + "learning_rate": 4.505005152362727e-05, + "loss": 0.0473, "step": 5710 }, { - "epoch": 17.01, - "grad_norm": 9.103580474853516, - "learning_rate": 2.8753514526710406e-05, - "loss": 0.0956, + "epoch": 13.01, + "grad_norm": 1.8721100091934204, + "learning_rate": 4.503165022817607e-05, + "loss": 0.1026, "step": 5720 }, { - "epoch": 17.01, - "grad_norm": 0.010441714897751808, - "learning_rate": 2.8706654170571695e-05, - "loss": 0.1186, + "epoch": 13.01, + "grad_norm": 40.60285568237305, + "learning_rate": 4.501324893272487e-05, + "loss": 0.2544, "step": 5730 }, { - "epoch": 17.01, - "grad_norm": 51.32048797607422, - "learning_rate": 2.865979381443299e-05, - "loss": 0.0991, + "epoch": 13.01, + "grad_norm": 14.672572135925293, + "learning_rate": 4.499484763727367e-05, + "loss": 0.192, "step": 5740 }, { - "epoch": 17.01, - "grad_norm": 17.720178604125977, - "learning_rate": 2.8612933458294283e-05, - "loss": 0.0702, + "epoch": 13.01, + "grad_norm": 10.472712516784668, + "learning_rate": 4.497644634182247e-05, + "loss": 0.4008, "step": 5750 }, { - "epoch": 17.01, - "grad_norm": 31.01833724975586, - "learning_rate": 2.856607310215558e-05, - "loss": 0.175, + "epoch": 13.01, + "grad_norm": 4.290433406829834, + "learning_rate": 4.495804504637127e-05, + "loss": 0.3308, "step": 5760 }, { - "epoch": 17.01, - "grad_norm": 0.02792373113334179, - "learning_rate": 2.851921274601687e-05, - "loss": 0.0858, + "epoch": 13.01, + "grad_norm": 9.656917572021484, + "learning_rate": 4.493964375092007e-05, + "loss": 0.2224, "step": 5770 }, { - "epoch": 17.01, - "grad_norm": 48.3105583190918, - "learning_rate": 2.8472352389878167e-05, - "loss": 0.2053, + "epoch": 13.01, + "grad_norm": 8.347408294677734, + "learning_rate": 4.492124245546887e-05, + "loss": 0.1097, "step": 5780 }, { - "epoch": 17.02, - "grad_norm": 9.974466323852539, - "learning_rate": 2.842549203373946e-05, - "loss": 0.0544, + "epoch": 13.01, + "grad_norm": 7.9891743659973145, + "learning_rate": 4.490284116001767e-05, + "loss": 0.1013, "step": 5790 }, { - "epoch": 17.02, - "grad_norm": 0.7237229347229004, - "learning_rate": 2.8378631677600755e-05, - "loss": 0.1306, + "epoch": 13.01, + "grad_norm": 29.41997528076172, + "learning_rate": 4.488443986456647e-05, + "loss": 0.1906, "step": 5800 }, { - "epoch": 17.02, - "grad_norm": 14.930671691894531, - "learning_rate": 2.833177132146204e-05, - "loss": 0.1952, + "epoch": 13.01, + "grad_norm": 0.2109886109828949, + "learning_rate": 4.486603856911527e-05, + "loss": 0.2282, "step": 5810 }, { - "epoch": 17.02, - "grad_norm": 0.019687309861183167, - "learning_rate": 2.8284910965323336e-05, - "loss": 0.1069, + "epoch": 13.01, + "grad_norm": 4.410977840423584, + "learning_rate": 4.484763727366407e-05, + "loss": 0.2242, "step": 5820 }, { - "epoch": 17.02, - "grad_norm": 5.928595066070557, - "learning_rate": 2.823805060918463e-05, - "loss": 0.0161, + "epoch": 13.01, + "grad_norm": 10.800416946411133, + "learning_rate": 4.482923597821287e-05, + "loss": 0.1689, "step": 5830 }, { - "epoch": 17.02, - "grad_norm": 0.03599075973033905, - "learning_rate": 2.8191190253045924e-05, - "loss": 0.09, + "epoch": 13.01, + "grad_norm": 15.845876693725586, + "learning_rate": 4.481083468276167e-05, + "loss": 0.2048, "step": 5840 }, { - "epoch": 17.02, - "grad_norm": 0.05025621876120567, - "learning_rate": 2.8144329896907216e-05, - "loss": 0.0732, + "epoch": 13.01, + "grad_norm": 5.1937785148620605, + "learning_rate": 4.479243338731047e-05, + "loss": 0.1652, "step": 5850 }, { - "epoch": 17.02, - "grad_norm": 40.76129150390625, - "learning_rate": 2.8097469540768512e-05, - "loss": 0.0456, + "epoch": 13.01, + "grad_norm": 0.8186588883399963, + "learning_rate": 4.477403209185927e-05, + "loss": 0.1132, "step": 5860 }, { - "epoch": 17.02, - "grad_norm": 1.6205233335494995, - "learning_rate": 2.8050609184629804e-05, - "loss": 0.0213, + "epoch": 13.01, + "grad_norm": 34.49995803833008, + "learning_rate": 4.475563079640807e-05, + "loss": 0.1222, "step": 5870 }, { - "epoch": 17.02, - "grad_norm": 56.61774444580078, - "learning_rate": 2.80037488284911e-05, - "loss": 0.2101, + "epoch": 13.01, + "grad_norm": 4.207097053527832, + "learning_rate": 4.473722950095687e-05, + "loss": 0.3247, + "step": 5880 + }, + { + "epoch": 13.01, + "eval_accuracy": 0.6846846846846847, + "eval_loss": 1.2167141437530518, + "eval_runtime": 40.5277, + "eval_samples_per_second": 21.911, + "eval_steps_per_second": 1.826, "step": 5880 }, { - "epoch": 17.02, - "grad_norm": 0.4398052394390106, - "learning_rate": 2.7956888472352392e-05, - "loss": 0.1504, + "epoch": 14.0, + "grad_norm": 0.39019277691841125, + "learning_rate": 4.471882820550567e-05, + "loss": 0.1095, "step": 5890 }, { - "epoch": 17.02, - "grad_norm": 3.440358877182007, - "learning_rate": 2.791002811621368e-05, - "loss": 0.0786, + "epoch": 14.0, + "grad_norm": 14.054343223571777, + "learning_rate": 4.470042691005447e-05, + "loss": 0.1827, "step": 5900 }, { - "epoch": 17.03, - "grad_norm": 0.02337775193154812, - "learning_rate": 2.7863167760074977e-05, - "loss": 0.1404, + "epoch": 14.0, + "grad_norm": 18.15593147277832, + "learning_rate": 4.468202561460327e-05, + "loss": 0.1991, "step": 5910 }, { - "epoch": 17.03, - "grad_norm": 36.56870651245117, - "learning_rate": 2.781630740393627e-05, - "loss": 0.0849, + "epoch": 14.0, + "grad_norm": 1.0667266845703125, + "learning_rate": 4.466362431915207e-05, + "loss": 0.0281, "step": 5920 }, { - "epoch": 17.03, - "grad_norm": 0.021380068734288216, - "learning_rate": 2.7769447047797565e-05, - "loss": 0.0257, + "epoch": 14.0, + "grad_norm": 0.5443500876426697, + "learning_rate": 4.464522302370087e-05, + "loss": 0.2, "step": 5930 }, { - "epoch": 17.03, - "grad_norm": 0.030316907912492752, - "learning_rate": 2.7722586691658857e-05, - "loss": 0.0711, - "step": 5940 - }, - { - "epoch": 17.03, - "eval_accuracy": 0.8080229226361032, - "eval_loss": 0.9795148968696594, - "eval_runtime": 34.1521, - "eval_samples_per_second": 20.438, - "eval_steps_per_second": 1.728, + "epoch": 14.0, + "grad_norm": 1.438390851020813, + "learning_rate": 4.462682172824968e-05, + "loss": 0.1323, "step": 5940 }, { - "epoch": 18.0, - "grad_norm": 0.007565322797745466, - "learning_rate": 2.7675726335520153e-05, - "loss": 0.0015, + "epoch": 14.0, + "grad_norm": 0.20778608322143555, + "learning_rate": 4.460842043279847e-05, + "loss": 0.192, "step": 5950 }, { - "epoch": 18.0, - "grad_norm": 0.006239545065909624, - "learning_rate": 2.7628865979381445e-05, - "loss": 0.0589, + "epoch": 14.0, + "grad_norm": 21.353769302368164, + "learning_rate": 4.459001913734727e-05, + "loss": 0.1008, "step": 5960 }, { - "epoch": 18.0, - "grad_norm": 0.024090183898806572, - "learning_rate": 2.758200562324274e-05, - "loss": 0.0892, + "epoch": 14.0, + "grad_norm": 10.89686393737793, + "learning_rate": 4.457161784189608e-05, + "loss": 0.156, "step": 5970 }, { - "epoch": 18.0, - "grad_norm": 8.415689468383789, - "learning_rate": 2.753514526710403e-05, - "loss": 0.1953, + "epoch": 14.0, + "grad_norm": 6.522188663482666, + "learning_rate": 4.455321654644487e-05, + "loss": 0.2051, "step": 5980 }, { - "epoch": 18.0, - "grad_norm": 0.46405917406082153, - "learning_rate": 2.7488284910965322e-05, - "loss": 0.0365, + "epoch": 14.0, + "grad_norm": 17.12128257751465, + "learning_rate": 4.453481525099367e-05, + "loss": 0.2968, "step": 5990 }, { - "epoch": 18.01, - "grad_norm": 0.320950984954834, - "learning_rate": 2.7441424554826618e-05, - "loss": 0.1011, + "epoch": 14.0, + "grad_norm": 10.827217102050781, + "learning_rate": 4.451641395554248e-05, + "loss": 0.0437, "step": 6000 }, { - "epoch": 18.01, - "grad_norm": 25.849971771240234, - "learning_rate": 2.739456419868791e-05, - "loss": 0.0787, + "epoch": 14.0, + "grad_norm": 29.041154861450195, + "learning_rate": 4.449801266009127e-05, + "loss": 0.1536, "step": 6010 }, { - "epoch": 18.01, - "grad_norm": 0.018421674147248268, - "learning_rate": 2.7347703842549206e-05, - "loss": 0.0032, + "epoch": 14.0, + "grad_norm": 5.939600944519043, + "learning_rate": 4.447961136464007e-05, + "loss": 0.147, "step": 6020 }, { - "epoch": 18.01, - "grad_norm": 0.02502196654677391, - "learning_rate": 2.73008434864105e-05, - "loss": 0.173, + "epoch": 14.0, + "grad_norm": 24.224306106567383, + "learning_rate": 4.446121006918888e-05, + "loss": 0.2939, "step": 6030 }, { - "epoch": 18.01, - "grad_norm": 0.05541960895061493, - "learning_rate": 2.7253983130271794e-05, - "loss": 0.1086, + "epoch": 14.01, + "grad_norm": 7.354557991027832, + "learning_rate": 4.444280877373767e-05, + "loss": 0.132, "step": 6040 }, { - "epoch": 18.01, - "grad_norm": 1.3273464441299438, - "learning_rate": 2.7207122774133086e-05, - "loss": 0.0214, + "epoch": 14.01, + "grad_norm": 14.240554809570312, + "learning_rate": 4.442440747828647e-05, + "loss": 0.0965, "step": 6050 }, { - "epoch": 18.01, - "grad_norm": 27.578462600708008, - "learning_rate": 2.7160262417994375e-05, - "loss": 0.0767, + "epoch": 14.01, + "grad_norm": 10.489018440246582, + "learning_rate": 4.440600618283527e-05, + "loss": 0.2011, "step": 6060 }, { - "epoch": 18.01, - "grad_norm": 1.713114857673645, - "learning_rate": 2.711340206185567e-05, - "loss": 0.1884, + "epoch": 14.01, + "grad_norm": 17.361114501953125, + "learning_rate": 4.438760488738408e-05, + "loss": 0.2374, "step": 6070 }, { - "epoch": 18.01, - "grad_norm": 0.0781659409403801, - "learning_rate": 2.7066541705716963e-05, - "loss": 0.1062, + "epoch": 14.01, + "grad_norm": 2.982257604598999, + "learning_rate": 4.436920359193287e-05, + "loss": 0.2142, "step": 6080 }, { - "epoch": 18.01, - "grad_norm": 61.0845832824707, - "learning_rate": 2.701968134957826e-05, - "loss": 0.0516, + "epoch": 14.01, + "grad_norm": 0.11127086728811264, + "learning_rate": 4.435080229648167e-05, + "loss": 0.1552, "step": 6090 }, { - "epoch": 18.01, - "grad_norm": 9.772195816040039, - "learning_rate": 2.697282099343955e-05, - "loss": 0.0589, + "epoch": 14.01, + "grad_norm": 2.568547487258911, + "learning_rate": 4.433240100103048e-05, + "loss": 0.2232, "step": 6100 }, { - "epoch": 18.01, - "grad_norm": 0.18929840624332428, - "learning_rate": 2.6925960637300847e-05, - "loss": 0.1057, + "epoch": 14.01, + "grad_norm": 0.16568288207054138, + "learning_rate": 4.431399970557927e-05, + "loss": 0.2059, "step": 6110 }, { - "epoch": 18.02, - "grad_norm": 0.020669307559728622, - "learning_rate": 2.687910028116214e-05, - "loss": 0.0687, + "epoch": 14.01, + "grad_norm": 7.591290473937988, + "learning_rate": 4.429559841012807e-05, + "loss": 0.1485, "step": 6120 }, { - "epoch": 18.02, - "grad_norm": 5.4998016357421875, - "learning_rate": 2.6832239925023435e-05, - "loss": 0.2466, + "epoch": 14.01, + "grad_norm": 11.021584510803223, + "learning_rate": 4.427719711467688e-05, + "loss": 0.149, "step": 6130 }, { - "epoch": 18.02, - "grad_norm": 2.288482427597046, - "learning_rate": 2.6785379568884727e-05, - "loss": 0.0868, + "epoch": 14.01, + "grad_norm": 0.6921817064285278, + "learning_rate": 4.425879581922567e-05, + "loss": 0.2096, "step": 6140 }, { - "epoch": 18.02, - "grad_norm": 0.02619881182909012, - "learning_rate": 2.6738519212746016e-05, - "loss": 0.0071, + "epoch": 14.01, + "grad_norm": 5.8809661865234375, + "learning_rate": 4.424039452377447e-05, + "loss": 0.1371, "step": 6150 }, { - "epoch": 18.02, - "grad_norm": 0.011477050371468067, - "learning_rate": 2.669165885660731e-05, - "loss": 0.0421, + "epoch": 14.01, + "grad_norm": 2.2032387256622314, + "learning_rate": 4.422199322832328e-05, + "loss": 0.086, "step": 6160 }, { - "epoch": 18.02, - "grad_norm": 0.03269781917333603, - "learning_rate": 2.6644798500468604e-05, - "loss": 0.0465, + "epoch": 14.01, + "grad_norm": 22.69715118408203, + "learning_rate": 4.420359193287207e-05, + "loss": 0.1365, "step": 6170 }, { - "epoch": 18.02, - "grad_norm": 0.03204688802361488, - "learning_rate": 2.6597938144329897e-05, - "loss": 0.0769, + "epoch": 14.01, + "grad_norm": 20.44197654724121, + "learning_rate": 4.418519063742087e-05, + "loss": 0.0839, "step": 6180 }, { - "epoch": 18.02, - "grad_norm": 28.57611656188965, - "learning_rate": 2.6551077788191192e-05, - "loss": 0.0488, + "epoch": 14.01, + "grad_norm": 25.620283126831055, + "learning_rate": 4.416678934196968e-05, + "loss": 0.1959, "step": 6190 }, { - "epoch": 18.02, - "grad_norm": 27.48614501953125, - "learning_rate": 2.6504217432052485e-05, - "loss": 0.0817, + "epoch": 14.01, + "grad_norm": 42.260128021240234, + "learning_rate": 4.414838804651848e-05, + "loss": 0.2688, "step": 6200 }, { - "epoch": 18.02, - "grad_norm": 37.647117614746094, - "learning_rate": 2.645735707591378e-05, - "loss": 0.0928, + "epoch": 14.01, + "grad_norm": 13.617227554321289, + "learning_rate": 4.412998675106727e-05, + "loss": 0.2601, "step": 6210 }, { - "epoch": 18.02, - "grad_norm": 0.01060717087239027, - "learning_rate": 2.6410496719775073e-05, - "loss": 0.034, + "epoch": 14.01, + "grad_norm": 9.865835189819336, + "learning_rate": 4.411158545561608e-05, + "loss": 0.3127, "step": 6220 }, { - "epoch": 18.02, - "grad_norm": 0.24268437922000885, - "learning_rate": 2.636363636363636e-05, - "loss": 0.108, + "epoch": 14.01, + "grad_norm": 3.797058343887329, + "learning_rate": 4.409318416016488e-05, + "loss": 0.201, "step": 6230 }, { - "epoch": 18.03, - "grad_norm": 0.00554469833150506, - "learning_rate": 2.6316776007497657e-05, - "loss": 0.1068, + "epoch": 14.01, + "grad_norm": 4.668950080871582, + "learning_rate": 4.4074782864713673e-05, + "loss": 0.1756, "step": 6240 }, { - "epoch": 18.03, - "grad_norm": 10.710355758666992, - "learning_rate": 2.626991565135895e-05, - "loss": 0.0637, + "epoch": 14.01, + "grad_norm": 21.492088317871094, + "learning_rate": 4.405638156926248e-05, + "loss": 0.2038, "step": 6250 }, { - "epoch": 18.03, - "grad_norm": 23.989526748657227, - "learning_rate": 2.6223055295220245e-05, - "loss": 0.0926, + "epoch": 14.01, + "grad_norm": 11.907729148864746, + "learning_rate": 4.403798027381128e-05, + "loss": 0.235, "step": 6260 }, { - "epoch": 18.03, - "grad_norm": 0.2239922732114792, - "learning_rate": 2.6176194939081538e-05, - "loss": 0.0287, - "step": 6270 - }, - { - "epoch": 18.03, - "eval_accuracy": 0.8094555873925502, - "eval_loss": 1.1470834016799927, - "eval_runtime": 34.1919, - "eval_samples_per_second": 20.414, - "eval_steps_per_second": 1.726, + "epoch": 14.01, + "grad_norm": 13.219657897949219, + "learning_rate": 4.4019578978360074e-05, + "loss": 0.2846, "step": 6270 }, { - "epoch": 19.0, - "grad_norm": 0.3733896315097809, - "learning_rate": 2.6129334582942833e-05, - "loss": 0.0541, + "epoch": 14.01, + "grad_norm": 3.140854597091675, + "learning_rate": 4.400117768290888e-05, + "loss": 0.4261, "step": 6280 }, { - "epoch": 19.0, - "grad_norm": 0.011687755584716797, - "learning_rate": 2.6082474226804126e-05, - "loss": 0.0725, + "epoch": 14.01, + "grad_norm": 6.707206726074219, + "learning_rate": 4.398277638745768e-05, + "loss": 0.106, "step": 6290 }, { - "epoch": 19.0, - "grad_norm": 15.612820625305176, - "learning_rate": 2.603561387066542e-05, - "loss": 0.0287, + "epoch": 14.01, + "grad_norm": 2.215766429901123, + "learning_rate": 4.3964375092006474e-05, + "loss": 0.171, "step": 6300 }, { - "epoch": 19.0, - "grad_norm": 0.0473637618124485, - "learning_rate": 2.598875351452671e-05, - "loss": 0.128, + "epoch": 14.01, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 0.9336600303649902, + "eval_runtime": 40.7056, + "eval_samples_per_second": 21.815, + "eval_steps_per_second": 1.818, + "step": 6300 + }, + { + "epoch": 15.0, + "grad_norm": 1.4444656372070312, + "learning_rate": 4.394597379655528e-05, + "loss": 0.2148, "step": 6310 }, { - "epoch": 19.0, - "grad_norm": 14.158834457397461, - "learning_rate": 2.5941893158388003e-05, - "loss": 0.0102, + "epoch": 15.0, + "grad_norm": 14.322981834411621, + "learning_rate": 4.392757250110408e-05, + "loss": 0.1187, "step": 6320 }, { - "epoch": 19.01, - "grad_norm": 0.03251667320728302, - "learning_rate": 2.58950328022493e-05, - "loss": 0.0536, + "epoch": 15.0, + "grad_norm": 17.432512283325195, + "learning_rate": 4.390917120565288e-05, + "loss": 0.2358, "step": 6330 }, { - "epoch": 19.01, - "grad_norm": 2.301870822906494, - "learning_rate": 2.584817244611059e-05, - "loss": 0.1364, + "epoch": 15.0, + "grad_norm": 10.29634952545166, + "learning_rate": 4.389076991020168e-05, + "loss": 0.1578, "step": 6340 }, { - "epoch": 19.01, - "grad_norm": 36.87099075317383, - "learning_rate": 2.5801312089971886e-05, - "loss": 0.1997, + "epoch": 15.0, + "grad_norm": 0.3197493255138397, + "learning_rate": 4.387236861475048e-05, + "loss": 0.1072, "step": 6350 }, { - "epoch": 19.01, - "grad_norm": 0.012598090805113316, - "learning_rate": 2.575445173383318e-05, - "loss": 0.1696, + "epoch": 15.0, + "grad_norm": 0.4721217751502991, + "learning_rate": 4.385396731929928e-05, + "loss": 0.3094, "step": 6360 }, { - "epoch": 19.01, - "grad_norm": 0.016358235850930214, - "learning_rate": 2.5707591377694474e-05, - "loss": 0.0671, + "epoch": 15.0, + "grad_norm": 20.208770751953125, + "learning_rate": 4.383556602384808e-05, + "loss": 0.1157, "step": 6370 }, { - "epoch": 19.01, - "grad_norm": 3.8293418884277344, - "learning_rate": 2.5660731021555767e-05, - "loss": 0.0023, + "epoch": 15.0, + "grad_norm": 15.79928207397461, + "learning_rate": 4.381716472839688e-05, + "loss": 0.251, "step": 6380 }, { - "epoch": 19.01, - "grad_norm": 32.61289978027344, - "learning_rate": 2.5613870665417056e-05, - "loss": 0.0762, + "epoch": 15.0, + "grad_norm": 0.06318643689155579, + "learning_rate": 4.379876343294568e-05, + "loss": 0.1641, "step": 6390 }, { - "epoch": 19.01, - "grad_norm": 0.12384048849344254, - "learning_rate": 2.556701030927835e-05, - "loss": 0.1048, + "epoch": 15.0, + "grad_norm": 18.822420120239258, + "learning_rate": 4.378036213749448e-05, + "loss": 0.1197, "step": 6400 }, { - "epoch": 19.01, - "grad_norm": 34.71640396118164, - "learning_rate": 2.5520149953139644e-05, - "loss": 0.075, + "epoch": 15.0, + "grad_norm": 0.14875183999538422, + "learning_rate": 4.376196084204328e-05, + "loss": 0.1705, "step": 6410 }, { - "epoch": 19.01, - "grad_norm": 0.02384701929986477, - "learning_rate": 2.547328959700094e-05, - "loss": 0.0328, + "epoch": 15.0, + "grad_norm": 18.02042579650879, + "learning_rate": 4.374355954659208e-05, + "loss": 0.1298, "step": 6420 }, { - "epoch": 19.01, - "grad_norm": 0.028293780982494354, - "learning_rate": 2.542642924086223e-05, - "loss": 0.0533, + "epoch": 15.0, + "grad_norm": 14.635958671569824, + "learning_rate": 4.372515825114088e-05, + "loss": 0.1821, "step": 6430 }, { - "epoch": 19.01, - "grad_norm": 0.06875205785036087, - "learning_rate": 2.5379568884723527e-05, - "loss": 0.0736, + "epoch": 15.0, + "grad_norm": 0.13969016075134277, + "learning_rate": 4.370675695568968e-05, + "loss": 0.1356, "step": 6440 }, { - "epoch": 19.02, - "grad_norm": 0.020702671259641647, - "learning_rate": 2.533270852858482e-05, - "loss": 0.1326, + "epoch": 15.0, + "grad_norm": 0.7694371342658997, + "learning_rate": 4.368835566023848e-05, + "loss": 0.1752, "step": 6450 }, { - "epoch": 19.02, - "grad_norm": 27.13255500793457, - "learning_rate": 2.5285848172446115e-05, - "loss": 0.1638, + "epoch": 15.01, + "grad_norm": 0.2148592621088028, + "learning_rate": 4.366995436478728e-05, + "loss": 0.117, "step": 6460 }, { - "epoch": 19.02, - "grad_norm": 0.04475679248571396, - "learning_rate": 2.5238987816307408e-05, - "loss": 0.0483, + "epoch": 15.01, + "grad_norm": 11.844324111938477, + "learning_rate": 4.365155306933608e-05, + "loss": 0.2095, "step": 6470 }, { - "epoch": 19.02, - "grad_norm": 0.3056570291519165, - "learning_rate": 2.5192127460168697e-05, - "loss": 0.0657, + "epoch": 15.01, + "grad_norm": 8.03518009185791, + "learning_rate": 4.363315177388488e-05, + "loss": 0.0854, "step": 6480 }, { - "epoch": 19.02, - "grad_norm": 0.01589689962565899, - "learning_rate": 2.514526710402999e-05, - "loss": 0.033, + "epoch": 15.01, + "grad_norm": 1.3339135646820068, + "learning_rate": 4.361475047843368e-05, + "loss": 0.143, "step": 6490 }, { - "epoch": 19.02, - "grad_norm": 0.02305714786052704, - "learning_rate": 2.5098406747891285e-05, - "loss": 0.1051, + "epoch": 15.01, + "grad_norm": 0.5493175387382507, + "learning_rate": 4.359634918298248e-05, + "loss": 0.1753, "step": 6500 }, { - "epoch": 19.02, - "grad_norm": 0.0253700353205204, - "learning_rate": 2.5051546391752577e-05, - "loss": 0.0328, + "epoch": 15.01, + "grad_norm": 15.884575843811035, + "learning_rate": 4.357794788753128e-05, + "loss": 0.1704, "step": 6510 }, { - "epoch": 19.02, - "grad_norm": 46.60805892944336, - "learning_rate": 2.5004686035613873e-05, - "loss": 0.0829, + "epoch": 15.01, + "grad_norm": 19.157939910888672, + "learning_rate": 4.3559546592080083e-05, + "loss": 0.1545, "step": 6520 }, { - "epoch": 19.02, - "grad_norm": 0.07593350857496262, - "learning_rate": 2.4957825679475165e-05, - "loss": 0.0278, + "epoch": 15.01, + "grad_norm": 22.01847267150879, + "learning_rate": 4.3541145296628884e-05, + "loss": 0.2814, "step": 6530 }, { - "epoch": 19.02, - "grad_norm": 0.011305336840450764, - "learning_rate": 2.4910965323336457e-05, - "loss": 0.1844, + "epoch": 15.01, + "grad_norm": 3.600910186767578, + "learning_rate": 4.3522744001177684e-05, + "loss": 0.1142, "step": 6540 }, { - "epoch": 19.02, - "grad_norm": 0.02104657143354416, - "learning_rate": 2.4864104967197753e-05, - "loss": 0.0412, + "epoch": 15.01, + "grad_norm": 0.21439813077449799, + "learning_rate": 4.3504342705726484e-05, + "loss": 0.2112, "step": 6550 }, { - "epoch": 19.02, - "grad_norm": 0.05700293555855751, - "learning_rate": 2.4817244611059045e-05, - "loss": 0.004, + "epoch": 15.01, + "grad_norm": 8.411886215209961, + "learning_rate": 4.348594141027529e-05, + "loss": 0.0611, "step": 6560 }, { - "epoch": 19.03, - "grad_norm": 5.577692985534668, - "learning_rate": 2.477038425492034e-05, - "loss": 0.1151, + "epoch": 15.01, + "grad_norm": 21.956642150878906, + "learning_rate": 4.3467540114824084e-05, + "loss": 0.1059, "step": 6570 }, { - "epoch": 19.03, - "grad_norm": 0.020554441958665848, - "learning_rate": 2.472352389878163e-05, - "loss": 0.0564, + "epoch": 15.01, + "grad_norm": 0.07254786044359207, + "learning_rate": 4.3449138819372884e-05, + "loss": 0.0876, "step": 6580 }, { - "epoch": 19.03, - "grad_norm": 0.04367370158433914, - "learning_rate": 2.4676663542642926e-05, - "loss": 0.1458, + "epoch": 15.01, + "grad_norm": 0.7494866847991943, + "learning_rate": 4.343073752392169e-05, + "loss": 0.1531, "step": 6590 }, { - "epoch": 19.03, - "grad_norm": 0.026535367593169212, - "learning_rate": 2.4629803186504218e-05, - "loss": 0.1011, - "step": 6600 - }, - { - "epoch": 19.03, - "eval_accuracy": 0.7851002865329513, - "eval_loss": 1.0893527269363403, - "eval_runtime": 33.9177, - "eval_samples_per_second": 20.579, - "eval_steps_per_second": 1.74, + "epoch": 15.01, + "grad_norm": 27.045339584350586, + "learning_rate": 4.3412336228470484e-05, + "loss": 0.2429, "step": 6600 }, { - "epoch": 20.0, - "grad_norm": 10.764158248901367, - "learning_rate": 2.4582942830365514e-05, - "loss": 0.159, + "epoch": 15.01, + "grad_norm": 0.29273873567581177, + "learning_rate": 4.3393934933019284e-05, + "loss": 0.2683, "step": 6610 }, { - "epoch": 20.0, - "grad_norm": 0.06043768674135208, - "learning_rate": 2.4536082474226803e-05, - "loss": 0.003, + "epoch": 15.01, + "grad_norm": 0.04690911993384361, + "learning_rate": 4.337553363756809e-05, + "loss": 0.1066, "step": 6620 }, { - "epoch": 20.0, - "grad_norm": 0.006598788313567638, - "learning_rate": 2.4489222118088098e-05, - "loss": 0.0761, + "epoch": 15.01, + "grad_norm": 0.5130358338356018, + "learning_rate": 4.3357132342116885e-05, + "loss": 0.111, "step": 6630 }, { - "epoch": 20.0, - "grad_norm": 0.02938709408044815, - "learning_rate": 2.444236176194939e-05, - "loss": 0.069, + "epoch": 15.01, + "grad_norm": 0.7703613042831421, + "learning_rate": 4.3338731046665685e-05, + "loss": 0.097, "step": 6640 }, { - "epoch": 20.0, - "grad_norm": 0.7408674955368042, - "learning_rate": 2.4395501405810686e-05, - "loss": 0.0754, + "epoch": 15.01, + "grad_norm": 25.8164005279541, + "learning_rate": 4.332032975121449e-05, + "loss": 0.2039, "step": 6650 }, { - "epoch": 20.01, - "grad_norm": 14.34585952758789, - "learning_rate": 2.434864104967198e-05, - "loss": 0.0301, + "epoch": 15.01, + "grad_norm": 18.577009201049805, + "learning_rate": 4.3301928455763285e-05, + "loss": 0.2498, "step": 6660 }, { - "epoch": 20.01, - "grad_norm": 0.12531670928001404, - "learning_rate": 2.430178069353327e-05, - "loss": 0.0021, + "epoch": 15.01, + "grad_norm": 17.129587173461914, + "learning_rate": 4.3283527160312085e-05, + "loss": 0.207, "step": 6670 }, { - "epoch": 20.01, - "grad_norm": 0.11880137771368027, - "learning_rate": 2.4254920337394567e-05, - "loss": 0.0008, + "epoch": 15.01, + "grad_norm": 22.360652923583984, + "learning_rate": 4.326512586486089e-05, + "loss": 0.1344, "step": 6680 }, { - "epoch": 20.01, - "grad_norm": 15.903475761413574, - "learning_rate": 2.420805998125586e-05, - "loss": 0.0479, + "epoch": 15.01, + "grad_norm": 0.045382168143987656, + "learning_rate": 4.324672456940969e-05, + "loss": 0.1224, "step": 6690 }, { - "epoch": 20.01, - "grad_norm": 29.952171325683594, - "learning_rate": 2.416119962511715e-05, - "loss": 0.1002, + "epoch": 15.01, + "grad_norm": 0.12095453590154648, + "learning_rate": 4.3228323273958486e-05, + "loss": 0.0751, "step": 6700 }, { - "epoch": 20.01, - "grad_norm": 0.30753955245018005, - "learning_rate": 2.4114339268978444e-05, - "loss": 0.0304, + "epoch": 15.01, + "grad_norm": 0.04379770904779434, + "learning_rate": 4.320992197850729e-05, + "loss": 0.2142, "step": 6710 }, { - "epoch": 20.01, - "grad_norm": 0.014064520597457886, - "learning_rate": 2.406747891283974e-05, - "loss": 0.1304, + "epoch": 15.01, + "grad_norm": 0.022613519802689552, + "learning_rate": 4.319152068305609e-05, + "loss": 0.3076, "step": 6720 }, { - "epoch": 20.01, - "grad_norm": 0.4187520146369934, - "learning_rate": 2.402061855670103e-05, - "loss": 0.1015, + "epoch": 15.01, + "eval_accuracy": 0.7207207207207207, + "eval_loss": 1.181077480316162, + "eval_runtime": 41.2544, + "eval_samples_per_second": 21.525, + "eval_steps_per_second": 1.794, + "step": 6720 + }, + { + "epoch": 16.0, + "grad_norm": 3.539224863052368, + "learning_rate": 4.3173119387604886e-05, + "loss": 0.0385, "step": 6730 }, { - "epoch": 20.01, - "grad_norm": 0.02327924221754074, - "learning_rate": 2.3973758200562327e-05, - "loss": 0.0053, + "epoch": 16.0, + "grad_norm": 3.006033182144165, + "learning_rate": 4.315471809215369e-05, + "loss": 0.0355, "step": 6740 }, { - "epoch": 20.01, - "grad_norm": 16.745342254638672, - "learning_rate": 2.392689784442362e-05, - "loss": 0.2538, + "epoch": 16.0, + "grad_norm": 0.09995649755001068, + "learning_rate": 4.313631679670249e-05, + "loss": 0.1871, "step": 6750 }, { - "epoch": 20.01, - "grad_norm": 0.24011173844337463, - "learning_rate": 2.3880037488284912e-05, - "loss": 0.0485, + "epoch": 16.0, + "grad_norm": 6.6211347579956055, + "learning_rate": 4.3117915501251286e-05, + "loss": 0.0773, "step": 6760 }, { - "epoch": 20.01, - "grad_norm": 29.788806915283203, - "learning_rate": 2.3833177132146208e-05, - "loss": 0.0324, + "epoch": 16.0, + "grad_norm": 1.1593960523605347, + "learning_rate": 4.309951420580009e-05, + "loss": 0.0803, "step": 6770 }, { - "epoch": 20.02, - "grad_norm": 0.7102867960929871, - "learning_rate": 2.37863167760075e-05, - "loss": 0.1429, + "epoch": 16.0, + "grad_norm": 5.994491100311279, + "learning_rate": 4.308111291034889e-05, + "loss": 0.0459, "step": 6780 }, { - "epoch": 20.02, - "grad_norm": 0.08376511186361313, - "learning_rate": 2.3739456419868792e-05, - "loss": 0.0972, + "epoch": 16.0, + "grad_norm": 11.333940505981445, + "learning_rate": 4.3062711614897687e-05, + "loss": 0.237, "step": 6790 }, { - "epoch": 20.02, - "grad_norm": 0.335637629032135, - "learning_rate": 2.3692596063730085e-05, - "loss": 0.062, + "epoch": 16.0, + "grad_norm": 9.39922046661377, + "learning_rate": 4.3044310319446493e-05, + "loss": 0.3263, "step": 6800 }, { - "epoch": 20.02, - "grad_norm": 28.6505069732666, - "learning_rate": 2.364573570759138e-05, - "loss": 0.2001, + "epoch": 16.0, + "grad_norm": 0.28990158438682556, + "learning_rate": 4.3025909023995294e-05, + "loss": 0.16, "step": 6810 }, { - "epoch": 20.02, - "grad_norm": 29.769094467163086, - "learning_rate": 2.3598875351452673e-05, - "loss": 0.1555, + "epoch": 16.0, + "grad_norm": 0.8307206034660339, + "learning_rate": 4.3007507728544094e-05, + "loss": 0.2574, "step": 6820 }, { - "epoch": 20.02, - "grad_norm": 52.66693878173828, - "learning_rate": 2.3552014995313965e-05, - "loss": 0.0873, + "epoch": 16.0, + "grad_norm": 0.03865053132176399, + "learning_rate": 4.2989106433092894e-05, + "loss": 0.0774, "step": 6830 }, { - "epoch": 20.02, - "grad_norm": 0.011765277944505215, - "learning_rate": 2.3505154639175257e-05, - "loss": 0.0721, + "epoch": 16.0, + "grad_norm": 0.4578423500061035, + "learning_rate": 4.2970705137641694e-05, + "loss": 0.0989, "step": 6840 }, { - "epoch": 20.02, - "grad_norm": 0.08442965894937515, - "learning_rate": 2.3458294283036553e-05, - "loss": 0.0497, + "epoch": 16.0, + "grad_norm": 0.41832372546195984, + "learning_rate": 4.2952303842190494e-05, + "loss": 0.0478, "step": 6850 }, { - "epoch": 20.02, - "grad_norm": 0.017878804355859756, - "learning_rate": 2.3411433926897845e-05, - "loss": 0.0389, + "epoch": 16.0, + "grad_norm": 10.938180923461914, + "learning_rate": 4.2933902546739294e-05, + "loss": 0.1905, "step": 6860 }, { - "epoch": 20.02, - "grad_norm": 0.017131801694631577, - "learning_rate": 2.3364573570759138e-05, - "loss": 0.1382, + "epoch": 16.0, + "grad_norm": 24.749025344848633, + "learning_rate": 4.2915501251288094e-05, + "loss": 0.1256, "step": 6870 }, { - "epoch": 20.02, - "grad_norm": 2.3948495388031006, - "learning_rate": 2.3317713214620433e-05, - "loss": 0.0425, + "epoch": 16.01, + "grad_norm": 19.042219161987305, + "learning_rate": 4.2897099955836894e-05, + "loss": 0.1258, "step": 6880 }, { - "epoch": 20.02, - "grad_norm": 9.163774490356445, - "learning_rate": 2.3270852858481726e-05, - "loss": 0.0738, + "epoch": 16.01, + "grad_norm": 0.6163063645362854, + "learning_rate": 4.287869866038569e-05, + "loss": 0.07, "step": 6890 }, { - "epoch": 20.03, - "grad_norm": 0.009526137262582779, - "learning_rate": 2.322399250234302e-05, - "loss": 0.1145, + "epoch": 16.01, + "grad_norm": 17.089820861816406, + "learning_rate": 4.2860297364934495e-05, + "loss": 0.1107, "step": 6900 }, { - "epoch": 20.03, - "grad_norm": 0.11400933563709259, - "learning_rate": 2.317713214620431e-05, - "loss": 0.0527, + "epoch": 16.01, + "grad_norm": 0.10851157456636429, + "learning_rate": 4.2841896069483295e-05, + "loss": 0.3045, "step": 6910 }, { - "epoch": 20.03, - "grad_norm": 1.8810824155807495, - "learning_rate": 2.3130271790065606e-05, - "loss": 0.0034, + "epoch": 16.01, + "grad_norm": 0.45974329113960266, + "learning_rate": 4.282349477403209e-05, + "loss": 0.1239, "step": 6920 }, { - "epoch": 20.03, - "grad_norm": 0.02618522383272648, - "learning_rate": 2.3083411433926898e-05, - "loss": 0.0424, - "step": 6930 - }, - { - "epoch": 20.03, - "eval_accuracy": 0.7822349570200573, - "eval_loss": 1.144364356994629, - "eval_runtime": 34.682, - "eval_samples_per_second": 20.126, - "eval_steps_per_second": 1.701, + "epoch": 16.01, + "grad_norm": 26.82357406616211, + "learning_rate": 4.2805093478580895e-05, + "loss": 0.1921, "step": 6930 }, { - "epoch": 21.0, - "grad_norm": 0.017015540972352028, - "learning_rate": 2.3036551077788194e-05, - "loss": 0.0144, + "epoch": 16.01, + "grad_norm": 10.198452949523926, + "learning_rate": 4.2786692183129695e-05, + "loss": 0.1364, "step": 6940 }, { - "epoch": 21.0, - "grad_norm": 1.5228683948516846, - "learning_rate": 2.2989690721649483e-05, - "loss": 0.049, + "epoch": 16.01, + "grad_norm": 29.6612491607666, + "learning_rate": 4.2768290887678495e-05, + "loss": 0.167, "step": 6950 }, { - "epoch": 21.0, - "grad_norm": 0.00678257504478097, - "learning_rate": 2.294283036551078e-05, - "loss": 0.0779, + "epoch": 16.01, + "grad_norm": 7.377419948577881, + "learning_rate": 4.2749889592227295e-05, + "loss": 0.0909, "step": 6960 }, { - "epoch": 21.0, - "grad_norm": 0.011534970253705978, - "learning_rate": 2.289597000937207e-05, - "loss": 0.1179, + "epoch": 16.01, + "grad_norm": 2.213519334793091, + "learning_rate": 4.2731488296776095e-05, + "loss": 0.0531, "step": 6970 }, { - "epoch": 21.0, - "grad_norm": 22.936405181884766, - "learning_rate": 2.2849109653233367e-05, - "loss": 0.0972, + "epoch": 16.01, + "grad_norm": 0.029641279950737953, + "learning_rate": 4.2713087001324896e-05, + "loss": 0.1466, "step": 6980 }, { - "epoch": 21.01, - "grad_norm": 0.06261157989501953, - "learning_rate": 2.280224929709466e-05, - "loss": 0.0625, + "epoch": 16.01, + "grad_norm": 1.8791080713272095, + "learning_rate": 4.2694685705873696e-05, + "loss": 0.0484, "step": 6990 }, { - "epoch": 21.01, - "grad_norm": 9.488935470581055, - "learning_rate": 2.275538894095595e-05, - "loss": 0.0421, + "epoch": 16.01, + "grad_norm": 0.746714174747467, + "learning_rate": 4.2676284410422496e-05, + "loss": 0.134, "step": 7000 }, { - "epoch": 21.01, - "grad_norm": 11.307842254638672, - "learning_rate": 2.2708528584817247e-05, - "loss": 0.043, + "epoch": 16.01, + "grad_norm": 11.966108322143555, + "learning_rate": 4.2657883114971296e-05, + "loss": 0.1856, "step": 7010 }, { - "epoch": 21.01, - "grad_norm": 0.03459528461098671, - "learning_rate": 2.266166822867854e-05, - "loss": 0.0242, + "epoch": 16.01, + "grad_norm": 0.21873310208320618, + "learning_rate": 4.2639481819520096e-05, + "loss": 0.1875, "step": 7020 }, { - "epoch": 21.01, - "grad_norm": 9.111845016479492, - "learning_rate": 2.261480787253983e-05, - "loss": 0.0573, + "epoch": 16.01, + "grad_norm": 8.706038475036621, + "learning_rate": 4.2621080524068896e-05, + "loss": 0.2378, "step": 7030 }, { - "epoch": 21.01, - "grad_norm": 0.01025596633553505, - "learning_rate": 2.2567947516401124e-05, - "loss": 0.0159, + "epoch": 16.01, + "grad_norm": 0.31858113408088684, + "learning_rate": 4.2602679228617696e-05, + "loss": 0.1298, "step": 7040 }, { - "epoch": 21.01, - "grad_norm": 2.14906907081604, - "learning_rate": 2.252108716026242e-05, - "loss": 0.0856, + "epoch": 16.01, + "grad_norm": 0.04492282494902611, + "learning_rate": 4.2584277933166496e-05, + "loss": 0.2082, "step": 7050 }, { - "epoch": 21.01, - "grad_norm": 11.889404296875, - "learning_rate": 2.2474226804123712e-05, - "loss": 0.0287, + "epoch": 16.01, + "grad_norm": 0.04001040756702423, + "learning_rate": 4.2565876637715296e-05, + "loss": 0.2121, "step": 7060 }, { - "epoch": 21.01, - "grad_norm": 0.046489182859659195, - "learning_rate": 2.2427366447985008e-05, - "loss": 0.0729, + "epoch": 16.01, + "grad_norm": 13.668713569641113, + "learning_rate": 4.2547475342264097e-05, + "loss": 0.1399, "step": 7070 }, { - "epoch": 21.01, - "grad_norm": 0.009697903878986835, - "learning_rate": 2.23805060918463e-05, - "loss": 0.0428, + "epoch": 16.01, + "grad_norm": 10.486247062683105, + "learning_rate": 4.25290740468129e-05, + "loss": 0.1461, "step": 7080 }, { - "epoch": 21.01, - "grad_norm": 0.03072419762611389, - "learning_rate": 2.2333645735707592e-05, - "loss": 0.0601, + "epoch": 16.01, + "grad_norm": 11.53575611114502, + "learning_rate": 4.25106727513617e-05, + "loss": 0.2366, "step": 7090 }, { - "epoch": 21.01, - "grad_norm": 10.731389045715332, - "learning_rate": 2.2286785379568888e-05, - "loss": 0.0125, + "epoch": 16.01, + "grad_norm": 23.928285598754883, + "learning_rate": 4.24922714559105e-05, + "loss": 0.1941, "step": 7100 }, { - "epoch": 21.02, - "grad_norm": 0.04367499053478241, - "learning_rate": 2.223992502343018e-05, - "loss": 0.0223, + "epoch": 16.01, + "grad_norm": 3.083263874053955, + "learning_rate": 4.24738701604593e-05, + "loss": 0.3031, "step": 7110 }, { - "epoch": 21.02, - "grad_norm": 0.013104673475027084, - "learning_rate": 2.2193064667291473e-05, - "loss": 0.1196, + "epoch": 16.01, + "grad_norm": 2.5412213802337646, + "learning_rate": 4.24554688650081e-05, + "loss": 0.1671, "step": 7120 }, { - "epoch": 21.02, - "grad_norm": 0.5789304375648499, - "learning_rate": 2.2146204311152765e-05, - "loss": 0.0593, + "epoch": 16.01, + "grad_norm": 17.73686981201172, + "learning_rate": 4.24370675695569e-05, + "loss": 0.1783, "step": 7130 }, { - "epoch": 21.02, - "grad_norm": 0.005018654279410839, - "learning_rate": 2.209934395501406e-05, - "loss": 0.0508, + "epoch": 16.01, + "grad_norm": 0.07319161295890808, + "learning_rate": 4.24186662741057e-05, + "loss": 0.2927, + "step": 7140 + }, + { + "epoch": 16.01, + "eval_accuracy": 0.7218468468468469, + "eval_loss": 1.0953478813171387, + "eval_runtime": 40.8306, + "eval_samples_per_second": 21.748, + "eval_steps_per_second": 1.812, "step": 7140 }, { - "epoch": 21.02, - "grad_norm": 2.954718828201294, - "learning_rate": 2.2052483598875353e-05, - "loss": 0.0327, + "epoch": 17.0, + "grad_norm": 0.11046060919761658, + "learning_rate": 4.24002649786545e-05, + "loss": 0.093, "step": 7150 }, { - "epoch": 21.02, - "grad_norm": 9.058338165283203, - "learning_rate": 2.2005623242736645e-05, - "loss": 0.0686, + "epoch": 17.0, + "grad_norm": 2.020051956176758, + "learning_rate": 4.23818636832033e-05, + "loss": 0.12, "step": 7160 }, { - "epoch": 21.02, - "grad_norm": 0.009537008590996265, - "learning_rate": 2.1958762886597937e-05, - "loss": 0.0273, + "epoch": 17.0, + "grad_norm": 12.6734037399292, + "learning_rate": 4.23634623877521e-05, + "loss": 0.0306, "step": 7170 }, { - "epoch": 21.02, - "grad_norm": 0.009294161573052406, - "learning_rate": 2.1911902530459233e-05, - "loss": 0.0853, + "epoch": 17.0, + "grad_norm": 3.399958610534668, + "learning_rate": 4.23450610923009e-05, + "loss": 0.1348, "step": 7180 }, { - "epoch": 21.02, - "grad_norm": 0.31812822818756104, - "learning_rate": 2.1865042174320525e-05, - "loss": 0.0038, + "epoch": 17.0, + "grad_norm": 4.962296962738037, + "learning_rate": 4.23266597968497e-05, + "loss": 0.154, "step": 7190 }, { - "epoch": 21.02, - "grad_norm": 21.410701751708984, - "learning_rate": 2.1818181818181818e-05, - "loss": 0.053, + "epoch": 17.0, + "grad_norm": 21.432941436767578, + "learning_rate": 4.23082585013985e-05, + "loss": 0.1807, "step": 7200 }, { - "epoch": 21.02, - "grad_norm": 0.6843286752700806, - "learning_rate": 2.1771321462043114e-05, - "loss": 0.0013, + "epoch": 17.0, + "grad_norm": 2.730048179626465, + "learning_rate": 4.2289857205947305e-05, + "loss": 0.0698, "step": 7210 }, { - "epoch": 21.02, - "grad_norm": 0.013939457014203072, - "learning_rate": 2.1724461105904406e-05, - "loss": 0.0543, + "epoch": 17.0, + "grad_norm": 5.443423271179199, + "learning_rate": 4.22714559104961e-05, + "loss": 0.1003, "step": 7220 }, { - "epoch": 21.03, - "grad_norm": 0.0053044771775603294, - "learning_rate": 2.16776007497657e-05, - "loss": 0.0194, + "epoch": 17.0, + "grad_norm": 35.500003814697266, + "learning_rate": 4.22530546150449e-05, + "loss": 0.189, "step": 7230 }, { - "epoch": 21.03, - "grad_norm": 73.98345947265625, - "learning_rate": 2.163074039362699e-05, - "loss": 0.0579, + "epoch": 17.0, + "grad_norm": 0.07223138213157654, + "learning_rate": 4.2234653319593705e-05, + "loss": 0.3029, "step": 7240 }, { - "epoch": 21.03, - "grad_norm": 0.03918803855776787, - "learning_rate": 2.1583880037488286e-05, - "loss": 0.0161, + "epoch": 17.0, + "grad_norm": 0.2147696614265442, + "learning_rate": 4.22162520241425e-05, + "loss": 0.031, "step": 7250 }, { - "epoch": 21.03, - "grad_norm": 0.0033793123438954353, - "learning_rate": 2.153701968134958e-05, - "loss": 0.0229, - "step": 7260 - }, - { - "epoch": 21.03, - "eval_accuracy": 0.7822349570200573, - "eval_loss": 1.3766086101531982, - "eval_runtime": 34.2388, - "eval_samples_per_second": 20.386, - "eval_steps_per_second": 1.723, + "epoch": 17.0, + "grad_norm": 9.870826721191406, + "learning_rate": 4.21978507286913e-05, + "loss": 0.2191, "step": 7260 }, { - "epoch": 22.0, - "grad_norm": 0.02534145675599575, - "learning_rate": 2.1490159325210874e-05, - "loss": 0.0177, + "epoch": 17.0, + "grad_norm": 16.39332389831543, + "learning_rate": 4.2179449433240106e-05, + "loss": 0.1711, "step": 7270 }, { - "epoch": 22.0, - "grad_norm": 27.288881301879883, - "learning_rate": 2.1443298969072163e-05, - "loss": 0.0594, + "epoch": 17.0, + "grad_norm": 2.216157913208008, + "learning_rate": 4.21610481377889e-05, + "loss": 0.0723, "step": 7280 }, { - "epoch": 22.0, - "grad_norm": 3.099937677383423, - "learning_rate": 2.139643861293346e-05, - "loss": 0.07, + "epoch": 17.0, + "grad_norm": 4.93229866027832, + "learning_rate": 4.21426468423377e-05, + "loss": 0.0542, "step": 7290 }, { - "epoch": 22.0, - "grad_norm": 0.0036865780130028725, - "learning_rate": 2.134957825679475e-05, - "loss": 0.006, + "epoch": 17.01, + "grad_norm": 5.814107894897461, + "learning_rate": 4.2124245546886506e-05, + "loss": 0.0757, "step": 7300 }, { - "epoch": 22.0, - "grad_norm": 59.88230514526367, - "learning_rate": 2.1302717900656047e-05, - "loss": 0.1675, + "epoch": 17.01, + "grad_norm": 29.41144561767578, + "learning_rate": 4.2105844251435306e-05, + "loss": 0.1158, "step": 7310 }, { - "epoch": 22.01, - "grad_norm": 0.12187007069587708, - "learning_rate": 2.125585754451734e-05, - "loss": 0.0348, + "epoch": 17.01, + "grad_norm": 40.187164306640625, + "learning_rate": 4.20874429559841e-05, + "loss": 0.2319, "step": 7320 }, { - "epoch": 22.01, - "grad_norm": 0.011614816263318062, - "learning_rate": 2.120899718837863e-05, - "loss": 0.0227, + "epoch": 17.01, + "grad_norm": 0.42633482813835144, + "learning_rate": 4.2069041660532906e-05, + "loss": 0.078, "step": 7330 }, { - "epoch": 22.01, - "grad_norm": 0.013845368288457394, - "learning_rate": 2.1162136832239927e-05, - "loss": 0.0653, + "epoch": 17.01, + "grad_norm": 14.942743301391602, + "learning_rate": 4.2050640365081706e-05, + "loss": 0.0495, "step": 7340 }, { - "epoch": 22.01, - "grad_norm": 0.009227645583450794, - "learning_rate": 2.111527647610122e-05, - "loss": 0.141, + "epoch": 17.01, + "grad_norm": 6.133449554443359, + "learning_rate": 4.20322390696305e-05, + "loss": 0.265, "step": 7350 }, { - "epoch": 22.01, - "grad_norm": 0.5920379161834717, - "learning_rate": 2.1068416119962515e-05, - "loss": 0.0401, + "epoch": 17.01, + "grad_norm": 0.2768873870372772, + "learning_rate": 4.201383777417931e-05, + "loss": 0.1142, "step": 7360 }, { - "epoch": 22.01, - "grad_norm": 0.006525806616991758, - "learning_rate": 2.1021555763823804e-05, - "loss": 0.0885, + "epoch": 17.01, + "grad_norm": 33.952964782714844, + "learning_rate": 4.199543647872811e-05, + "loss": 0.1683, "step": 7370 }, { - "epoch": 22.01, - "grad_norm": 0.006711127702146769, - "learning_rate": 2.09746954076851e-05, - "loss": 0.0618, + "epoch": 17.01, + "grad_norm": 11.31227970123291, + "learning_rate": 4.19770351832769e-05, + "loss": 0.024, "step": 7380 }, { - "epoch": 22.01, - "grad_norm": 0.004596900660544634, - "learning_rate": 2.0927835051546392e-05, - "loss": 0.0039, + "epoch": 17.01, + "grad_norm": 7.327922821044922, + "learning_rate": 4.195863388782571e-05, + "loss": 0.045, "step": 7390 }, { - "epoch": 22.01, - "grad_norm": 0.22587594389915466, - "learning_rate": 2.0880974695407688e-05, - "loss": 0.0897, + "epoch": 17.01, + "grad_norm": 26.489227294921875, + "learning_rate": 4.194023259237451e-05, + "loss": 0.2488, "step": 7400 }, { - "epoch": 22.01, - "grad_norm": 0.007293607573956251, - "learning_rate": 2.083411433926898e-05, - "loss": 0.1129, + "epoch": 17.01, + "grad_norm": 0.04616143926978111, + "learning_rate": 4.19218312969233e-05, + "loss": 0.2151, "step": 7410 }, { - "epoch": 22.01, - "grad_norm": 0.02033209055662155, - "learning_rate": 2.0787253983130272e-05, - "loss": 0.0583, + "epoch": 17.01, + "grad_norm": 0.2145106941461563, + "learning_rate": 4.190343000147211e-05, + "loss": 0.0606, "step": 7420 }, { - "epoch": 22.01, - "grad_norm": 0.006020266562700272, - "learning_rate": 2.0740393626991568e-05, - "loss": 0.0519, + "epoch": 17.01, + "grad_norm": 5.961447238922119, + "learning_rate": 4.188502870602091e-05, + "loss": 0.1122, "step": 7430 }, { - "epoch": 22.02, - "grad_norm": 0.050974782556295395, - "learning_rate": 2.069353327085286e-05, - "loss": 0.0972, + "epoch": 17.01, + "grad_norm": 10.214862823486328, + "learning_rate": 4.186662741056971e-05, + "loss": 0.2077, "step": 7440 }, { - "epoch": 22.02, - "grad_norm": 15.67349624633789, - "learning_rate": 2.0646672914714153e-05, - "loss": 0.0047, + "epoch": 17.01, + "grad_norm": 0.2347540557384491, + "learning_rate": 4.184822611511851e-05, + "loss": 0.1062, "step": 7450 }, { - "epoch": 22.02, - "grad_norm": 54.079017639160156, - "learning_rate": 2.0599812558575445e-05, - "loss": 0.0765, + "epoch": 17.01, + "grad_norm": 18.635656356811523, + "learning_rate": 4.182982481966731e-05, + "loss": 0.1868, "step": 7460 }, { - "epoch": 22.02, - "grad_norm": 0.28876176476478577, - "learning_rate": 2.055295220243674e-05, - "loss": 0.0463, + "epoch": 17.01, + "grad_norm": 22.011613845825195, + "learning_rate": 4.181142352421611e-05, + "loss": 0.1565, "step": 7470 }, { - "epoch": 22.02, - "grad_norm": 0.01849912479519844, - "learning_rate": 2.0506091846298033e-05, - "loss": 0.0064, + "epoch": 17.01, + "grad_norm": 14.826783180236816, + "learning_rate": 4.179302222876491e-05, + "loss": 0.1408, "step": 7480 }, { - "epoch": 22.02, - "grad_norm": 0.014657631516456604, - "learning_rate": 2.0459231490159325e-05, - "loss": 0.0517, + "epoch": 17.01, + "grad_norm": 11.025404930114746, + "learning_rate": 4.177462093331371e-05, + "loss": 0.2478, "step": 7490 }, { - "epoch": 22.02, - "grad_norm": 0.023860439658164978, - "learning_rate": 2.0412371134020618e-05, - "loss": 0.0165, + "epoch": 17.01, + "grad_norm": 20.00482177734375, + "learning_rate": 4.175621963786251e-05, + "loss": 0.1929, "step": 7500 }, { - "epoch": 22.02, - "grad_norm": 0.274640828371048, - "learning_rate": 2.0365510777881913e-05, - "loss": 0.0707, + "epoch": 17.01, + "grad_norm": 0.18002435564994812, + "learning_rate": 4.173781834241131e-05, + "loss": 0.1091, "step": 7510 }, { - "epoch": 22.02, - "grad_norm": 8.938105583190918, - "learning_rate": 2.0318650421743206e-05, - "loss": 0.0653, + "epoch": 17.01, + "grad_norm": 17.47130584716797, + "learning_rate": 4.171941704696011e-05, + "loss": 0.2577, "step": 7520 }, { - "epoch": 22.02, - "grad_norm": 0.023853203281760216, - "learning_rate": 2.0271790065604498e-05, - "loss": 0.1298, + "epoch": 17.01, + "grad_norm": 22.80291175842285, + "learning_rate": 4.170101575150891e-05, + "loss": 0.1132, "step": 7530 }, { - "epoch": 22.02, - "grad_norm": 0.01518749725073576, - "learning_rate": 2.0224929709465794e-05, - "loss": 0.054, + "epoch": 17.01, + "grad_norm": 0.3076806366443634, + "learning_rate": 4.168261445605771e-05, + "loss": 0.1184, "step": 7540 }, { - "epoch": 22.02, - "grad_norm": 0.2764262855052948, - "learning_rate": 2.0178069353327086e-05, - "loss": 0.1653, + "epoch": 17.01, + "grad_norm": 48.81949234008789, + "learning_rate": 4.166421316060651e-05, + "loss": 0.2314, "step": 7550 }, { - "epoch": 22.03, - "grad_norm": 0.01938408799469471, - "learning_rate": 2.0131208997188382e-05, - "loss": 0.0305, + "epoch": 17.01, + "grad_norm": 99.59304809570312, + "learning_rate": 4.164581186515531e-05, + "loss": 0.1679, "step": 7560 }, { - "epoch": 22.03, - "grad_norm": 0.06406080722808838, - "learning_rate": 2.008434864104967e-05, - "loss": 0.0038, - "step": 7570 + "epoch": 17.01, + "eval_accuracy": 0.7207207207207207, + "eval_loss": 1.2947888374328613, + "eval_runtime": 40.3013, + "eval_samples_per_second": 22.034, + "eval_steps_per_second": 1.836, + "step": 7560 }, { - "epoch": 22.03, - "grad_norm": 0.08752384036779404, - "learning_rate": 2.0037488284910966e-05, - "loss": 0.0272, - "step": 7580 + "epoch": 18.0, + "grad_norm": 35.903743743896484, + "learning_rate": 4.162741056970411e-05, + "loss": 0.1419, + "step": 7570 }, { - "epoch": 22.03, - "grad_norm": 0.00483914278447628, - "learning_rate": 1.999062792877226e-05, - "loss": 0.058, - "step": 7590 + "epoch": 18.0, + "grad_norm": 0.05569310858845711, + "learning_rate": 4.160900927425291e-05, + "loss": 0.0823, + "step": 7580 }, { - "epoch": 22.03, - "eval_accuracy": 0.7893982808022922, - "eval_loss": 1.279589056968689, - "eval_runtime": 34.2577, - "eval_samples_per_second": 20.375, - "eval_steps_per_second": 1.722, + "epoch": 18.0, + "grad_norm": 0.4801734685897827, + "learning_rate": 4.159060797880171e-05, + "loss": 0.2445, "step": 7590 }, { - "epoch": 23.0, - "grad_norm": 1.573994755744934, - "learning_rate": 1.9943767572633554e-05, - "loss": 0.0124, + "epoch": 18.0, + "grad_norm": 0.12532910704612732, + "learning_rate": 4.157220668335051e-05, + "loss": 0.103, "step": 7600 }, { - "epoch": 23.0, - "grad_norm": 0.014693894423544407, - "learning_rate": 1.9896907216494843e-05, - "loss": 0.0532, + "epoch": 18.0, + "grad_norm": 0.5598722100257874, + "learning_rate": 4.155380538789931e-05, + "loss": 0.0963, "step": 7610 }, { - "epoch": 23.0, - "grad_norm": 0.06175214797258377, - "learning_rate": 1.985004686035614e-05, - "loss": 0.0272, + "epoch": 18.0, + "grad_norm": 0.05128243565559387, + "learning_rate": 4.153540409244811e-05, + "loss": 0.1159, "step": 7620 }, { - "epoch": 23.0, - "grad_norm": 0.1148892492055893, - "learning_rate": 1.980318650421743e-05, - "loss": 0.0026, + "epoch": 18.0, + "grad_norm": 0.0423787459731102, + "learning_rate": 4.151700279699691e-05, + "loss": 0.1454, "step": 7630 }, { - "epoch": 23.0, - "grad_norm": 0.019487930461764336, - "learning_rate": 1.9756326148078727e-05, - "loss": 0.0537, + "epoch": 18.0, + "grad_norm": 29.96397590637207, + "learning_rate": 4.149860150154571e-05, + "loss": 0.1761, "step": 7640 }, { - "epoch": 23.01, - "grad_norm": 34.296722412109375, - "learning_rate": 1.970946579194002e-05, - "loss": 0.0903, + "epoch": 18.0, + "grad_norm": 10.358722686767578, + "learning_rate": 4.148020020609451e-05, + "loss": 0.1922, "step": 7650 }, { - "epoch": 23.01, - "grad_norm": 0.38383758068084717, - "learning_rate": 1.9662605435801312e-05, - "loss": 0.0016, + "epoch": 18.0, + "grad_norm": 4.160580158233643, + "learning_rate": 4.146179891064331e-05, + "loss": 0.0424, "step": 7660 }, { - "epoch": 23.01, - "grad_norm": 7.190293788909912, - "learning_rate": 1.9615745079662607e-05, - "loss": 0.1071, + "epoch": 18.0, + "grad_norm": 6.635777950286865, + "learning_rate": 4.144339761519211e-05, + "loss": 0.1561, "step": 7670 }, { - "epoch": 23.01, - "grad_norm": 0.01017380878329277, - "learning_rate": 1.95688847235239e-05, - "loss": 0.0267, + "epoch": 18.0, + "grad_norm": 10.608802795410156, + "learning_rate": 4.142499631974091e-05, + "loss": 0.0766, "step": 7680 }, { - "epoch": 23.01, - "grad_norm": 18.401382446289062, - "learning_rate": 1.9522024367385195e-05, - "loss": 0.0738, + "epoch": 18.0, + "grad_norm": 26.325162887573242, + "learning_rate": 4.140659502428971e-05, + "loss": 0.12, "step": 7690 }, { - "epoch": 23.01, - "grad_norm": 0.0242378581315279, - "learning_rate": 1.9475164011246484e-05, - "loss": 0.1412, + "epoch": 18.0, + "grad_norm": 22.195999145507812, + "learning_rate": 4.138819372883852e-05, + "loss": 0.1439, "step": 7700 }, { - "epoch": 23.01, - "grad_norm": 0.023785755038261414, - "learning_rate": 1.942830365510778e-05, - "loss": 0.0955, + "epoch": 18.0, + "grad_norm": 15.824335098266602, + "learning_rate": 4.136979243338731e-05, + "loss": 0.1322, "step": 7710 }, { - "epoch": 23.01, - "grad_norm": 0.02424936555325985, - "learning_rate": 1.9381443298969072e-05, - "loss": 0.0043, + "epoch": 18.01, + "grad_norm": 0.14473718404769897, + "learning_rate": 4.135139113793611e-05, + "loss": 0.0479, "step": 7720 }, { - "epoch": 23.01, - "grad_norm": 0.38176319003105164, - "learning_rate": 1.9334582942830368e-05, - "loss": 0.0326, + "epoch": 18.01, + "grad_norm": 0.10849491506814957, + "learning_rate": 4.133298984248492e-05, + "loss": 0.0638, "step": 7730 }, { - "epoch": 23.01, - "grad_norm": 0.011367129161953926, - "learning_rate": 1.928772258669166e-05, - "loss": 0.0404, + "epoch": 18.01, + "grad_norm": 43.89323806762695, + "learning_rate": 4.131458854703371e-05, + "loss": 0.1261, "step": 7740 }, { - "epoch": 23.01, - "grad_norm": 0.5925320386886597, - "learning_rate": 1.9240862230552953e-05, - "loss": 0.0306, + "epoch": 18.01, + "grad_norm": 35.16301345825195, + "learning_rate": 4.129618725158251e-05, + "loss": 0.0186, "step": 7750 }, { - "epoch": 23.01, - "grad_norm": 0.07246335595846176, - "learning_rate": 1.919400187441425e-05, - "loss": 0.069, + "epoch": 18.01, + "grad_norm": 0.023130550980567932, + "learning_rate": 4.127778595613132e-05, + "loss": 0.1184, "step": 7760 }, { - "epoch": 23.02, - "grad_norm": 0.29946988821029663, - "learning_rate": 1.914714151827554e-05, - "loss": 0.0996, + "epoch": 18.01, + "grad_norm": 14.438142776489258, + "learning_rate": 4.125938466068011e-05, + "loss": 0.242, "step": 7770 }, { - "epoch": 23.02, - "grad_norm": 0.035218510776758194, - "learning_rate": 1.9100281162136833e-05, - "loss": 0.0348, + "epoch": 18.01, + "grad_norm": 16.877521514892578, + "learning_rate": 4.124098336522891e-05, + "loss": 0.047, "step": 7780 }, { - "epoch": 23.02, - "grad_norm": 0.035237617790699005, - "learning_rate": 1.9053420805998125e-05, - "loss": 0.0059, + "epoch": 18.01, + "grad_norm": 17.4776668548584, + "learning_rate": 4.122258206977772e-05, + "loss": 0.1285, "step": 7790 }, { - "epoch": 23.02, - "grad_norm": 28.824886322021484, - "learning_rate": 1.900656044985942e-05, - "loss": 0.1736, + "epoch": 18.01, + "grad_norm": 0.09450684487819672, + "learning_rate": 4.120418077432651e-05, + "loss": 0.022, "step": 7800 }, { - "epoch": 23.02, - "grad_norm": 0.12462026625871658, - "learning_rate": 1.8959700093720713e-05, - "loss": 0.003, + "epoch": 18.01, + "grad_norm": 1.1784636974334717, + "learning_rate": 4.118577947887531e-05, + "loss": 0.1058, "step": 7810 }, { - "epoch": 23.02, - "grad_norm": 0.6549420952796936, - "learning_rate": 1.8912839737582006e-05, - "loss": 0.0087, + "epoch": 18.01, + "grad_norm": 0.1115993857383728, + "learning_rate": 4.116737818342412e-05, + "loss": 0.1197, "step": 7820 }, { - "epoch": 23.02, - "grad_norm": 0.05357836186885834, - "learning_rate": 1.8865979381443298e-05, - "loss": 0.0214, + "epoch": 18.01, + "grad_norm": 0.26653343439102173, + "learning_rate": 4.114897688797292e-05, + "loss": 0.1181, "step": 7830 }, { - "epoch": 23.02, - "grad_norm": 0.006000145338475704, - "learning_rate": 1.8819119025304594e-05, - "loss": 0.0458, + "epoch": 18.01, + "grad_norm": 10.61689567565918, + "learning_rate": 4.113057559252171e-05, + "loss": 0.1226, "step": 7840 }, { - "epoch": 23.02, - "grad_norm": 0.006179989781230688, - "learning_rate": 1.8772258669165886e-05, - "loss": 0.1173, + "epoch": 18.01, + "grad_norm": 1.0771710872650146, + "learning_rate": 4.111217429707052e-05, + "loss": 0.0459, "step": 7850 }, { - "epoch": 23.02, - "grad_norm": 0.336616575717926, - "learning_rate": 1.872539831302718e-05, - "loss": 0.1676, + "epoch": 18.01, + "grad_norm": 2.459402561187744, + "learning_rate": 4.109377300161932e-05, + "loss": 0.0604, "step": 7860 }, { - "epoch": 23.02, - "grad_norm": 0.0050712330266833305, - "learning_rate": 1.8678537956888474e-05, - "loss": 0.0059, + "epoch": 18.01, + "grad_norm": 0.12830379605293274, + "learning_rate": 4.107537170616811e-05, + "loss": 0.3332, "step": 7870 }, { - "epoch": 23.02, - "grad_norm": 0.005236570257693529, - "learning_rate": 1.8631677600749766e-05, - "loss": 0.1685, + "epoch": 18.01, + "grad_norm": 20.693988800048828, + "learning_rate": 4.105697041071692e-05, + "loss": 0.162, "step": 7880 }, { - "epoch": 23.03, - "grad_norm": 0.06752946227788925, - "learning_rate": 1.8584817244611062e-05, - "loss": 0.0035, + "epoch": 18.01, + "grad_norm": 27.03666114807129, + "learning_rate": 4.103856911526572e-05, + "loss": 0.2202, "step": 7890 }, { - "epoch": 23.03, - "grad_norm": 0.4543127119541168, - "learning_rate": 1.853795688847235e-05, - "loss": 0.1258, + "epoch": 18.01, + "grad_norm": 0.8211888670921326, + "learning_rate": 4.102016781981451e-05, + "loss": 0.0964, "step": 7900 }, { - "epoch": 23.03, - "grad_norm": 0.022716745734214783, - "learning_rate": 1.8491096532333647e-05, - "loss": 0.1408, + "epoch": 18.01, + "grad_norm": 40.739036560058594, + "learning_rate": 4.100176652436331e-05, + "loss": 0.1921, "step": 7910 }, { - "epoch": 23.03, - "grad_norm": 0.02292322926223278, - "learning_rate": 1.844423617619494e-05, - "loss": 0.1045, - "step": 7920 - }, - { - "epoch": 23.03, - "eval_accuracy": 0.7750716332378224, - "eval_loss": 1.3584957122802734, - "eval_runtime": 34.3543, - "eval_samples_per_second": 20.318, - "eval_steps_per_second": 1.717, + "epoch": 18.01, + "grad_norm": 7.381683349609375, + "learning_rate": 4.098336522891212e-05, + "loss": 0.3249, "step": 7920 }, { - "epoch": 24.0, - "grad_norm": 0.01888859085738659, - "learning_rate": 1.8397375820056235e-05, - "loss": 0.0033, + "epoch": 18.01, + "grad_norm": 6.641468524932861, + "learning_rate": 4.096496393346092e-05, + "loss": 0.0733, "step": 7930 }, { - "epoch": 24.0, - "grad_norm": 2.7452802658081055, - "learning_rate": 1.8350515463917524e-05, - "loss": 0.0672, + "epoch": 18.01, + "grad_norm": 23.581151962280273, + "learning_rate": 4.0946562638009713e-05, + "loss": 0.1884, "step": 7940 }, { - "epoch": 24.0, - "grad_norm": 17.95488166809082, - "learning_rate": 1.830365510777882e-05, - "loss": 0.0579, + "epoch": 18.01, + "grad_norm": 25.609771728515625, + "learning_rate": 4.092816134255852e-05, + "loss": 0.2263, "step": 7950 }, { - "epoch": 24.0, - "grad_norm": 28.065513610839844, - "learning_rate": 1.825679475164011e-05, - "loss": 0.0383, + "epoch": 18.01, + "grad_norm": 20.769895553588867, + "learning_rate": 4.090976004710732e-05, + "loss": 0.1547, "step": 7960 }, { - "epoch": 24.0, - "grad_norm": 0.6805478930473328, - "learning_rate": 1.8209934395501407e-05, - "loss": 0.0037, + "epoch": 18.01, + "grad_norm": 0.11588778346776962, + "learning_rate": 4.0891358751656114e-05, + "loss": 0.0908, "step": 7970 }, { - "epoch": 24.01, - "grad_norm": 0.018596457317471504, - "learning_rate": 1.81630740393627e-05, - "loss": 0.0333, + "epoch": 18.01, + "grad_norm": 0.04875311255455017, + "learning_rate": 4.087295745620492e-05, + "loss": 0.1523, "step": 7980 }, { - "epoch": 24.01, - "grad_norm": 0.8254786133766174, - "learning_rate": 1.8116213683223992e-05, - "loss": 0.0006, + "epoch": 18.01, + "eval_accuracy": 0.7015765765765766, + "eval_loss": 1.3631926774978638, + "eval_runtime": 40.2124, + "eval_samples_per_second": 22.083, + "eval_steps_per_second": 1.84, + "step": 7980 + }, + { + "epoch": 19.0, + "grad_norm": 21.000314712524414, + "learning_rate": 4.085455616075372e-05, + "loss": 0.1682, "step": 7990 }, { - "epoch": 24.01, - "grad_norm": 0.014021596871316433, - "learning_rate": 1.8069353327085288e-05, - "loss": 0.0004, + "epoch": 19.0, + "grad_norm": 1.4022904634475708, + "learning_rate": 4.0836154865302514e-05, + "loss": 0.1087, "step": 8000 }, { - "epoch": 24.01, - "grad_norm": 0.009306724183261395, - "learning_rate": 1.802249297094658e-05, - "loss": 0.0018, + "epoch": 19.0, + "grad_norm": 0.13801883161067963, + "learning_rate": 4.081775356985132e-05, + "loss": 0.1159, "step": 8010 }, { - "epoch": 24.01, - "grad_norm": 0.005670236889272928, - "learning_rate": 1.7975632614807876e-05, - "loss": 0.1225, + "epoch": 19.0, + "grad_norm": 16.723587036132812, + "learning_rate": 4.079935227440012e-05, + "loss": 0.1965, "step": 8020 }, { - "epoch": 24.01, - "grad_norm": 0.01667461171746254, - "learning_rate": 1.7928772258669165e-05, - "loss": 0.0701, + "epoch": 19.0, + "grad_norm": 0.8381214737892151, + "learning_rate": 4.0780950978948914e-05, + "loss": 0.0861, "step": 8030 }, { - "epoch": 24.01, - "grad_norm": 0.0773952454328537, - "learning_rate": 1.788191190253046e-05, - "loss": 0.0048, + "epoch": 19.0, + "grad_norm": 0.10108273476362228, + "learning_rate": 4.076254968349772e-05, + "loss": 0.046, "step": 8040 }, { - "epoch": 24.01, - "grad_norm": 57.535404205322266, - "learning_rate": 1.7835051546391753e-05, - "loss": 0.027, + "epoch": 19.0, + "grad_norm": 8.169659614562988, + "learning_rate": 4.074414838804652e-05, + "loss": 0.1479, "step": 8050 }, { - "epoch": 24.01, - "grad_norm": 0.01095606479793787, - "learning_rate": 1.778819119025305e-05, - "loss": 0.0294, + "epoch": 19.0, + "grad_norm": 0.968652069568634, + "learning_rate": 4.072574709259532e-05, + "loss": 0.0547, "step": 8060 }, { - "epoch": 24.01, - "grad_norm": 0.006405522581189871, - "learning_rate": 1.774133083411434e-05, - "loss": 0.0722, + "epoch": 19.0, + "grad_norm": 29.368057250976562, + "learning_rate": 4.070734579714412e-05, + "loss": 0.147, "step": 8070 }, { - "epoch": 24.01, - "grad_norm": 0.003576676594093442, - "learning_rate": 1.7694470477975633e-05, - "loss": 0.0024, + "epoch": 19.0, + "grad_norm": 39.67902755737305, + "learning_rate": 4.068894450169292e-05, + "loss": 0.3396, "step": 8080 }, { - "epoch": 24.01, - "grad_norm": 43.07078552246094, - "learning_rate": 1.764761012183693e-05, - "loss": 0.0385, + "epoch": 19.0, + "grad_norm": 0.5779815912246704, + "learning_rate": 4.067054320624172e-05, + "loss": 0.2295, "step": 8090 }, { - "epoch": 24.02, - "grad_norm": 47.99566650390625, - "learning_rate": 1.760074976569822e-05, - "loss": 0.1164, + "epoch": 19.0, + "grad_norm": 0.8270196318626404, + "learning_rate": 4.065214191079052e-05, + "loss": 0.0877, "step": 8100 }, { - "epoch": 24.02, - "grad_norm": 0.029569357633590698, - "learning_rate": 1.7553889409559513e-05, - "loss": 0.0441, + "epoch": 19.0, + "grad_norm": 12.409184455871582, + "learning_rate": 4.063374061533932e-05, + "loss": 0.1544, "step": 8110 }, { - "epoch": 24.02, - "grad_norm": 0.020374421030282974, - "learning_rate": 1.7507029053420806e-05, - "loss": 0.0203, + "epoch": 19.0, + "grad_norm": 11.662062644958496, + "learning_rate": 4.061533931988812e-05, + "loss": 0.0856, "step": 8120 }, { - "epoch": 24.02, - "grad_norm": 0.2725122570991516, - "learning_rate": 1.74601686972821e-05, - "loss": 0.0015, + "epoch": 19.0, + "grad_norm": 11.891218185424805, + "learning_rate": 4.059693802443692e-05, + "loss": 0.0513, "step": 8130 }, { - "epoch": 24.02, - "grad_norm": 25.096572875976562, - "learning_rate": 1.7413308341143394e-05, - "loss": 0.0571, + "epoch": 19.01, + "grad_norm": 4.517670631408691, + "learning_rate": 4.057853672898572e-05, + "loss": 0.1357, "step": 8140 }, { - "epoch": 24.02, - "grad_norm": 0.2794142961502075, - "learning_rate": 1.7366447985004686e-05, - "loss": 0.0592, + "epoch": 19.01, + "grad_norm": 6.784265995025635, + "learning_rate": 4.056013543353452e-05, + "loss": 0.0232, "step": 8150 }, { - "epoch": 24.02, - "grad_norm": 0.007156948558986187, - "learning_rate": 1.7319587628865978e-05, - "loss": 0.0116, + "epoch": 19.01, + "grad_norm": 0.07984334230422974, + "learning_rate": 4.054173413808332e-05, + "loss": 0.0557, "step": 8160 }, { - "epoch": 24.02, - "grad_norm": 0.06009415537118912, - "learning_rate": 1.7272727272727274e-05, - "loss": 0.0774, + "epoch": 19.01, + "grad_norm": 0.8104623556137085, + "learning_rate": 4.052333284263212e-05, + "loss": 0.1124, "step": 8170 }, { - "epoch": 24.02, - "grad_norm": 0.0062246439047157764, - "learning_rate": 1.7225866916588566e-05, - "loss": 0.007, + "epoch": 19.01, + "grad_norm": 0.036781515926122665, + "learning_rate": 4.050493154718092e-05, + "loss": 0.026, "step": 8180 }, { - "epoch": 24.02, - "grad_norm": 0.0417780727148056, - "learning_rate": 1.717900656044986e-05, - "loss": 0.1236, + "epoch": 19.01, + "grad_norm": 26.195846557617188, + "learning_rate": 4.048653025172972e-05, + "loss": 0.0411, "step": 8190 }, { - "epoch": 24.02, - "grad_norm": 53.34578323364258, - "learning_rate": 1.7132146204311154e-05, - "loss": 0.0527, + "epoch": 19.01, + "grad_norm": 46.0142936706543, + "learning_rate": 4.046812895627852e-05, + "loss": 0.1791, "step": 8200 }, { - "epoch": 24.02, - "grad_norm": 0.20870938897132874, - "learning_rate": 1.7085285848172447e-05, - "loss": 0.0519, + "epoch": 19.01, + "grad_norm": 0.6812270879745483, + "learning_rate": 4.044972766082732e-05, + "loss": 0.114, "step": 8210 }, { - "epoch": 24.03, - "grad_norm": 0.011034387163817883, - "learning_rate": 1.7038425492033742e-05, - "loss": 0.0005, + "epoch": 19.01, + "grad_norm": 0.040096819400787354, + "learning_rate": 4.0431326365376123e-05, + "loss": 0.2042, "step": 8220 }, { - "epoch": 24.03, - "grad_norm": 0.004511510953307152, - "learning_rate": 1.699156513589503e-05, - "loss": 0.0125, + "epoch": 19.01, + "grad_norm": 0.17151233553886414, + "learning_rate": 4.0412925069924924e-05, + "loss": 0.0718, "step": 8230 }, { - "epoch": 24.03, - "grad_norm": 0.4468887746334076, - "learning_rate": 1.6944704779756327e-05, - "loss": 0.0015, + "epoch": 19.01, + "grad_norm": 1.1165339946746826, + "learning_rate": 4.0394523774473724e-05, + "loss": 0.1602, "step": 8240 }, { - "epoch": 24.03, - "grad_norm": 0.002701952587813139, - "learning_rate": 1.689784442361762e-05, - "loss": 0.0379, - "step": 8250 - }, - { - "epoch": 24.03, - "eval_accuracy": 0.7836676217765043, - "eval_loss": 1.315584421157837, - "eval_runtime": 33.9983, - "eval_samples_per_second": 20.53, - "eval_steps_per_second": 1.735, + "epoch": 19.01, + "grad_norm": 0.42848771810531616, + "learning_rate": 4.0376122479022524e-05, + "loss": 0.0959, "step": 8250 }, { - "epoch": 25.0, - "grad_norm": 0.0075071449391543865, - "learning_rate": 1.6850984067478915e-05, - "loss": 0.0626, + "epoch": 19.01, + "grad_norm": 0.10783377289772034, + "learning_rate": 4.0357721183571324e-05, + "loss": 0.0306, "step": 8260 }, { - "epoch": 25.0, - "grad_norm": 0.23369351029396057, - "learning_rate": 1.6804123711340207e-05, - "loss": 0.0197, + "epoch": 19.01, + "grad_norm": 65.03582763671875, + "learning_rate": 4.0339319888120124e-05, + "loss": 0.1638, "step": 8270 }, { - "epoch": 25.0, - "grad_norm": 11.089585304260254, - "learning_rate": 1.67572633552015e-05, - "loss": 0.0019, + "epoch": 19.01, + "grad_norm": 0.21185167133808136, + "learning_rate": 4.0320918592668924e-05, + "loss": 0.0511, "step": 8280 }, { - "epoch": 25.0, - "grad_norm": 0.0038492237217724323, - "learning_rate": 1.6710402999062792e-05, - "loss": 0.0002, + "epoch": 19.01, + "grad_norm": 14.06902027130127, + "learning_rate": 4.0302517297217724e-05, + "loss": 0.0355, "step": 8290 }, { - "epoch": 25.0, - "grad_norm": 0.003863664111122489, - "learning_rate": 1.6663542642924088e-05, - "loss": 0.0012, + "epoch": 19.01, + "grad_norm": 29.087316513061523, + "learning_rate": 4.0284116001766524e-05, + "loss": 0.2198, "step": 8300 }, { - "epoch": 25.01, - "grad_norm": 0.013411330990493298, - "learning_rate": 1.661668228678538e-05, - "loss": 0.0003, + "epoch": 19.01, + "grad_norm": 12.470526695251465, + "learning_rate": 4.0265714706315324e-05, + "loss": 0.068, "step": 8310 }, { - "epoch": 25.01, - "grad_norm": 0.028628764674067497, - "learning_rate": 1.6569821930646672e-05, - "loss": 0.0005, + "epoch": 19.01, + "grad_norm": 0.05231478437781334, + "learning_rate": 4.024731341086413e-05, + "loss": 0.1933, "step": 8320 }, { - "epoch": 25.01, - "grad_norm": 0.010359793901443481, - "learning_rate": 1.6522961574507968e-05, - "loss": 0.0662, + "epoch": 19.01, + "grad_norm": 0.028017813339829445, + "learning_rate": 4.0228912115412925e-05, + "loss": 0.1588, "step": 8330 }, { - "epoch": 25.01, - "grad_norm": 4.15015983581543, - "learning_rate": 1.647610121836926e-05, - "loss": 0.0023, + "epoch": 19.01, + "grad_norm": 10.386894226074219, + "learning_rate": 4.0210510819961725e-05, + "loss": 0.0583, "step": 8340 }, { - "epoch": 25.01, - "grad_norm": 34.17851257324219, - "learning_rate": 1.6429240862230556e-05, - "loss": 0.0411, + "epoch": 19.01, + "grad_norm": 41.2181396484375, + "learning_rate": 4.019210952451053e-05, + "loss": 0.1404, "step": 8350 }, { - "epoch": 25.01, - "grad_norm": 0.0031898904126137495, - "learning_rate": 1.6382380506091845e-05, - "loss": 0.0003, + "epoch": 19.01, + "grad_norm": 11.828994750976562, + "learning_rate": 4.0173708229059325e-05, + "loss": 0.1395, "step": 8360 }, { - "epoch": 25.01, - "grad_norm": 0.002583961235359311, - "learning_rate": 1.633552014995314e-05, - "loss": 0.0485, + "epoch": 19.01, + "grad_norm": 56.479366302490234, + "learning_rate": 4.0155306933608125e-05, + "loss": 0.1967, "step": 8370 }, { - "epoch": 25.01, - "grad_norm": 18.96380615234375, - "learning_rate": 1.6288659793814433e-05, - "loss": 0.1036, + "epoch": 19.01, + "grad_norm": 0.19793842732906342, + "learning_rate": 4.013690563815693e-05, + "loss": 0.1509, "step": 8380 }, { - "epoch": 25.01, - "grad_norm": 45.696075439453125, - "learning_rate": 1.624179943767573e-05, - "loss": 0.0844, + "epoch": 19.01, + "grad_norm": 0.04642114043235779, + "learning_rate": 4.0118504342705725e-05, + "loss": 0.171, "step": 8390 }, { - "epoch": 25.01, - "grad_norm": 0.00760071724653244, - "learning_rate": 1.619493908153702e-05, - "loss": 0.0098, + "epoch": 19.01, + "grad_norm": 0.028800005093216896, + "learning_rate": 4.0100103047254525e-05, + "loss": 0.1059, "step": 8400 }, { - "epoch": 25.01, - "grad_norm": 0.019091518595814705, - "learning_rate": 1.6148078725398313e-05, - "loss": 0.0273, + "epoch": 19.01, + "eval_accuracy": 0.7184684684684685, + "eval_loss": 1.2914650440216064, + "eval_runtime": 39.1072, + "eval_samples_per_second": 22.707, + "eval_steps_per_second": 1.892, + "step": 8400 + }, + { + "epoch": 20.0, + "grad_norm": 0.05289442837238312, + "learning_rate": 4.008170175180333e-05, + "loss": 0.0056, "step": 8410 }, { - "epoch": 25.01, - "grad_norm": 0.004600458778440952, - "learning_rate": 1.610121836925961e-05, - "loss": 0.0008, + "epoch": 20.0, + "grad_norm": 18.77003288269043, + "learning_rate": 4.0063300456352126e-05, + "loss": 0.0863, "step": 8420 }, { - "epoch": 25.02, - "grad_norm": 22.48053741455078, - "learning_rate": 1.60543580131209e-05, - "loss": 0.0469, + "epoch": 20.0, + "grad_norm": 0.9093281030654907, + "learning_rate": 4.0044899160900926e-05, + "loss": 0.0118, "step": 8430 }, { - "epoch": 25.02, - "grad_norm": 0.011721034534275532, - "learning_rate": 1.6007497656982194e-05, - "loss": 0.046, + "epoch": 20.0, + "grad_norm": 0.07237890362739563, + "learning_rate": 4.002649786544973e-05, + "loss": 0.0576, "step": 8440 }, { - "epoch": 25.02, - "grad_norm": 0.004451930057257414, - "learning_rate": 1.5960637300843486e-05, - "loss": 0.0038, + "epoch": 20.0, + "grad_norm": 9.122472763061523, + "learning_rate": 4.000809656999853e-05, + "loss": 0.0942, "step": 8450 }, { - "epoch": 25.02, - "grad_norm": 0.010986747220158577, - "learning_rate": 1.591377694470478e-05, - "loss": 0.125, + "epoch": 20.0, + "grad_norm": 0.02487068995833397, + "learning_rate": 3.9989695274547326e-05, + "loss": 0.0443, "step": 8460 }, { - "epoch": 25.02, - "grad_norm": 41.15037155151367, - "learning_rate": 1.5866916588566074e-05, - "loss": 0.0238, + "epoch": 20.0, + "grad_norm": 0.02156475931406021, + "learning_rate": 3.997129397909613e-05, + "loss": 0.1681, "step": 8470 }, { - "epoch": 25.02, - "grad_norm": 0.006322337780147791, - "learning_rate": 1.5820056232427366e-05, - "loss": 0.0012, + "epoch": 20.0, + "grad_norm": 0.025435185059905052, + "learning_rate": 3.995289268364493e-05, + "loss": 0.1307, "step": 8480 }, { - "epoch": 25.02, - "grad_norm": 0.007438218221068382, - "learning_rate": 1.577319587628866e-05, - "loss": 0.0558, + "epoch": 20.0, + "grad_norm": 0.6154837608337402, + "learning_rate": 3.9934491388193727e-05, + "loss": 0.1182, "step": 8490 }, { - "epoch": 25.02, - "grad_norm": 0.007493423763662577, - "learning_rate": 1.5726335520149954e-05, - "loss": 0.0244, + "epoch": 20.0, + "grad_norm": 0.3116990327835083, + "learning_rate": 3.9916090092742533e-05, + "loss": 0.0448, "step": 8500 }, { - "epoch": 25.02, - "grad_norm": 0.005765008274465799, - "learning_rate": 1.5679475164011247e-05, - "loss": 0.0097, + "epoch": 20.0, + "grad_norm": 11.546723365783691, + "learning_rate": 3.9897688797291334e-05, + "loss": 0.1507, "step": 8510 }, { - "epoch": 25.02, - "grad_norm": 0.3964434266090393, - "learning_rate": 1.563261480787254e-05, - "loss": 0.0018, + "epoch": 20.0, + "grad_norm": 9.730113983154297, + "learning_rate": 3.987928750184013e-05, + "loss": 0.1857, "step": 8520 }, { - "epoch": 25.02, - "grad_norm": 2.117600440979004, - "learning_rate": 1.5585754451733835e-05, - "loss": 0.0519, + "epoch": 20.0, + "grad_norm": 0.044611260294914246, + "learning_rate": 3.9860886206388934e-05, + "loss": 0.1907, "step": 8530 }, { - "epoch": 25.02, - "grad_norm": 0.10996006429195404, - "learning_rate": 1.5538894095595127e-05, - "loss": 0.0021, + "epoch": 20.0, + "grad_norm": 0.4382248818874359, + "learning_rate": 3.9842484910937734e-05, + "loss": 0.0964, "step": 8540 }, { - "epoch": 25.03, - "grad_norm": 0.0033693662844598293, - "learning_rate": 1.5492033739456423e-05, - "loss": 0.0811, + "epoch": 20.0, + "grad_norm": 0.0616886205971241, + "learning_rate": 3.9824083615486534e-05, + "loss": 0.0944, "step": 8550 }, { - "epoch": 25.03, - "grad_norm": 10.247265815734863, - "learning_rate": 1.544517338331771e-05, - "loss": 0.0058, + "epoch": 20.01, + "grad_norm": 28.374303817749023, + "learning_rate": 3.9805682320035334e-05, + "loss": 0.0524, "step": 8560 }, { - "epoch": 25.03, - "grad_norm": 0.2538563013076782, - "learning_rate": 1.5398313027179007e-05, - "loss": 0.03, + "epoch": 20.01, + "grad_norm": 0.11457622051239014, + "learning_rate": 3.9787281024584134e-05, + "loss": 0.1207, "step": 8570 }, { - "epoch": 25.03, - "grad_norm": 0.00820246897637844, - "learning_rate": 1.53514526710403e-05, - "loss": 0.0945, - "step": 8580 - }, - { - "epoch": 25.03, - "eval_accuracy": 0.7922636103151862, - "eval_loss": 1.3046531677246094, - "eval_runtime": 33.5684, - "eval_samples_per_second": 20.793, - "eval_steps_per_second": 1.758, + "epoch": 20.01, + "grad_norm": 1.5453132390975952, + "learning_rate": 3.9768879729132934e-05, + "loss": 0.0961, "step": 8580 }, { - "epoch": 26.0, - "grad_norm": 0.0038312424439936876, - "learning_rate": 1.5304592314901595e-05, - "loss": 0.0008, + "epoch": 20.01, + "grad_norm": 0.6138853430747986, + "learning_rate": 3.9750478433681734e-05, + "loss": 0.0976, "step": 8590 }, { - "epoch": 26.0, - "grad_norm": 0.06059965118765831, - "learning_rate": 1.525773195876289e-05, - "loss": 0.0115, + "epoch": 20.01, + "grad_norm": 0.33729997277259827, + "learning_rate": 3.9732077138230535e-05, + "loss": 0.0299, "step": 8600 }, { - "epoch": 26.0, - "grad_norm": 0.004089924972504377, - "learning_rate": 1.521087160262418e-05, - "loss": 0.0002, + "epoch": 20.01, + "grad_norm": 0.14739558100700378, + "learning_rate": 3.9713675842779335e-05, + "loss": 0.0697, "step": 8610 }, { - "epoch": 26.0, - "grad_norm": 0.0043753357604146, - "learning_rate": 1.5164011246485474e-05, - "loss": 0.0727, + "epoch": 20.01, + "grad_norm": 0.4316738247871399, + "learning_rate": 3.9695274547328135e-05, + "loss": 0.2268, "step": 8620 }, { - "epoch": 26.0, - "grad_norm": 5.026497840881348, - "learning_rate": 1.5117150890346768e-05, - "loss": 0.0639, + "epoch": 20.01, + "grad_norm": 0.3644164502620697, + "learning_rate": 3.9676873251876935e-05, + "loss": 0.0166, "step": 8630 }, { - "epoch": 26.01, - "grad_norm": 0.014356585219502449, - "learning_rate": 1.5070290534208062e-05, - "loss": 0.036, + "epoch": 20.01, + "grad_norm": 29.693737030029297, + "learning_rate": 3.9658471956425735e-05, + "loss": 0.2011, "step": 8640 }, { - "epoch": 26.01, - "grad_norm": 0.24706010520458221, - "learning_rate": 1.5023430178069353e-05, - "loss": 0.001, + "epoch": 20.01, + "grad_norm": 0.11193890124559402, + "learning_rate": 3.9640070660974535e-05, + "loss": 0.163, "step": 8650 }, { - "epoch": 26.01, - "grad_norm": 0.06222040206193924, - "learning_rate": 1.4976569821930647e-05, - "loss": 0.0436, + "epoch": 20.01, + "grad_norm": 0.500665545463562, + "learning_rate": 3.9621669365523335e-05, + "loss": 0.1297, "step": 8660 }, { - "epoch": 26.01, - "grad_norm": 0.0071958452463150024, - "learning_rate": 1.492970946579194e-05, - "loss": 0.0006, + "epoch": 20.01, + "grad_norm": 1.7424372434616089, + "learning_rate": 3.9603268070072135e-05, + "loss": 0.3028, "step": 8670 }, { - "epoch": 26.01, - "grad_norm": 0.011005638167262077, - "learning_rate": 1.4882849109653235e-05, - "loss": 0.0528, + "epoch": 20.01, + "grad_norm": 0.35591185092926025, + "learning_rate": 3.9584866774620936e-05, + "loss": 0.1396, "step": 8680 }, { - "epoch": 26.01, - "grad_norm": 0.09897544980049133, - "learning_rate": 1.4835988753514527e-05, - "loss": 0.0099, + "epoch": 20.01, + "grad_norm": 0.6528427600860596, + "learning_rate": 3.9566465479169736e-05, + "loss": 0.2106, "step": 8690 }, { - "epoch": 26.01, - "grad_norm": 1.022418737411499, - "learning_rate": 1.4789128397375821e-05, - "loss": 0.006, + "epoch": 20.01, + "grad_norm": 8.507621765136719, + "learning_rate": 3.9548064183718536e-05, + "loss": 0.0937, "step": 8700 }, { - "epoch": 26.01, - "grad_norm": 0.0073012434877455235, - "learning_rate": 1.4742268041237115e-05, - "loss": 0.0008, + "epoch": 20.01, + "grad_norm": 0.22332046926021576, + "learning_rate": 3.9529662888267336e-05, + "loss": 0.098, "step": 8710 }, { - "epoch": 26.01, - "grad_norm": 0.07601054012775421, - "learning_rate": 1.4695407685098409e-05, - "loss": 0.0256, + "epoch": 20.01, + "grad_norm": 0.1772357076406479, + "learning_rate": 3.9511261592816136e-05, + "loss": 0.1744, "step": 8720 }, { - "epoch": 26.01, - "grad_norm": 0.010327538475394249, - "learning_rate": 1.46485473289597e-05, - "loss": 0.0105, + "epoch": 20.01, + "grad_norm": 18.10626792907715, + "learning_rate": 3.9492860297364936e-05, + "loss": 0.0796, "step": 8730 }, { - "epoch": 26.01, - "grad_norm": 0.4567221999168396, - "learning_rate": 1.4601686972820994e-05, - "loss": 0.0338, + "epoch": 20.01, + "grad_norm": 0.09350816160440445, + "learning_rate": 3.9474459001913736e-05, + "loss": 0.0159, "step": 8740 }, { - "epoch": 26.01, - "grad_norm": 0.03632340207695961, - "learning_rate": 1.4554826616682288e-05, - "loss": 0.0004, + "epoch": 20.01, + "grad_norm": 0.45954638719558716, + "learning_rate": 3.9456057706462536e-05, + "loss": 0.081, "step": 8750 }, { - "epoch": 26.02, - "grad_norm": 0.005994404200464487, - "learning_rate": 1.4507966260543582e-05, - "loss": 0.0004, + "epoch": 20.01, + "grad_norm": 0.6258265972137451, + "learning_rate": 3.9437656411011336e-05, + "loss": 0.2643, "step": 8760 }, { - "epoch": 26.02, - "grad_norm": 0.004464009311050177, - "learning_rate": 1.4461105904404872e-05, - "loss": 0.0654, + "epoch": 20.01, + "grad_norm": 48.40532684326172, + "learning_rate": 3.9419255115560137e-05, + "loss": 0.1233, "step": 8770 }, { - "epoch": 26.02, - "grad_norm": 0.5976660251617432, - "learning_rate": 1.4414245548266168e-05, - "loss": 0.0374, + "epoch": 20.01, + "grad_norm": 1.5936936140060425, + "learning_rate": 3.940085382010894e-05, + "loss": 0.0819, "step": 8780 }, { - "epoch": 26.02, - "grad_norm": 0.003634576452895999, - "learning_rate": 1.4367385192127462e-05, - "loss": 0.0921, + "epoch": 20.01, + "grad_norm": 0.1590225249528885, + "learning_rate": 3.938245252465774e-05, + "loss": 0.1419, "step": 8790 }, { - "epoch": 26.02, - "grad_norm": 0.005951224360615015, - "learning_rate": 1.4320524835988756e-05, - "loss": 0.0096, + "epoch": 20.01, + "grad_norm": 0.9333840012550354, + "learning_rate": 3.936405122920654e-05, + "loss": 0.0307, "step": 8800 }, { - "epoch": 26.02, - "grad_norm": 0.0037890600506216288, - "learning_rate": 1.4273664479850047e-05, - "loss": 0.0005, + "epoch": 20.01, + "grad_norm": 3.0169289112091064, + "learning_rate": 3.9345649933755344e-05, + "loss": 0.3203, "step": 8810 }, { - "epoch": 26.02, - "grad_norm": 0.00492624519392848, - "learning_rate": 1.422680412371134e-05, - "loss": 0.0106, + "epoch": 20.01, + "grad_norm": 0.12335663288831711, + "learning_rate": 3.932724863830414e-05, + "loss": 0.1741, "step": 8820 }, { - "epoch": 26.02, - "grad_norm": 0.0032419913914054632, - "learning_rate": 1.4179943767572635e-05, - "loss": 0.0068, + "epoch": 20.01, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 1.231528401374817, + "eval_runtime": 39.1219, + "eval_samples_per_second": 22.698, + "eval_steps_per_second": 1.892, + "step": 8820 + }, + { + "epoch": 21.0, + "grad_norm": 0.8017550110816956, + "learning_rate": 3.930884734285294e-05, + "loss": 0.1648, "step": 8830 }, { - "epoch": 26.02, - "grad_norm": 0.006293057929724455, - "learning_rate": 1.4133083411433929e-05, - "loss": 0.0646, + "epoch": 21.0, + "grad_norm": 0.028409045189619064, + "learning_rate": 3.9290446047401744e-05, + "loss": 0.1764, "step": 8840 }, { - "epoch": 26.02, - "grad_norm": 0.00998301524668932, - "learning_rate": 1.4086223055295219e-05, - "loss": 0.0163, + "epoch": 21.0, + "grad_norm": 0.8386111259460449, + "learning_rate": 3.927204475195054e-05, + "loss": 0.3142, "step": 8850 }, { - "epoch": 26.02, - "grad_norm": 0.08308320492506027, - "learning_rate": 1.4039362699156513e-05, - "loss": 0.0003, + "epoch": 21.0, + "grad_norm": 5.57110595703125, + "learning_rate": 3.925364345649934e-05, + "loss": 0.2943, "step": 8860 }, { - "epoch": 26.02, - "grad_norm": 0.0030508143827319145, - "learning_rate": 1.3992502343017807e-05, - "loss": 0.064, + "epoch": 21.0, + "grad_norm": 18.612104415893555, + "learning_rate": 3.9235242161048144e-05, + "loss": 0.0672, "step": 8870 }, { - "epoch": 26.03, - "grad_norm": 0.3473857045173645, - "learning_rate": 1.3945641986879101e-05, - "loss": 0.0006, + "epoch": 21.0, + "grad_norm": 1.4769333600997925, + "learning_rate": 3.921684086559694e-05, + "loss": 0.0753, "step": 8880 }, { - "epoch": 26.03, - "grad_norm": 0.002809175057336688, - "learning_rate": 1.3898781630740395e-05, - "loss": 0.0004, + "epoch": 21.0, + "grad_norm": 41.26362991333008, + "learning_rate": 3.919843957014574e-05, + "loss": 0.1466, "step": 8890 }, { - "epoch": 26.03, - "grad_norm": 1.4648064374923706, - "learning_rate": 1.3851921274601688e-05, - "loss": 0.0478, + "epoch": 21.0, + "grad_norm": 0.1891375631093979, + "learning_rate": 3.9180038274694545e-05, + "loss": 0.0904, "step": 8900 }, { - "epoch": 26.03, - "grad_norm": 0.008449913933873177, - "learning_rate": 1.3805060918462982e-05, - "loss": 0.0528, - "step": 8910 - }, - { - "epoch": 26.03, - "eval_accuracy": 0.7893982808022922, - "eval_loss": 1.3669614791870117, - "eval_runtime": 34.4841, - "eval_samples_per_second": 20.241, - "eval_steps_per_second": 1.711, + "epoch": 21.0, + "grad_norm": 0.03610742464661598, + "learning_rate": 3.916163697924334e-05, + "loss": 0.0738, "step": 8910 }, { - "epoch": 27.0, - "grad_norm": 0.009650280699133873, - "learning_rate": 1.3758200562324276e-05, - "loss": 0.0131, + "epoch": 21.0, + "grad_norm": 36.820213317871094, + "learning_rate": 3.914323568379214e-05, + "loss": 0.1528, "step": 8920 }, { - "epoch": 27.0, - "grad_norm": 0.00797184742987156, - "learning_rate": 1.371134020618557e-05, - "loss": 0.0228, + "epoch": 21.0, + "grad_norm": 42.04949951171875, + "learning_rate": 3.9124834388340945e-05, + "loss": 0.1982, "step": 8930 }, { - "epoch": 27.0, - "grad_norm": 0.004884254653006792, - "learning_rate": 1.366447985004686e-05, - "loss": 0.0002, + "epoch": 21.0, + "grad_norm": 7.4204864501953125, + "learning_rate": 3.9106433092889745e-05, + "loss": 0.0767, "step": 8940 }, { - "epoch": 27.0, - "grad_norm": 0.0034428162034600973, - "learning_rate": 1.3617619493908154e-05, - "loss": 0.0017, + "epoch": 21.0, + "grad_norm": 3.0327413082122803, + "learning_rate": 3.908803179743854e-05, + "loss": 0.11, "step": 8950 }, { - "epoch": 27.0, - "grad_norm": 0.004850251600146294, - "learning_rate": 1.3570759137769448e-05, - "loss": 0.0611, + "epoch": 21.0, + "grad_norm": 1.1203861236572266, + "learning_rate": 3.906963050198734e-05, + "loss": 0.0456, "step": 8960 }, { - "epoch": 27.01, - "grad_norm": 0.028377506881952286, - "learning_rate": 1.3523898781630742e-05, - "loss": 0.0002, + "epoch": 21.0, + "grad_norm": 4.915249347686768, + "learning_rate": 3.9051229206536146e-05, + "loss": 0.2851, "step": 8970 }, { - "epoch": 27.01, - "grad_norm": 0.0027832870837301016, - "learning_rate": 1.3477038425492033e-05, - "loss": 0.0475, + "epoch": 21.01, + "grad_norm": 0.033022407442331314, + "learning_rate": 3.903282791108494e-05, + "loss": 0.1003, "step": 8980 }, { - "epoch": 27.01, - "grad_norm": 0.11803574860095978, - "learning_rate": 1.3430178069353327e-05, - "loss": 0.0005, + "epoch": 21.01, + "grad_norm": 34.32133102416992, + "learning_rate": 3.901442661563374e-05, + "loss": 0.0292, "step": 8990 }, { - "epoch": 27.01, - "grad_norm": 0.23726528882980347, - "learning_rate": 1.338331771321462e-05, - "loss": 0.0003, + "epoch": 21.01, + "grad_norm": 2.097433090209961, + "learning_rate": 3.8996025320182546e-05, + "loss": 0.0478, "step": 9000 }, { - "epoch": 27.01, - "grad_norm": 0.00947210006415844, - "learning_rate": 1.3336457357075915e-05, - "loss": 0.0004, + "epoch": 21.01, + "grad_norm": 0.08751551806926727, + "learning_rate": 3.897762402473134e-05, + "loss": 0.0678, "step": 9010 }, { - "epoch": 27.01, - "grad_norm": 0.006413722410798073, - "learning_rate": 1.3289597000937207e-05, - "loss": 0.0002, + "epoch": 21.01, + "grad_norm": 11.18073558807373, + "learning_rate": 3.895922272928014e-05, + "loss": 0.0649, "step": 9020 }, { - "epoch": 27.01, - "grad_norm": 0.0032037473283708096, - "learning_rate": 1.3242736644798501e-05, - "loss": 0.0022, + "epoch": 21.01, + "grad_norm": 35.868499755859375, + "learning_rate": 3.8940821433828946e-05, + "loss": 0.0511, "step": 9030 }, { - "epoch": 27.01, - "grad_norm": 0.0032357927411794662, - "learning_rate": 1.3195876288659795e-05, - "loss": 0.0336, + "epoch": 21.01, + "grad_norm": 9.828630447387695, + "learning_rate": 3.892242013837774e-05, + "loss": 0.2034, "step": 9040 }, { - "epoch": 27.01, - "grad_norm": 1.7443645000457764, - "learning_rate": 1.314901593252109e-05, - "loss": 0.0008, + "epoch": 21.01, + "grad_norm": 12.086048126220703, + "learning_rate": 3.890401884292654e-05, + "loss": 0.0642, "step": 9050 }, { - "epoch": 27.01, - "grad_norm": 0.002807668410241604, - "learning_rate": 1.310215557638238e-05, - "loss": 0.0002, + "epoch": 21.01, + "grad_norm": 8.521302223205566, + "learning_rate": 3.888561754747535e-05, + "loss": 0.2202, "step": 9060 }, { - "epoch": 27.01, - "grad_norm": 0.0025303384754806757, - "learning_rate": 1.3055295220243674e-05, - "loss": 0.0005, + "epoch": 21.01, + "grad_norm": 13.418307304382324, + "learning_rate": 3.886721625202415e-05, + "loss": 0.1072, "step": 9070 }, { - "epoch": 27.01, - "grad_norm": 0.37589800357818604, - "learning_rate": 1.3008434864104968e-05, - "loss": 0.0713, + "epoch": 21.01, + "grad_norm": 0.07701459527015686, + "learning_rate": 3.884881495657294e-05, + "loss": 0.0174, "step": 9080 }, { - "epoch": 27.02, - "grad_norm": 0.002870983211323619, - "learning_rate": 1.2961574507966262e-05, - "loss": 0.0204, + "epoch": 21.01, + "grad_norm": 6.8916192054748535, + "learning_rate": 3.883041366112175e-05, + "loss": 0.2348, "step": 9090 }, { - "epoch": 27.02, - "grad_norm": 0.017298812046647072, - "learning_rate": 1.2914714151827554e-05, - "loss": 0.0268, + "epoch": 21.01, + "grad_norm": 0.03478574380278587, + "learning_rate": 3.881201236567055e-05, + "loss": 0.0187, "step": 9100 }, { - "epoch": 27.02, - "grad_norm": 0.00800339411944151, - "learning_rate": 1.2867853795688848e-05, - "loss": 0.1058, + "epoch": 21.01, + "grad_norm": 19.356990814208984, + "learning_rate": 3.879361107021934e-05, + "loss": 0.131, "step": 9110 }, { - "epoch": 27.02, - "grad_norm": 0.0037049567326903343, - "learning_rate": 1.2820993439550142e-05, - "loss": 0.0002, + "epoch": 21.01, + "grad_norm": 0.17833730578422546, + "learning_rate": 3.877520977476815e-05, + "loss": 0.0998, "step": 9120 }, { - "epoch": 27.02, - "grad_norm": 0.0456349216401577, - "learning_rate": 1.2774133083411436e-05, - "loss": 0.013, + "epoch": 21.01, + "grad_norm": 0.059898946434259415, + "learning_rate": 3.875680847931695e-05, + "loss": 0.0682, "step": 9130 }, { - "epoch": 27.02, - "grad_norm": 0.003435475053265691, - "learning_rate": 1.2727272727272727e-05, - "loss": 0.0392, + "epoch": 21.01, + "grad_norm": 15.192434310913086, + "learning_rate": 3.873840718386574e-05, + "loss": 0.0871, "step": 9140 }, { - "epoch": 27.02, - "grad_norm": 0.0036417213268578053, - "learning_rate": 1.268041237113402e-05, - "loss": 0.0003, + "epoch": 21.01, + "grad_norm": 0.03356494382023811, + "learning_rate": 3.872000588841455e-05, + "loss": 0.082, "step": 9150 }, { - "epoch": 27.02, - "grad_norm": 0.01712891273200512, - "learning_rate": 1.2633552014995315e-05, - "loss": 0.0151, + "epoch": 21.01, + "grad_norm": 10.07889175415039, + "learning_rate": 3.870160459296335e-05, + "loss": 0.1058, "step": 9160 }, { - "epoch": 27.02, - "grad_norm": 0.22050367295742035, - "learning_rate": 1.2586691658856609e-05, - "loss": 0.0004, + "epoch": 21.01, + "grad_norm": 0.051915716379880905, + "learning_rate": 3.868320329751215e-05, + "loss": 0.0869, "step": 9170 }, { - "epoch": 27.02, - "grad_norm": 0.07363563030958176, - "learning_rate": 1.2539831302717903e-05, - "loss": 0.0084, + "epoch": 21.01, + "grad_norm": 21.866188049316406, + "learning_rate": 3.866480200206095e-05, + "loss": 0.1787, "step": 9180 }, { - "epoch": 27.02, - "grad_norm": 0.0359061174094677, - "learning_rate": 1.2492970946579195e-05, - "loss": 0.0008, + "epoch": 21.01, + "grad_norm": 0.09002427756786346, + "learning_rate": 3.864640070660975e-05, + "loss": 0.0945, "step": 9190 }, { - "epoch": 27.02, - "grad_norm": 0.007665242068469524, - "learning_rate": 1.2446110590440487e-05, - "loss": 0.0623, + "epoch": 21.01, + "grad_norm": 36.970481872558594, + "learning_rate": 3.862799941115855e-05, + "loss": 0.1424, "step": 9200 }, { - "epoch": 27.03, - "grad_norm": 0.0135026965290308, - "learning_rate": 1.2399250234301781e-05, - "loss": 0.011, + "epoch": 21.01, + "grad_norm": 0.6947605013847351, + "learning_rate": 3.860959811570735e-05, + "loss": 0.1761, "step": 9210 }, { - "epoch": 27.03, - "grad_norm": 3.0960659980773926, - "learning_rate": 1.2352389878163074e-05, - "loss": 0.05, + "epoch": 21.01, + "grad_norm": 0.016832459717988968, + "learning_rate": 3.859119682025615e-05, + "loss": 0.0497, "step": 9220 }, { - "epoch": 27.03, - "grad_norm": 0.01721220277249813, - "learning_rate": 1.2305529522024368e-05, - "loss": 0.0707, + "epoch": 21.01, + "grad_norm": 3.5753579139709473, + "learning_rate": 3.857279552480495e-05, + "loss": 0.1527, "step": 9230 }, { - "epoch": 27.03, - "grad_norm": 0.004766266793012619, - "learning_rate": 1.2258669165885662e-05, - "loss": 0.0002, + "epoch": 21.01, + "grad_norm": 0.09591725468635559, + "learning_rate": 3.855439422935375e-05, + "loss": 0.0629, "step": 9240 }, { - "epoch": 27.03, - "eval_accuracy": 0.7965616045845272, - "eval_loss": 1.3398090600967407, - "eval_runtime": 33.8835, - "eval_samples_per_second": 20.6, - "eval_steps_per_second": 1.741, + "epoch": 21.01, + "eval_accuracy": 0.722972972972973, + "eval_loss": 1.394756555557251, + "eval_runtime": 38.8213, + "eval_samples_per_second": 22.874, + "eval_steps_per_second": 1.906, "step": 9240 }, { - "epoch": 28.0, - "grad_norm": 0.0029590607155114412, - "learning_rate": 1.2211808809746956e-05, - "loss": 0.0831, + "epoch": 22.0, + "grad_norm": 0.05974861979484558, + "learning_rate": 3.853599293390255e-05, + "loss": 0.0187, "step": 9250 }, { - "epoch": 28.0, - "grad_norm": 0.0406717024743557, - "learning_rate": 1.2164948453608248e-05, - "loss": 0.0004, + "epoch": 22.0, + "grad_norm": 0.03310147300362587, + "learning_rate": 3.851759163845135e-05, + "loss": 0.0019, "step": 9260 }, { - "epoch": 28.0, - "grad_norm": 0.03818991780281067, - "learning_rate": 1.2118088097469542e-05, - "loss": 0.0003, + "epoch": 22.0, + "grad_norm": 28.746809005737305, + "learning_rate": 3.849919034300015e-05, + "loss": 0.115, "step": 9270 }, { - "epoch": 28.0, - "grad_norm": 0.007692787330597639, - "learning_rate": 1.2071227741330834e-05, - "loss": 0.0096, + "epoch": 22.0, + "grad_norm": 1.6740992069244385, + "learning_rate": 3.848078904754895e-05, + "loss": 0.2023, "step": 9280 }, { - "epoch": 28.0, - "grad_norm": 0.08763420581817627, - "learning_rate": 1.2024367385192128e-05, - "loss": 0.0663, + "epoch": 22.0, + "grad_norm": 40.7819938659668, + "learning_rate": 3.846238775209775e-05, + "loss": 0.1216, "step": 9290 }, { - "epoch": 28.01, - "grad_norm": 0.00407880125567317, - "learning_rate": 1.197750702905342e-05, - "loss": 0.0031, + "epoch": 22.0, + "grad_norm": 0.050575271248817444, + "learning_rate": 3.844398645664655e-05, + "loss": 0.074, "step": 9300 }, { - "epoch": 28.01, - "grad_norm": 0.006175840273499489, - "learning_rate": 1.1930646672914715e-05, - "loss": 0.0865, + "epoch": 22.0, + "grad_norm": 87.55679321289062, + "learning_rate": 3.842558516119535e-05, + "loss": 0.2161, "step": 9310 }, { - "epoch": 28.01, - "grad_norm": 6.280770301818848, - "learning_rate": 1.1883786316776007e-05, - "loss": 0.0024, + "epoch": 22.0, + "grad_norm": 0.12832655012607574, + "learning_rate": 3.840718386574415e-05, + "loss": 0.0803, "step": 9320 }, { - "epoch": 28.01, - "grad_norm": 12.87985610961914, - "learning_rate": 1.1836925960637301e-05, - "loss": 0.0468, + "epoch": 22.0, + "grad_norm": 42.25579833984375, + "learning_rate": 3.838878257029295e-05, + "loss": 0.1586, "step": 9330 }, { - "epoch": 28.01, - "grad_norm": 0.04478368163108826, - "learning_rate": 1.1790065604498595e-05, - "loss": 0.0002, + "epoch": 22.0, + "grad_norm": 28.92885971069336, + "learning_rate": 3.837038127484175e-05, + "loss": 0.0364, "step": 9340 }, { - "epoch": 28.01, - "grad_norm": 0.005779411643743515, - "learning_rate": 1.1743205248359889e-05, - "loss": 0.0002, + "epoch": 22.0, + "grad_norm": 1.6303467750549316, + "learning_rate": 3.835197997939055e-05, + "loss": 0.0356, "step": 9350 }, { - "epoch": 28.01, - "grad_norm": 0.0021354747004806995, - "learning_rate": 1.1696344892221181e-05, - "loss": 0.0002, + "epoch": 22.0, + "grad_norm": 9.19245719909668, + "learning_rate": 3.833357868393935e-05, + "loss": 0.107, "step": 9360 }, { - "epoch": 28.01, - "grad_norm": 5.800276756286621, - "learning_rate": 1.1649484536082475e-05, - "loss": 0.01, + "epoch": 22.0, + "grad_norm": 2.077812671661377, + "learning_rate": 3.831517738848815e-05, + "loss": 0.0545, "step": 9370 }, { - "epoch": 28.01, - "grad_norm": 0.0021325971465557814, - "learning_rate": 1.1602624179943768e-05, - "loss": 0.0133, - "step": 9380 + "epoch": 22.0, + "grad_norm": 0.058572422713041306, + "learning_rate": 3.829677609303695e-05, + "loss": 0.0071, + "step": 9380 }, { - "epoch": 28.01, - "grad_norm": 0.003749684663489461, - "learning_rate": 1.1555763823805062e-05, - "loss": 0.0003, + "epoch": 22.0, + "grad_norm": 50.71233367919922, + "learning_rate": 3.827837479758575e-05, + "loss": 0.0593, "step": 9390 }, { - "epoch": 28.01, - "grad_norm": 0.006699393503367901, - "learning_rate": 1.1508903467666354e-05, - "loss": 0.0058, + "epoch": 22.01, + "grad_norm": 31.309873580932617, + "learning_rate": 3.825997350213455e-05, + "loss": 0.1554, "step": 9400 }, { - "epoch": 28.01, - "grad_norm": 0.0032480955123901367, - "learning_rate": 1.1462043111527648e-05, - "loss": 0.0184, + "epoch": 22.01, + "grad_norm": 9.297453880310059, + "learning_rate": 3.824157220668335e-05, + "loss": 0.0624, "step": 9410 }, { - "epoch": 28.02, - "grad_norm": 0.0017323597567155957, - "learning_rate": 1.141518275538894e-05, - "loss": 0.0002, + "epoch": 22.01, + "grad_norm": 0.3019231855869293, + "learning_rate": 3.822317091123215e-05, + "loss": 0.0708, "step": 9420 }, { - "epoch": 28.02, - "grad_norm": 0.0023274635896086693, - "learning_rate": 1.1368322399250234e-05, - "loss": 0.0002, + "epoch": 22.01, + "grad_norm": 16.121776580810547, + "learning_rate": 3.820476961578096e-05, + "loss": 0.0671, "step": 9430 }, { - "epoch": 28.02, - "grad_norm": 0.003413414815440774, - "learning_rate": 1.1321462043111528e-05, - "loss": 0.0014, + "epoch": 22.01, + "grad_norm": 8.922002792358398, + "learning_rate": 3.818636832032975e-05, + "loss": 0.1623, "step": 9440 }, { - "epoch": 28.02, - "grad_norm": 0.01558777131140232, - "learning_rate": 1.1274601686972822e-05, - "loss": 0.042, + "epoch": 22.01, + "grad_norm": 5.869600296020508, + "learning_rate": 3.816796702487855e-05, + "loss": 0.1411, "step": 9450 }, { - "epoch": 28.02, - "grad_norm": 0.003739135107025504, - "learning_rate": 1.1227741330834115e-05, - "loss": 0.0136, + "epoch": 22.01, + "grad_norm": 0.02841232158243656, + "learning_rate": 3.814956572942736e-05, + "loss": 0.1246, "step": 9460 }, { - "epoch": 28.02, - "grad_norm": 0.043951474130153656, - "learning_rate": 1.1180880974695409e-05, - "loss": 0.0804, + "epoch": 22.01, + "grad_norm": 20.73468589782715, + "learning_rate": 3.813116443397615e-05, + "loss": 0.1599, "step": 9470 }, { - "epoch": 28.02, - "grad_norm": 0.0028257304802536964, - "learning_rate": 1.1134020618556703e-05, - "loss": 0.0002, + "epoch": 22.01, + "grad_norm": 33.60063171386719, + "learning_rate": 3.811276313852495e-05, + "loss": 0.1519, "step": 9480 }, { - "epoch": 28.02, - "grad_norm": 0.0037733283825218678, - "learning_rate": 1.1087160262417995e-05, - "loss": 0.0004, + "epoch": 22.01, + "grad_norm": 39.62193298339844, + "learning_rate": 3.809436184307376e-05, + "loss": 0.0782, "step": 9490 }, { - "epoch": 28.02, - "grad_norm": 0.056008391082286835, - "learning_rate": 1.1040299906279289e-05, - "loss": 0.0319, + "epoch": 22.01, + "grad_norm": 13.764589309692383, + "learning_rate": 3.807596054762255e-05, + "loss": 0.2313, "step": 9500 }, { - "epoch": 28.02, - "grad_norm": 0.0031692145857959986, - "learning_rate": 1.0993439550140581e-05, - "loss": 0.0685, + "epoch": 22.01, + "grad_norm": 0.05422932282090187, + "learning_rate": 3.805755925217135e-05, + "loss": 0.0522, "step": 9510 }, { - "epoch": 28.02, - "grad_norm": 0.004492649342864752, - "learning_rate": 1.0946579194001875e-05, - "loss": 0.0137, + "epoch": 22.01, + "grad_norm": 0.43672868609428406, + "learning_rate": 3.803915795672016e-05, + "loss": 0.0095, "step": 9520 }, { - "epoch": 28.02, - "grad_norm": 0.0059276544488966465, - "learning_rate": 1.0899718837863168e-05, - "loss": 0.0007, + "epoch": 22.01, + "grad_norm": 5.034006595611572, + "learning_rate": 3.802075666126895e-05, + "loss": 0.2069, "step": 9530 }, { - "epoch": 28.03, - "grad_norm": 0.004752593580633402, - "learning_rate": 1.0852858481724462e-05, - "loss": 0.0193, + "epoch": 22.01, + "grad_norm": 0.8128895163536072, + "learning_rate": 3.800235536581775e-05, + "loss": 0.0059, "step": 9540 }, { - "epoch": 28.03, - "grad_norm": 0.006295239552855492, - "learning_rate": 1.0805998125585754e-05, - "loss": 0.0037, + "epoch": 22.01, + "grad_norm": 0.04529860243201256, + "learning_rate": 3.798395407036656e-05, + "loss": 0.0623, "step": 9550 }, { - "epoch": 28.03, - "grad_norm": 41.76048278808594, - "learning_rate": 1.0759137769447048e-05, - "loss": 0.0485, + "epoch": 22.01, + "grad_norm": 0.05076577514410019, + "learning_rate": 3.796555277491536e-05, + "loss": 0.0635, "step": 9560 }, { - "epoch": 28.03, - "grad_norm": 0.0014590555801987648, - "learning_rate": 1.0712277413308342e-05, - "loss": 0.0562, - "step": 9570 - }, - { - "epoch": 28.03, - "eval_accuracy": 0.7979942693409742, - "eval_loss": 1.3444451093673706, - "eval_runtime": 34.4815, - "eval_samples_per_second": 20.243, - "eval_steps_per_second": 1.711, + "epoch": 22.01, + "grad_norm": 0.09839289635419846, + "learning_rate": 3.794715147946415e-05, + "loss": 0.1778, "step": 9570 }, { - "epoch": 29.0, - "grad_norm": 0.00453572254627943, - "learning_rate": 1.0665417057169636e-05, - "loss": 0.0002, + "epoch": 22.01, + "grad_norm": 3.5754637718200684, + "learning_rate": 3.792875018401296e-05, + "loss": 0.1769, "step": 9580 }, { - "epoch": 29.0, - "grad_norm": 0.04278896749019623, - "learning_rate": 1.0618556701030928e-05, - "loss": 0.0332, + "epoch": 22.01, + "grad_norm": 22.517118453979492, + "learning_rate": 3.791034888856176e-05, + "loss": 0.1084, "step": 9590 }, { - "epoch": 29.0, - "grad_norm": 0.0018639545887708664, - "learning_rate": 1.0571696344892222e-05, - "loss": 0.0002, + "epoch": 22.01, + "grad_norm": 28.474924087524414, + "learning_rate": 3.789194759311055e-05, + "loss": 0.1328, "step": 9600 }, { - "epoch": 29.0, - "grad_norm": 0.005231024231761694, - "learning_rate": 1.0524835988753515e-05, - "loss": 0.0009, + "epoch": 22.01, + "grad_norm": 0.3125380277633667, + "learning_rate": 3.787354629765936e-05, + "loss": 0.1563, "step": 9610 }, { - "epoch": 29.0, - "grad_norm": 0.0017461972311139107, - "learning_rate": 1.0477975632614809e-05, - "loss": 0.0001, + "epoch": 22.01, + "grad_norm": 5.8578104972839355, + "learning_rate": 3.785514500220816e-05, + "loss": 0.1266, "step": 9620 }, { - "epoch": 29.01, - "grad_norm": 7.924661636352539, - "learning_rate": 1.0431115276476101e-05, - "loss": 0.0308, + "epoch": 22.01, + "grad_norm": 0.23546111583709717, + "learning_rate": 3.783674370675695e-05, + "loss": 0.1167, "step": 9630 }, { - "epoch": 29.01, - "grad_norm": 0.012182795442640781, - "learning_rate": 1.0384254920337395e-05, - "loss": 0.0043, + "epoch": 22.01, + "grad_norm": 0.32572802901268005, + "learning_rate": 3.781834241130576e-05, + "loss": 0.1068, "step": 9640 }, { - "epoch": 29.01, - "grad_norm": 0.0014665969647467136, - "learning_rate": 1.0337394564198687e-05, - "loss": 0.0002, + "epoch": 22.01, + "grad_norm": 0.057987648993730545, + "learning_rate": 3.779994111585456e-05, + "loss": 0.0996, "step": 9650 }, { - "epoch": 29.01, - "grad_norm": 0.0027665491215884686, - "learning_rate": 1.0290534208059981e-05, - "loss": 0.0097, + "epoch": 22.01, + "grad_norm": 0.020183347165584564, + "learning_rate": 3.7781539820403354e-05, + "loss": 0.0075, "step": 9660 }, { - "epoch": 29.01, - "grad_norm": 0.0026171233039349318, - "learning_rate": 1.0243673851921275e-05, - "loss": 0.0543, + "epoch": 22.01, + "eval_accuracy": 0.7376126126126126, + "eval_loss": 1.1434566974639893, + "eval_runtime": 38.9969, + "eval_samples_per_second": 22.771, + "eval_steps_per_second": 1.898, + "step": 9660 + }, + { + "epoch": 23.0, + "grad_norm": 1.9055886268615723, + "learning_rate": 3.776313852495216e-05, + "loss": 0.0124, "step": 9670 }, { - "epoch": 29.01, - "grad_norm": 0.005574346520006657, - "learning_rate": 1.019681349578257e-05, - "loss": 0.0001, + "epoch": 23.0, + "grad_norm": 3.3849780559539795, + "learning_rate": 3.774473722950096e-05, + "loss": 0.1073, "step": 9680 }, { - "epoch": 29.01, - "grad_norm": 0.07008225470781326, - "learning_rate": 1.0149953139643862e-05, - "loss": 0.0321, + "epoch": 23.0, + "grad_norm": 24.8551025390625, + "learning_rate": 3.772633593404976e-05, + "loss": 0.0921, "step": 9690 }, { - "epoch": 29.01, - "grad_norm": 58.78215408325195, - "learning_rate": 1.0103092783505156e-05, - "loss": 0.106, + "epoch": 23.0, + "grad_norm": 0.3700391948223114, + "learning_rate": 3.770793463859856e-05, + "loss": 0.1239, "step": 9700 }, { - "epoch": 29.01, - "grad_norm": 0.010860403068363667, - "learning_rate": 1.0056232427366448e-05, - "loss": 0.0625, + "epoch": 23.0, + "grad_norm": 0.06266848742961884, + "learning_rate": 3.768953334314736e-05, + "loss": 0.0568, "step": 9710 }, { - "epoch": 29.01, - "grad_norm": 0.09613881260156631, - "learning_rate": 1.0009372071227742e-05, - "loss": 0.007, + "epoch": 23.0, + "grad_norm": 3.016946315765381, + "learning_rate": 3.767113204769616e-05, + "loss": 0.0159, "step": 9720 }, { - "epoch": 29.01, - "grad_norm": 0.01900539919734001, - "learning_rate": 9.962511715089034e-06, - "loss": 0.0109, + "epoch": 23.0, + "grad_norm": 0.05134722962975502, + "learning_rate": 3.765273075224496e-05, + "loss": 0.0665, "step": 9730 }, { - "epoch": 29.01, - "grad_norm": 0.009411687031388283, - "learning_rate": 9.915651358950328e-06, - "loss": 0.0018, + "epoch": 23.0, + "grad_norm": 0.021403346210718155, + "learning_rate": 3.763432945679376e-05, + "loss": 0.0372, "step": 9740 }, { - "epoch": 29.02, - "grad_norm": 16.337764739990234, - "learning_rate": 9.86879100281162e-06, - "loss": 0.0593, + "epoch": 23.0, + "grad_norm": 0.06313654035329819, + "learning_rate": 3.761592816134256e-05, + "loss": 0.0333, "step": 9750 }, { - "epoch": 29.02, - "grad_norm": 0.028839366510510445, - "learning_rate": 9.821930646672915e-06, - "loss": 0.0604, + "epoch": 23.0, + "grad_norm": 0.017108239233493805, + "learning_rate": 3.759752686589136e-05, + "loss": 0.0688, "step": 9760 }, { - "epoch": 29.02, - "grad_norm": 0.017779843881726265, - "learning_rate": 9.775070290534209e-06, - "loss": 0.0365, + "epoch": 23.0, + "grad_norm": 36.144142150878906, + "learning_rate": 3.757912557044016e-05, + "loss": 0.1642, "step": 9770 }, { - "epoch": 29.02, - "grad_norm": 0.003166783368214965, - "learning_rate": 9.728209934395503e-06, - "loss": 0.0053, + "epoch": 23.0, + "grad_norm": 0.028163446113467216, + "learning_rate": 3.756072427498896e-05, + "loss": 0.0751, "step": 9780 }, { - "epoch": 29.02, - "grad_norm": 0.0021175253205001354, - "learning_rate": 9.681349578256797e-06, - "loss": 0.0001, + "epoch": 23.0, + "grad_norm": 0.11214728653430939, + "learning_rate": 3.754232297953776e-05, + "loss": 0.0801, "step": 9790 }, { - "epoch": 29.02, - "grad_norm": 0.20540370047092438, - "learning_rate": 9.634489222118089e-06, - "loss": 0.0003, + "epoch": 23.0, + "grad_norm": 0.01722540520131588, + "learning_rate": 3.752392168408656e-05, + "loss": 0.0971, "step": 9800 }, { - "epoch": 29.02, - "grad_norm": 0.002469886327162385, - "learning_rate": 9.587628865979383e-06, - "loss": 0.0002, + "epoch": 23.0, + "grad_norm": 8.70751667022705, + "learning_rate": 3.750552038863536e-05, + "loss": 0.2075, "step": 9810 }, { - "epoch": 29.02, - "grad_norm": 0.012953460216522217, - "learning_rate": 9.540768509840675e-06, - "loss": 0.0271, + "epoch": 23.01, + "grad_norm": 0.03713701665401459, + "learning_rate": 3.748711909318416e-05, + "loss": 0.02, "step": 9820 }, { - "epoch": 29.02, - "grad_norm": 0.02899201773107052, - "learning_rate": 9.49390815370197e-06, - "loss": 0.0414, + "epoch": 23.01, + "grad_norm": 0.012665356509387493, + "learning_rate": 3.746871779773296e-05, + "loss": 0.0393, "step": 9830 }, { - "epoch": 29.02, - "grad_norm": 0.0029837109614163637, - "learning_rate": 9.447047797563262e-06, - "loss": 0.0002, + "epoch": 23.01, + "grad_norm": 0.6546823382377625, + "learning_rate": 3.745031650228176e-05, + "loss": 0.1762, "step": 9840 }, { - "epoch": 29.02, - "grad_norm": 61.894805908203125, - "learning_rate": 9.400187441424556e-06, - "loss": 0.0608, + "epoch": 23.01, + "grad_norm": 0.022101113572716713, + "learning_rate": 3.743191520683056e-05, + "loss": 0.0973, "step": 9850 }, { - "epoch": 29.02, - "grad_norm": 0.003072848543524742, - "learning_rate": 9.353327085285848e-06, - "loss": 0.0014, + "epoch": 23.01, + "grad_norm": 80.03093719482422, + "learning_rate": 3.741351391137936e-05, + "loss": 0.1646, "step": 9860 }, { - "epoch": 29.03, - "grad_norm": 0.0036488294135779142, - "learning_rate": 9.306466729147142e-06, - "loss": 0.001, + "epoch": 23.01, + "grad_norm": 13.314861297607422, + "learning_rate": 3.739511261592816e-05, + "loss": 0.1341, "step": 9870 }, { - "epoch": 29.03, - "grad_norm": 0.002994240028783679, - "learning_rate": 9.259606373008434e-06, - "loss": 0.0007, + "epoch": 23.01, + "grad_norm": 0.23819276690483093, + "learning_rate": 3.737671132047696e-05, + "loss": 0.1729, "step": 9880 }, { - "epoch": 29.03, - "grad_norm": 0.01359494123607874, - "learning_rate": 9.212746016869728e-06, - "loss": 0.0007, + "epoch": 23.01, + "grad_norm": 0.4850609600543976, + "learning_rate": 3.735831002502576e-05, + "loss": 0.1631, "step": 9890 }, { - "epoch": 29.03, - "grad_norm": 0.0029486478306353092, - "learning_rate": 9.165885660731022e-06, - "loss": 0.0002, - "step": 9900 - }, - { - "epoch": 29.03, - "eval_accuracy": 0.8166189111747851, - "eval_loss": 1.2678812742233276, - "eval_runtime": 33.8703, - "eval_samples_per_second": 20.608, - "eval_steps_per_second": 1.742, + "epoch": 23.01, + "grad_norm": 0.18072141706943512, + "learning_rate": 3.733990872957456e-05, + "loss": 0.0321, "step": 9900 }, { - "epoch": 30.0, - "grad_norm": 0.0020161494612693787, - "learning_rate": 9.119025304592316e-06, - "loss": 0.0002, + "epoch": 23.01, + "grad_norm": 0.23707066476345062, + "learning_rate": 3.732150743412336e-05, + "loss": 0.0487, "step": 9910 }, { - "epoch": 30.0, - "grad_norm": 0.001892567495815456, - "learning_rate": 9.072164948453609e-06, - "loss": 0.0069, + "epoch": 23.01, + "grad_norm": 0.026308251544833183, + "learning_rate": 3.730310613867217e-05, + "loss": 0.0343, "step": 9920 }, { - "epoch": 30.0, - "grad_norm": 0.003333768341690302, - "learning_rate": 9.025304592314903e-06, - "loss": 0.0017, + "epoch": 23.01, + "grad_norm": 0.01976301707327366, + "learning_rate": 3.7284704843220963e-05, + "loss": 0.0336, "step": 9930 }, { - "epoch": 30.0, - "grad_norm": 5.158819675445557, - "learning_rate": 8.978444236176195e-06, - "loss": 0.0472, + "epoch": 23.01, + "grad_norm": 0.024655556306242943, + "learning_rate": 3.7266303547769764e-05, + "loss": 0.1337, "step": 9940 }, { - "epoch": 30.0, - "grad_norm": 0.010453186929225922, - "learning_rate": 8.931583880037489e-06, - "loss": 0.0017, + "epoch": 23.01, + "grad_norm": 0.06664633005857468, + "learning_rate": 3.724790225231857e-05, + "loss": 0.1152, "step": 9950 }, { - "epoch": 30.01, - "grad_norm": 0.006460112985223532, - "learning_rate": 8.884723523898781e-06, - "loss": 0.0028, + "epoch": 23.01, + "grad_norm": 6.069345474243164, + "learning_rate": 3.7229500956867364e-05, + "loss": 0.0569, "step": 9960 }, { - "epoch": 30.01, - "grad_norm": 1.868633508682251, - "learning_rate": 8.837863167760075e-06, - "loss": 0.0004, + "epoch": 23.01, + "grad_norm": 0.15154345333576202, + "learning_rate": 3.7211099661416164e-05, + "loss": 0.114, "step": 9970 }, { - "epoch": 30.01, - "grad_norm": 0.0062409755773842335, - "learning_rate": 8.791002811621368e-06, - "loss": 0.0182, + "epoch": 23.01, + "grad_norm": 6.500603675842285, + "learning_rate": 3.719269836596497e-05, + "loss": 0.1888, "step": 9980 }, { - "epoch": 30.01, - "grad_norm": 0.002611867617815733, - "learning_rate": 8.744142455482662e-06, - "loss": 0.0002, + "epoch": 23.01, + "grad_norm": 0.02962004393339157, + "learning_rate": 3.7174297070513764e-05, + "loss": 0.1188, "step": 9990 }, { - "epoch": 30.01, - "grad_norm": 0.003704243106767535, - "learning_rate": 8.697282099343956e-06, - "loss": 0.0001, + "epoch": 23.01, + "grad_norm": 30.147991180419922, + "learning_rate": 3.7155895775062564e-05, + "loss": 0.126, "step": 10000 }, { - "epoch": 30.01, - "grad_norm": 0.0019702170975506306, - "learning_rate": 8.65042174320525e-06, - "loss": 0.0574, + "epoch": 23.01, + "grad_norm": 34.535369873046875, + "learning_rate": 3.7137494479611364e-05, + "loss": 0.1595, "step": 10010 }, { - "epoch": 30.01, - "grad_norm": 0.006285363808274269, - "learning_rate": 8.603561387066542e-06, - "loss": 0.0001, + "epoch": 23.01, + "grad_norm": 0.49603599309921265, + "learning_rate": 3.7119093184160165e-05, + "loss": 0.0493, "step": 10020 }, { - "epoch": 30.01, - "grad_norm": 0.0028792780358344316, - "learning_rate": 8.556701030927836e-06, - "loss": 0.0059, + "epoch": 23.01, + "grad_norm": 1.476478934288025, + "learning_rate": 3.7100691888708965e-05, + "loss": 0.2013, "step": 10030 }, { - "epoch": 30.01, - "grad_norm": 0.0020247281063348055, - "learning_rate": 8.509840674789128e-06, - "loss": 0.0005, + "epoch": 23.01, + "grad_norm": 1.7116336822509766, + "learning_rate": 3.7082290593257765e-05, + "loss": 0.1436, "step": 10040 }, { - "epoch": 30.01, - "grad_norm": 0.0075753917917609215, - "learning_rate": 8.462980318650422e-06, - "loss": 0.0039, + "epoch": 23.01, + "grad_norm": 0.0445764921605587, + "learning_rate": 3.706388929780657e-05, + "loss": 0.0622, "step": 10050 }, { - "epoch": 30.01, - "grad_norm": 0.0014275303110480309, - "learning_rate": 8.416119962511715e-06, - "loss": 0.0304, + "epoch": 23.01, + "grad_norm": 0.06141388788819313, + "learning_rate": 3.7045488002355365e-05, + "loss": 0.0872, "step": 10060 }, { - "epoch": 30.01, - "grad_norm": 0.0023228314239531755, - "learning_rate": 8.369259606373009e-06, - "loss": 0.0002, + "epoch": 23.01, + "grad_norm": 0.41972872614860535, + "learning_rate": 3.7027086706904165e-05, + "loss": 0.1058, "step": 10070 }, { - "epoch": 30.02, - "grad_norm": 0.2118721902370453, - "learning_rate": 8.322399250234301e-06, - "loss": 0.0763, + "epoch": 23.01, + "grad_norm": 96.0462646484375, + "learning_rate": 3.700868541145297e-05, + "loss": 0.1692, "step": 10080 }, { - "epoch": 30.02, - "grad_norm": 0.12885122001171112, - "learning_rate": 8.275538894095595e-06, - "loss": 0.0003, + "epoch": 23.01, + "eval_accuracy": 0.7128378378378378, + "eval_loss": 1.39983069896698, + "eval_runtime": 38.9582, + "eval_samples_per_second": 22.794, + "eval_steps_per_second": 1.899, + "step": 10080 + }, + { + "epoch": 24.0, + "grad_norm": 0.01138946134597063, + "learning_rate": 3.6990284116001765e-05, + "loss": 0.1309, "step": 10090 }, { - "epoch": 30.02, - "grad_norm": 0.01231481321156025, - "learning_rate": 8.228678537956889e-06, - "loss": 0.0004, + "epoch": 24.0, + "grad_norm": 3.2111175060272217, + "learning_rate": 3.6971882820550565e-05, + "loss": 0.0148, "step": 10100 }, { - "epoch": 30.02, - "grad_norm": 0.003690751502290368, - "learning_rate": 8.181818181818183e-06, - "loss": 0.0019, + "epoch": 24.0, + "grad_norm": 0.03468929976224899, + "learning_rate": 3.695348152509937e-05, + "loss": 0.0766, "step": 10110 }, { - "epoch": 30.02, - "grad_norm": 0.0029746764339506626, - "learning_rate": 8.134957825679477e-06, - "loss": 0.058, + "epoch": 24.0, + "grad_norm": 0.09320088475942612, + "learning_rate": 3.6935080229648166e-05, + "loss": 0.0566, "step": 10120 }, { - "epoch": 30.02, - "grad_norm": 0.003089478937909007, - "learning_rate": 8.08809746954077e-06, - "loss": 0.0789, + "epoch": 24.0, + "grad_norm": 0.05267590656876564, + "learning_rate": 3.6916678934196966e-05, + "loss": 0.0825, "step": 10130 }, { - "epoch": 30.02, - "grad_norm": 0.0020090111065655947, - "learning_rate": 8.041237113402063e-06, - "loss": 0.0001, + "epoch": 24.0, + "grad_norm": 0.060753244906663895, + "learning_rate": 3.689827763874577e-05, + "loss": 0.0806, "step": 10140 }, { - "epoch": 30.02, - "grad_norm": 0.23149579763412476, - "learning_rate": 7.994376757263356e-06, - "loss": 0.0285, + "epoch": 24.0, + "grad_norm": 0.0423295758664608, + "learning_rate": 3.6879876343294566e-05, + "loss": 0.0056, "step": 10150 }, { - "epoch": 30.02, - "grad_norm": 0.006695437245070934, - "learning_rate": 7.94751640112465e-06, - "loss": 0.0811, + "epoch": 24.0, + "grad_norm": 0.01127055287361145, + "learning_rate": 3.6861475047843366e-05, + "loss": 0.1037, "step": 10160 }, { - "epoch": 30.02, - "grad_norm": 0.03278821334242821, - "learning_rate": 7.900656044985942e-06, - "loss": 0.0118, + "epoch": 24.0, + "grad_norm": 0.08633752912282944, + "learning_rate": 3.684307375239217e-05, + "loss": 0.1213, "step": 10170 }, { - "epoch": 30.02, - "grad_norm": 0.002686940599232912, - "learning_rate": 7.853795688847236e-06, - "loss": 0.0013, + "epoch": 24.0, + "grad_norm": 30.071001052856445, + "learning_rate": 3.682467245694097e-05, + "loss": 0.1545, "step": 10180 }, { - "epoch": 30.02, - "grad_norm": 0.004242202267050743, - "learning_rate": 7.806935332708528e-06, - "loss": 0.0229, + "epoch": 24.0, + "grad_norm": 0.028279367834329605, + "learning_rate": 3.6806271161489766e-05, + "loss": 0.0697, "step": 10190 }, { - "epoch": 30.03, - "grad_norm": 0.12063409388065338, - "learning_rate": 7.760074976569822e-06, - "loss": 0.0447, + "epoch": 24.0, + "grad_norm": 0.04578516632318497, + "learning_rate": 3.678786986603857e-05, + "loss": 0.1627, "step": 10200 }, { - "epoch": 30.03, - "grad_norm": 0.0015561155742034316, - "learning_rate": 7.713214620431115e-06, - "loss": 0.0445, + "epoch": 24.0, + "grad_norm": 0.11059720069169998, + "learning_rate": 3.6769468570587373e-05, + "loss": 0.0776, "step": 10210 }, { - "epoch": 30.03, - "grad_norm": 0.003858069656416774, - "learning_rate": 7.666354264292409e-06, - "loss": 0.0108, + "epoch": 24.0, + "grad_norm": 53.34998321533203, + "learning_rate": 3.675106727513617e-05, + "loss": 0.0741, "step": 10220 }, { - "epoch": 30.03, - "grad_norm": 0.001974466722458601, - "learning_rate": 7.619493908153702e-06, - "loss": 0.0018, - "step": 10230 - }, - { - "epoch": 30.03, - "eval_accuracy": 0.7965616045845272, - "eval_loss": 1.3747639656066895, - "eval_runtime": 33.9072, - "eval_samples_per_second": 20.586, - "eval_steps_per_second": 1.74, + "epoch": 24.0, + "grad_norm": 0.013538829982280731, + "learning_rate": 3.6732665979684974e-05, + "loss": 0.1207, "step": 10230 }, { - "epoch": 31.0, - "grad_norm": 0.0030528667848557234, - "learning_rate": 7.572633552014996e-06, - "loss": 0.0006, + "epoch": 24.01, + "grad_norm": 0.0954127162694931, + "learning_rate": 3.6714264684233774e-05, + "loss": 0.1385, "step": 10240 }, { - "epoch": 31.0, - "grad_norm": 0.07026159763336182, - "learning_rate": 7.525773195876289e-06, - "loss": 0.0002, + "epoch": 24.01, + "grad_norm": 31.611495971679688, + "learning_rate": 3.669586338878257e-05, + "loss": 0.1752, "step": 10250 }, { - "epoch": 31.0, - "grad_norm": 0.013108909130096436, - "learning_rate": 7.478912839737583e-06, - "loss": 0.0083, + "epoch": 24.01, + "grad_norm": 0.2513565719127655, + "learning_rate": 3.6677462093331374e-05, + "loss": 0.0656, "step": 10260 }, { - "epoch": 31.0, - "grad_norm": 0.003818312892690301, - "learning_rate": 7.432052483598875e-06, - "loss": 0.0001, + "epoch": 24.01, + "grad_norm": 0.0673178881406784, + "learning_rate": 3.6659060797880174e-05, + "loss": 0.0656, "step": 10270 }, { - "epoch": 31.0, - "grad_norm": 53.924110412597656, - "learning_rate": 7.385192127460169e-06, - "loss": 0.0049, + "epoch": 24.01, + "grad_norm": 0.04620659723877907, + "learning_rate": 3.664065950242897e-05, + "loss": 0.102, "step": 10280 }, { - "epoch": 31.01, - "grad_norm": 6.479648590087891, - "learning_rate": 7.3383317713214616e-06, - "loss": 0.066, + "epoch": 24.01, + "grad_norm": 11.861926078796387, + "learning_rate": 3.6622258206977774e-05, + "loss": 0.0844, "step": 10290 }, { - "epoch": 31.01, - "grad_norm": 0.005377662368118763, - "learning_rate": 7.2914714151827556e-06, - "loss": 0.0042, + "epoch": 24.01, + "grad_norm": 8.202778816223145, + "learning_rate": 3.6603856911526575e-05, + "loss": 0.1471, "step": 10300 }, { - "epoch": 31.01, - "grad_norm": 0.0028942872304469347, - "learning_rate": 7.244611059044049e-06, - "loss": 0.0006, + "epoch": 24.01, + "grad_norm": 2.2646937370300293, + "learning_rate": 3.6585455616075375e-05, + "loss": 0.0442, "step": 10310 }, { - "epoch": 31.01, - "grad_norm": 0.0026112585328519344, - "learning_rate": 7.197750702905343e-06, - "loss": 0.0074, + "epoch": 24.01, + "grad_norm": 0.02684628963470459, + "learning_rate": 3.6567054320624175e-05, + "loss": 0.0659, "step": 10320 }, { - "epoch": 31.01, - "grad_norm": 0.00254084006883204, - "learning_rate": 7.150890346766635e-06, - "loss": 0.0001, + "epoch": 24.01, + "grad_norm": 0.07974495738744736, + "learning_rate": 3.6548653025172975e-05, + "loss": 0.0989, "step": 10330 }, { - "epoch": 31.01, - "grad_norm": 0.027123264968395233, - "learning_rate": 7.104029990627929e-06, - "loss": 0.0002, + "epoch": 24.01, + "grad_norm": 12.994035720825195, + "learning_rate": 3.6530251729721775e-05, + "loss": 0.0064, "step": 10340 }, { - "epoch": 31.01, - "grad_norm": 0.0028091182466596365, - "learning_rate": 7.057169634489222e-06, - "loss": 0.0005, + "epoch": 24.01, + "grad_norm": 0.007294784765690565, + "learning_rate": 3.6511850434270575e-05, + "loss": 0.0494, "step": 10350 }, { - "epoch": 31.01, - "grad_norm": 0.0014785215025767684, - "learning_rate": 7.010309278350516e-06, - "loss": 0.0001, + "epoch": 24.01, + "grad_norm": 0.032260965555906296, + "learning_rate": 3.6493449138819375e-05, + "loss": 0.0361, "step": 10360 }, { - "epoch": 31.01, - "grad_norm": 0.002326256362721324, - "learning_rate": 6.9634489222118085e-06, - "loss": 0.0312, + "epoch": 24.01, + "grad_norm": 51.3294677734375, + "learning_rate": 3.6475047843368175e-05, + "loss": 0.2203, "step": 10370 }, { - "epoch": 31.01, - "grad_norm": 0.020408490672707558, - "learning_rate": 6.9165885660731026e-06, - "loss": 0.0289, + "epoch": 24.01, + "grad_norm": 0.16499635577201843, + "learning_rate": 3.6456646547916975e-05, + "loss": 0.0403, "step": 10380 }, { - "epoch": 31.01, - "grad_norm": 0.0020945239812135696, - "learning_rate": 6.869728209934395e-06, - "loss": 0.0003, + "epoch": 24.01, + "grad_norm": 0.5237520337104797, + "learning_rate": 3.6438245252465776e-05, + "loss": 0.1665, "step": 10390 }, { - "epoch": 31.01, - "grad_norm": 0.050324421375989914, - "learning_rate": 6.822867853795689e-06, - "loss": 0.0001, + "epoch": 24.01, + "grad_norm": 30.010053634643555, + "learning_rate": 3.6419843957014576e-05, + "loss": 0.0648, "step": 10400 }, { - "epoch": 31.02, - "grad_norm": 0.09508836269378662, - "learning_rate": 6.776007497656983e-06, - "loss": 0.0003, + "epoch": 24.01, + "grad_norm": 0.030312929302453995, + "learning_rate": 3.6401442661563376e-05, + "loss": 0.127, "step": 10410 }, { - "epoch": 31.02, - "grad_norm": 0.0019429631065577269, - "learning_rate": 6.729147141518276e-06, - "loss": 0.0076, + "epoch": 24.01, + "grad_norm": 0.0383584164083004, + "learning_rate": 3.6383041366112176e-05, + "loss": 0.0259, "step": 10420 }, { - "epoch": 31.02, - "grad_norm": 0.0016007090453058481, - "learning_rate": 6.68228678537957e-06, - "loss": 0.0132, + "epoch": 24.01, + "grad_norm": 26.257125854492188, + "learning_rate": 3.6364640070660976e-05, + "loss": 0.0096, "step": 10430 }, { - "epoch": 31.02, - "grad_norm": 4.64786958694458, - "learning_rate": 6.635426429240862e-06, - "loss": 0.0006, + "epoch": 24.01, + "grad_norm": 23.165929794311523, + "learning_rate": 3.6346238775209776e-05, + "loss": 0.1543, "step": 10440 }, { - "epoch": 31.02, - "grad_norm": 63.0573616027832, - "learning_rate": 6.588566073102156e-06, - "loss": 0.0605, + "epoch": 24.01, + "grad_norm": 18.72815704345703, + "learning_rate": 3.6327837479758576e-05, + "loss": 0.1438, "step": 10450 }, { - "epoch": 31.02, - "grad_norm": 0.001429658499546349, - "learning_rate": 6.541705716963449e-06, - "loss": 0.0001, + "epoch": 24.01, + "grad_norm": 0.012924473732709885, + "learning_rate": 3.6309436184307376e-05, + "loss": 0.0911, "step": 10460 }, { - "epoch": 31.02, - "grad_norm": 0.012436199001967907, - "learning_rate": 6.494845360824743e-06, - "loss": 0.0055, + "epoch": 24.01, + "grad_norm": 16.28953742980957, + "learning_rate": 3.6291034888856176e-05, + "loss": 0.1934, "step": 10470 }, { - "epoch": 31.02, - "grad_norm": 0.002089001704007387, - "learning_rate": 6.447985004686036e-06, - "loss": 0.0004, + "epoch": 24.01, + "grad_norm": 2.0924813747406006, + "learning_rate": 3.627263359340498e-05, + "loss": 0.1541, "step": 10480 }, { - "epoch": 31.02, - "grad_norm": 0.006997866556048393, - "learning_rate": 6.40112464854733e-06, - "loss": 0.1141, + "epoch": 24.01, + "grad_norm": 10.997538566589355, + "learning_rate": 3.625423229795378e-05, + "loss": 0.1034, "step": 10490 }, { - "epoch": 31.02, - "grad_norm": 0.018327118828892708, - "learning_rate": 6.354264292408622e-06, - "loss": 0.0071, + "epoch": 24.01, + "grad_norm": 0.01888904720544815, + "learning_rate": 3.623583100250258e-05, + "loss": 0.0347, "step": 10500 }, { - "epoch": 31.02, - "grad_norm": 0.30546697974205017, - "learning_rate": 6.307403936269916e-06, - "loss": 0.0002, + "epoch": 24.01, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 1.4803065061569214, + "eval_runtime": 38.8312, + "eval_samples_per_second": 22.868, + "eval_steps_per_second": 1.906, + "step": 10500 + }, + { + "epoch": 25.0, + "grad_norm": 32.71762466430664, + "learning_rate": 3.621742970705138e-05, + "loss": 0.0376, "step": 10510 }, { - "epoch": 31.02, - "grad_norm": 64.27317810058594, - "learning_rate": 6.260543580131209e-06, - "loss": 0.0119, + "epoch": 25.0, + "grad_norm": 1.0197267532348633, + "learning_rate": 3.619902841160018e-05, + "loss": 0.0717, "step": 10520 }, { - "epoch": 31.03, - "grad_norm": 0.0014340607449412346, - "learning_rate": 6.2136832239925025e-06, - "loss": 0.0002, + "epoch": 25.0, + "grad_norm": 0.022604642435908318, + "learning_rate": 3.618062711614898e-05, + "loss": 0.1172, "step": 10530 }, { - "epoch": 31.03, - "grad_norm": 0.012228988111019135, - "learning_rate": 6.166822867853796e-06, - "loss": 0.0011, + "epoch": 25.0, + "grad_norm": 25.57185173034668, + "learning_rate": 3.6162225820697784e-05, + "loss": 0.1384, "step": 10540 }, { - "epoch": 31.03, - "grad_norm": 0.0018130154348909855, - "learning_rate": 6.11996251171509e-06, - "loss": 0.0012, + "epoch": 25.0, + "grad_norm": 0.04132658615708351, + "learning_rate": 3.614382452524658e-05, + "loss": 0.0613, "step": 10550 }, { - "epoch": 31.03, - "grad_norm": 0.0017277301521971822, - "learning_rate": 6.073102155576383e-06, - "loss": 0.0371, - "step": 10560 - }, - { - "epoch": 31.03, - "eval_accuracy": 0.8080229226361032, - "eval_loss": 1.40940523147583, - "eval_runtime": 33.9429, - "eval_samples_per_second": 20.564, - "eval_steps_per_second": 1.738, + "epoch": 25.0, + "grad_norm": 0.005044014658778906, + "learning_rate": 3.612542322979538e-05, + "loss": 0.0643, "step": 10560 }, { - "epoch": 32.0, - "grad_norm": 0.0025587286800146103, - "learning_rate": 6.026241799437676e-06, - "loss": 0.0003, + "epoch": 25.0, + "grad_norm": 32.0972900390625, + "learning_rate": 3.6107021934344184e-05, + "loss": 0.1037, "step": 10570 }, { - "epoch": 32.0, - "grad_norm": 0.002236501080915332, - "learning_rate": 5.97938144329897e-06, - "loss": 0.009, + "epoch": 25.0, + "grad_norm": 0.08468390256166458, + "learning_rate": 3.608862063889298e-05, + "loss": 0.0824, "step": 10580 }, { - "epoch": 32.0, - "grad_norm": 0.00986202247440815, - "learning_rate": 5.932521087160263e-06, - "loss": 0.0003, + "epoch": 25.0, + "grad_norm": 0.014037560671567917, + "learning_rate": 3.607021934344178e-05, + "loss": 0.0706, "step": 10590 }, { - "epoch": 32.0, - "grad_norm": 0.001730900607071817, - "learning_rate": 5.885660731021556e-06, - "loss": 0.0176, + "epoch": 25.0, + "grad_norm": 0.02923491969704628, + "learning_rate": 3.6051818047990585e-05, + "loss": 0.0577, "step": 10600 }, { - "epoch": 32.0, - "grad_norm": 0.004606322385370731, - "learning_rate": 5.8388003748828495e-06, - "loss": 0.0001, + "epoch": 25.0, + "grad_norm": 0.03039034642279148, + "learning_rate": 3.603341675253938e-05, + "loss": 0.0432, "step": 10610 }, { - "epoch": 32.01, - "grad_norm": 0.00272651226259768, - "learning_rate": 5.791940018744143e-06, - "loss": 0.0002, + "epoch": 25.0, + "grad_norm": 0.15451429784297943, + "learning_rate": 3.601501545708818e-05, + "loss": 0.1319, "step": 10620 }, { - "epoch": 32.01, - "grad_norm": 0.0064768255688250065, - "learning_rate": 5.745079662605436e-06, - "loss": 0.0012, + "epoch": 25.0, + "grad_norm": 37.398616790771484, + "learning_rate": 3.5996614161636985e-05, + "loss": 0.1519, "step": 10630 }, { - "epoch": 32.01, - "grad_norm": 0.002017725259065628, - "learning_rate": 5.69821930646673e-06, - "loss": 0.0001, + "epoch": 25.0, + "grad_norm": 1.254022240638733, + "learning_rate": 3.597821286618578e-05, + "loss": 0.0676, "step": 10640 }, { - "epoch": 32.01, - "grad_norm": 0.0018518833676353097, - "learning_rate": 5.651358950328023e-06, - "loss": 0.0001, + "epoch": 25.0, + "grad_norm": 18.051490783691406, + "learning_rate": 3.595981157073458e-05, + "loss": 0.057, "step": 10650 }, { - "epoch": 32.01, - "grad_norm": 0.002825092989951372, - "learning_rate": 5.604498594189316e-06, - "loss": 0.0002, + "epoch": 25.01, + "grad_norm": 0.7600337266921997, + "learning_rate": 3.5941410275283385e-05, + "loss": 0.1598, "step": 10660 }, { - "epoch": 32.01, - "grad_norm": 0.0021178831811994314, - "learning_rate": 5.557638238050609e-06, - "loss": 0.0013, + "epoch": 25.01, + "grad_norm": 20.58978843688965, + "learning_rate": 3.5923008979832186e-05, + "loss": 0.3004, "step": 10670 }, { - "epoch": 32.01, - "grad_norm": 0.0012423048028722405, - "learning_rate": 5.5107778819119025e-06, - "loss": 0.072, + "epoch": 25.01, + "grad_norm": 0.7854387164115906, + "learning_rate": 3.590460768438098e-05, + "loss": 0.1261, "step": 10680 }, { - "epoch": 32.01, - "grad_norm": 0.002971925074234605, - "learning_rate": 5.4639175257731965e-06, - "loss": 0.0001, + "epoch": 25.01, + "grad_norm": 31.682432174682617, + "learning_rate": 3.5886206388929786e-05, + "loss": 0.1037, "step": 10690 }, { - "epoch": 32.01, - "grad_norm": 0.001466022222302854, - "learning_rate": 5.41705716963449e-06, - "loss": 0.0001, + "epoch": 25.01, + "grad_norm": 1.783712387084961, + "learning_rate": 3.5867805093478586e-05, + "loss": 0.0997, "step": 10700 }, { - "epoch": 32.01, - "grad_norm": 0.002207581652328372, - "learning_rate": 5.370196813495783e-06, - "loss": 0.0001, + "epoch": 25.01, + "grad_norm": 9.035534858703613, + "learning_rate": 3.584940379802738e-05, + "loss": 0.0946, "step": 10710 }, { - "epoch": 32.01, - "grad_norm": 0.0013427763478830457, - "learning_rate": 5.323336457357076e-06, - "loss": 0.0543, + "epoch": 25.01, + "grad_norm": 21.357057571411133, + "learning_rate": 3.5831002502576186e-05, + "loss": 0.0336, "step": 10720 }, { - "epoch": 32.01, - "grad_norm": 0.0013656431110575795, - "learning_rate": 5.276476101218369e-06, - "loss": 0.0191, + "epoch": 25.01, + "grad_norm": 7.840404033660889, + "learning_rate": 3.5812601207124986e-05, + "loss": 0.1378, "step": 10730 }, { - "epoch": 32.02, - "grad_norm": 0.5649217367172241, - "learning_rate": 5.229615745079663e-06, - "loss": 0.0178, + "epoch": 25.01, + "grad_norm": 10.447696685791016, + "learning_rate": 3.579419991167378e-05, + "loss": 0.0945, "step": 10740 }, { - "epoch": 32.02, - "grad_norm": 0.0019292469369247556, - "learning_rate": 5.182755388940956e-06, - "loss": 0.008, + "epoch": 25.01, + "grad_norm": 0.049855832010507584, + "learning_rate": 3.5775798616222587e-05, + "loss": 0.0878, "step": 10750 }, { - "epoch": 32.02, - "grad_norm": 0.005805399268865585, - "learning_rate": 5.1358950328022495e-06, - "loss": 0.0001, + "epoch": 25.01, + "grad_norm": 16.85457992553711, + "learning_rate": 3.575739732077139e-05, + "loss": 0.0705, "step": 10760 }, { - "epoch": 32.02, - "grad_norm": 1.1440871953964233, - "learning_rate": 5.089034676663543e-06, - "loss": 0.067, + "epoch": 25.01, + "grad_norm": 0.26030638813972473, + "learning_rate": 3.573899602532018e-05, + "loss": 0.0213, "step": 10770 }, { - "epoch": 32.02, - "grad_norm": 0.0036515570245683193, - "learning_rate": 5.042174320524836e-06, - "loss": 0.0027, + "epoch": 25.01, + "grad_norm": 5.856488227844238, + "learning_rate": 3.572059472986899e-05, + "loss": 0.0092, "step": 10780 }, { - "epoch": 32.02, - "grad_norm": 0.0021176172886043787, - "learning_rate": 4.995313964386129e-06, - "loss": 0.0175, + "epoch": 25.01, + "grad_norm": 0.013858279213309288, + "learning_rate": 3.570219343441779e-05, + "loss": 0.1014, "step": 10790 }, { - "epoch": 32.02, - "grad_norm": 0.04823027923703194, - "learning_rate": 4.948453608247423e-06, - "loss": 0.0059, + "epoch": 25.01, + "grad_norm": 12.681320190429688, + "learning_rate": 3.568379213896659e-05, + "loss": 0.0263, "step": 10800 }, { - "epoch": 32.02, - "grad_norm": 0.006527293939143419, - "learning_rate": 4.901593252108716e-06, - "loss": 0.0001, + "epoch": 25.01, + "grad_norm": 12.907841682434082, + "learning_rate": 3.566539084351539e-05, + "loss": 0.1036, "step": 10810 }, { - "epoch": 32.02, - "grad_norm": 0.02050858922302723, - "learning_rate": 4.854732895970009e-06, - "loss": 0.0013, + "epoch": 25.01, + "grad_norm": 0.20276179909706116, + "learning_rate": 3.564698954806419e-05, + "loss": 0.0841, "step": 10820 }, { - "epoch": 32.02, - "grad_norm": 0.0015726288547739387, - "learning_rate": 4.8078725398313025e-06, - "loss": 0.0001, + "epoch": 25.01, + "grad_norm": 0.01858721859753132, + "learning_rate": 3.562858825261299e-05, + "loss": 0.191, "step": 10830 }, { - "epoch": 32.02, - "grad_norm": 0.00158556061796844, - "learning_rate": 4.761012183692596e-06, - "loss": 0.0394, + "epoch": 25.01, + "grad_norm": 0.039093319326639175, + "learning_rate": 3.561018695716178e-05, + "loss": 0.0442, "step": 10840 }, { - "epoch": 32.02, - "grad_norm": 0.0019515565363690257, - "learning_rate": 4.71415182755389e-06, - "loss": 0.0048, + "epoch": 25.01, + "grad_norm": 35.638572692871094, + "learning_rate": 3.559178566171059e-05, + "loss": 0.0427, "step": 10850 }, { - "epoch": 32.03, - "grad_norm": 0.0021331189200282097, - "learning_rate": 4.667291471415184e-06, - "loss": 0.0001, + "epoch": 25.01, + "grad_norm": 0.050236549228429794, + "learning_rate": 3.557338436625939e-05, + "loss": 0.0294, "step": 10860 }, { - "epoch": 32.03, - "grad_norm": 0.002155827358365059, - "learning_rate": 4.620431115276477e-06, - "loss": 0.0003, + "epoch": 25.01, + "grad_norm": 0.2372741997241974, + "learning_rate": 3.555498307080818e-05, + "loss": 0.0848, "step": 10870 }, { - "epoch": 32.03, - "grad_norm": 0.0019439981551840901, - "learning_rate": 4.57357075913777e-06, - "loss": 0.0002, + "epoch": 25.01, + "grad_norm": 0.03720271214842796, + "learning_rate": 3.553658177535699e-05, + "loss": 0.022, "step": 10880 }, { - "epoch": 32.03, - "grad_norm": 0.0034947495441883802, - "learning_rate": 4.526710402999063e-06, - "loss": 0.0157, - "step": 10890 - }, - { - "epoch": 32.03, - "eval_accuracy": 0.8022922636103151, - "eval_loss": 1.4391096830368042, - "eval_runtime": 33.8979, - "eval_samples_per_second": 20.591, - "eval_steps_per_second": 1.741, + "epoch": 25.01, + "grad_norm": 19.287675857543945, + "learning_rate": 3.551818047990579e-05, + "loss": 0.083, "step": 10890 }, { - "epoch": 33.0, - "grad_norm": 0.007783001288771629, - "learning_rate": 4.479850046860356e-06, - "loss": 0.0002, + "epoch": 25.01, + "grad_norm": 0.08347965776920319, + "learning_rate": 3.549977918445458e-05, + "loss": 0.0453, "step": 10900 }, { - "epoch": 33.0, - "grad_norm": 0.004935835022479296, - "learning_rate": 4.43298969072165e-06, - "loss": 0.0065, + "epoch": 25.01, + "grad_norm": 22.4985294342041, + "learning_rate": 3.548137788900339e-05, + "loss": 0.1279, "step": 10910 }, { - "epoch": 33.0, - "grad_norm": 0.0015104643534868956, - "learning_rate": 4.3861293345829435e-06, - "loss": 0.0765, + "epoch": 25.01, + "grad_norm": 0.03293849155306816, + "learning_rate": 3.546297659355219e-05, + "loss": 0.0396, "step": 10920 }, { - "epoch": 33.0, - "grad_norm": 0.0014216667041182518, - "learning_rate": 4.339268978444237e-06, - "loss": 0.0018, + "epoch": 25.01, + "eval_accuracy": 0.7004504504504504, + "eval_loss": 1.6456875801086426, + "eval_runtime": 38.6971, + "eval_samples_per_second": 22.947, + "eval_steps_per_second": 1.912, + "step": 10920 + }, + { + "epoch": 26.0, + "grad_norm": 0.10634168982505798, + "learning_rate": 3.544457529810099e-05, + "loss": 0.1908, "step": 10930 }, { - "epoch": 33.0, - "grad_norm": 0.022745607420802116, - "learning_rate": 4.29240862230553e-06, - "loss": 0.0081, + "epoch": 26.0, + "grad_norm": 0.012839309871196747, + "learning_rate": 3.542617400264979e-05, + "loss": 0.1257, "step": 10940 }, { - "epoch": 33.01, - "grad_norm": 0.0021193595603108406, - "learning_rate": 4.245548266166823e-06, - "loss": 0.0003, + "epoch": 26.0, + "grad_norm": 20.624330520629883, + "learning_rate": 3.540777270719859e-05, + "loss": 0.1339, "step": 10950 }, { - "epoch": 33.01, - "grad_norm": 0.0011568386107683182, - "learning_rate": 4.198687910028116e-06, - "loss": 0.0001, + "epoch": 26.0, + "grad_norm": 0.8575695753097534, + "learning_rate": 3.538937141174739e-05, + "loss": 0.1957, "step": 10960 }, { - "epoch": 33.01, - "grad_norm": 0.019418692216277122, - "learning_rate": 4.15182755388941e-06, - "loss": 0.1781, + "epoch": 26.0, + "grad_norm": 0.24717579782009125, + "learning_rate": 3.537097011629619e-05, + "loss": 0.0824, "step": 10970 }, { - "epoch": 33.01, - "grad_norm": 34.7269401550293, - "learning_rate": 4.104967197750703e-06, - "loss": 0.0067, + "epoch": 26.0, + "grad_norm": 0.057337842881679535, + "learning_rate": 3.535256882084499e-05, + "loss": 0.0173, "step": 10980 }, { - "epoch": 33.01, - "grad_norm": 0.0017960992408916354, - "learning_rate": 4.0581068416119964e-06, - "loss": 0.0001, + "epoch": 26.0, + "grad_norm": 1.2183499336242676, + "learning_rate": 3.533416752539379e-05, + "loss": 0.0373, "step": 10990 }, { - "epoch": 33.01, - "grad_norm": 0.01256940234452486, - "learning_rate": 4.01124648547329e-06, - "loss": 0.0003, + "epoch": 26.0, + "grad_norm": 0.007798346225172281, + "learning_rate": 3.531576622994259e-05, + "loss": 0.0586, "step": 11000 }, { - "epoch": 33.01, - "grad_norm": 0.0063404180109500885, - "learning_rate": 3.964386129334583e-06, - "loss": 0.0001, + "epoch": 26.0, + "grad_norm": 14.570333480834961, + "learning_rate": 3.529736493449139e-05, + "loss": 0.0178, "step": 11010 }, { - "epoch": 33.01, - "grad_norm": 0.018559589982032776, - "learning_rate": 3.917525773195877e-06, - "loss": 0.0001, + "epoch": 26.0, + "grad_norm": 17.163837432861328, + "learning_rate": 3.527896363904019e-05, + "loss": 0.1456, "step": 11020 }, { - "epoch": 33.01, - "grad_norm": 0.0015321632381528616, - "learning_rate": 3.87066541705717e-06, - "loss": 0.0001, + "epoch": 26.0, + "grad_norm": 0.017764287069439888, + "learning_rate": 3.526056234358899e-05, + "loss": 0.0556, "step": 11030 }, { - "epoch": 33.01, - "grad_norm": 0.020633699372410774, - "learning_rate": 3.823805060918463e-06, - "loss": 0.0001, + "epoch": 26.0, + "grad_norm": 0.01081930659711361, + "learning_rate": 3.524216104813779e-05, + "loss": 0.1175, "step": 11040 }, { - "epoch": 33.01, - "grad_norm": 0.0021525041665881872, - "learning_rate": 3.7769447047797563e-06, - "loss": 0.0601, + "epoch": 26.0, + "grad_norm": 0.011412628926336765, + "learning_rate": 3.522375975268659e-05, + "loss": 0.039, "step": 11050 }, { - "epoch": 33.01, - "grad_norm": 0.0019328080816194415, - "learning_rate": 3.73008434864105e-06, - "loss": 0.0002, + "epoch": 26.0, + "grad_norm": 0.015809055417776108, + "learning_rate": 3.520535845723539e-05, + "loss": 0.0942, "step": 11060 }, { - "epoch": 33.02, - "grad_norm": 0.004199670627713203, - "learning_rate": 3.683223992502343e-06, - "loss": 0.0377, + "epoch": 26.0, + "grad_norm": 0.015211720019578934, + "learning_rate": 3.518695716178419e-05, + "loss": 0.1319, "step": 11070 }, { - "epoch": 33.02, - "grad_norm": 0.0021045091561973095, - "learning_rate": 3.636363636363636e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 47.66071319580078, + "learning_rate": 3.516855586633299e-05, + "loss": 0.1854, "step": 11080 }, { - "epoch": 33.02, - "grad_norm": 0.0019059345358982682, - "learning_rate": 3.5895032802249297e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.013697043061256409, + "learning_rate": 3.515015457088179e-05, + "loss": 0.0635, "step": 11090 }, { - "epoch": 33.02, - "grad_norm": 0.002579369815066457, - "learning_rate": 3.542642924086223e-06, - "loss": 0.0373, + "epoch": 26.01, + "grad_norm": 0.049156658351421356, + "learning_rate": 3.513175327543059e-05, + "loss": 0.0205, "step": 11100 }, { - "epoch": 33.02, - "grad_norm": 0.0014094141079112887, - "learning_rate": 3.4957825679475165e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 1.8962242603302002, + "learning_rate": 3.511335197997939e-05, + "loss": 0.0018, "step": 11110 }, { - "epoch": 33.02, - "grad_norm": 0.0017881103558465838, - "learning_rate": 3.4489222118088097e-06, - "loss": 0.0093, + "epoch": 26.01, + "grad_norm": 0.009972991421818733, + "learning_rate": 3.509495068452819e-05, + "loss": 0.03, "step": 11120 }, { - "epoch": 33.02, - "grad_norm": 0.001685052178800106, - "learning_rate": 3.402061855670103e-06, - "loss": 0.0556, + "epoch": 26.01, + "grad_norm": 0.1715347170829773, + "learning_rate": 3.507654938907699e-05, + "loss": 0.1898, "step": 11130 }, { - "epoch": 33.02, - "grad_norm": 0.004337169695645571, - "learning_rate": 3.3552014995313964e-06, - "loss": 0.0396, + "epoch": 26.01, + "grad_norm": 0.05813472345471382, + "learning_rate": 3.505814809362579e-05, + "loss": 0.0597, "step": 11140 }, { - "epoch": 33.02, - "grad_norm": 0.011138683184981346, - "learning_rate": 3.3083411433926896e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 10.253442764282227, + "learning_rate": 3.503974679817459e-05, + "loss": 0.1426, "step": 11150 }, { - "epoch": 33.02, - "grad_norm": 0.0024086865596473217, - "learning_rate": 3.2614807872539836e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.1588079035282135, + "learning_rate": 3.50213455027234e-05, + "loss": 0.1498, "step": 11160 }, { - "epoch": 33.02, - "grad_norm": 0.0029931641183793545, - "learning_rate": 3.2146204311152767e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.1124909520149231, + "learning_rate": 3.500294420727219e-05, + "loss": 0.1932, "step": 11170 }, { - "epoch": 33.02, - "grad_norm": 0.42393574118614197, - "learning_rate": 3.1677600749765703e-06, - "loss": 0.0016, + "epoch": 26.01, + "grad_norm": 0.07209834456443787, + "learning_rate": 3.498454291182099e-05, + "loss": 0.1186, "step": 11180 }, { - "epoch": 33.03, - "grad_norm": 0.0025856320280581713, - "learning_rate": 3.120899718837863e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 4.765960216522217, + "learning_rate": 3.49661416163698e-05, + "loss": 0.129, "step": 11190 }, { - "epoch": 33.03, - "grad_norm": 0.0019572244491428137, - "learning_rate": 3.0740393626991566e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 7.938549041748047, + "learning_rate": 3.494774032091859e-05, + "loss": 0.1319, "step": 11200 }, { - "epoch": 33.03, - "grad_norm": 0.3476734459400177, - "learning_rate": 3.02717900656045e-06, - "loss": 0.0044, + "epoch": 26.01, + "grad_norm": 10.07351016998291, + "learning_rate": 3.492933902546739e-05, + "loss": 0.1798, "step": 11210 }, { - "epoch": 33.03, - "grad_norm": 0.0009709845762699842, - "learning_rate": 2.9803186504217434e-06, - "loss": 0.0001, - "step": 11220 - }, - { - "epoch": 33.03, - "eval_accuracy": 0.8051575931232091, - "eval_loss": 1.383094310760498, - "eval_runtime": 33.8221, - "eval_samples_per_second": 20.637, - "eval_steps_per_second": 1.744, + "epoch": 26.01, + "grad_norm": 16.873706817626953, + "learning_rate": 3.49109377300162e-05, + "loss": 0.2251, "step": 11220 }, { - "epoch": 34.0, - "grad_norm": 0.0015264974208548665, - "learning_rate": 2.9334582942830366e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 16.359207153320312, + "learning_rate": 3.489253643456499e-05, + "loss": 0.1039, "step": 11230 }, { - "epoch": 34.0, - "grad_norm": 0.003399324370548129, - "learning_rate": 2.88659793814433e-06, - "loss": 0.0514, + "epoch": 26.01, + "grad_norm": 0.026186149567365646, + "learning_rate": 3.487413513911379e-05, + "loss": 0.0766, "step": 11240 }, { - "epoch": 34.0, - "grad_norm": 0.0033612081315368414, - "learning_rate": 2.8397375820056233e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.05351871997117996, + "learning_rate": 3.48557338436626e-05, + "loss": 0.1743, "step": 11250 }, { - "epoch": 34.0, - "grad_norm": 0.0017946057487279177, - "learning_rate": 2.792877225866917e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.06096191704273224, + "learning_rate": 3.483733254821139e-05, + "loss": 0.0711, "step": 11260 }, { - "epoch": 34.0, - "grad_norm": 0.001860388438217342, - "learning_rate": 2.74601686972821e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.14688991010189056, + "learning_rate": 3.481893125276019e-05, + "loss": 0.1217, "step": 11270 }, { - "epoch": 34.01, - "grad_norm": 0.0021725373808294535, - "learning_rate": 2.6991565135895036e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.16987045109272003, + "learning_rate": 3.4800529957309e-05, + "loss": 0.0558, "step": 11280 }, { - "epoch": 34.01, - "grad_norm": 0.030490437522530556, - "learning_rate": 2.652296157450797e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.23579993844032288, + "learning_rate": 3.47821286618578e-05, + "loss": 0.0538, "step": 11290 }, { - "epoch": 34.01, - "grad_norm": 0.0011160429567098618, - "learning_rate": 2.60543580131209e-06, - "loss": 0.0686, + "epoch": 26.01, + "grad_norm": 0.3610358238220215, + "learning_rate": 3.476372736640659e-05, + "loss": 0.1061, "step": 11300 }, { - "epoch": 34.01, - "grad_norm": 0.002387237036600709, - "learning_rate": 2.5585754451733835e-06, - "loss": 0.0074, + "epoch": 26.01, + "grad_norm": 0.9386014938354492, + "learning_rate": 3.47453260709554e-05, + "loss": 0.0251, "step": 11310 }, { - "epoch": 34.01, - "grad_norm": 0.002495302353054285, - "learning_rate": 2.5117150890346767e-06, - "loss": 0.0002, + "epoch": 26.01, + "grad_norm": 26.224563598632812, + "learning_rate": 3.47269247755042e-05, + "loss": 0.0344, "step": 11320 }, { - "epoch": 34.01, - "grad_norm": 0.022143444046378136, - "learning_rate": 2.46485473289597e-06, - "loss": 0.0002, + "epoch": 26.01, + "grad_norm": 0.008067339658737183, + "learning_rate": 3.470852348005299e-05, + "loss": 0.0606, "step": 11330 }, { - "epoch": 34.01, - "grad_norm": 0.002796097891405225, - "learning_rate": 2.4179943767572634e-06, - "loss": 0.0001, + "epoch": 26.01, + "grad_norm": 0.00769740529358387, + "learning_rate": 3.46901221846018e-05, + "loss": 0.0074, "step": 11340 }, { - "epoch": 34.01, - "grad_norm": 0.0035200004931539297, - "learning_rate": 2.3711340206185566e-06, - "loss": 0.0428, + "epoch": 26.01, + "eval_accuracy": 0.704954954954955, + "eval_loss": 1.5601754188537598, + "eval_runtime": 38.8018, + "eval_samples_per_second": 22.886, + "eval_steps_per_second": 1.907, + "step": 11340 + }, + { + "epoch": 27.0, + "grad_norm": 0.13885366916656494, + "learning_rate": 3.46717208891506e-05, + "loss": 0.1765, "step": 11350 }, { - "epoch": 34.01, - "grad_norm": 27.465776443481445, - "learning_rate": 2.32427366447985e-06, - "loss": 0.059, + "epoch": 27.0, + "grad_norm": 0.03417723625898361, + "learning_rate": 3.4653319593699394e-05, + "loss": 0.1657, "step": 11360 }, { - "epoch": 34.01, - "grad_norm": 0.00824726838618517, - "learning_rate": 2.2774133083411434e-06, - "loss": 0.0537, + "epoch": 27.0, + "grad_norm": 0.009304393082857132, + "learning_rate": 3.46349182982482e-05, + "loss": 0.1211, "step": 11370 }, { - "epoch": 34.01, - "grad_norm": 0.0025804354809224606, - "learning_rate": 2.2305529522024365e-06, - "loss": 0.0001, + "epoch": 27.0, + "grad_norm": 0.16966260969638824, + "learning_rate": 3.4616517002797e-05, + "loss": 0.019, "step": 11380 }, { - "epoch": 34.01, - "grad_norm": 0.01662873476743698, - "learning_rate": 2.1836925960637305e-06, - "loss": 0.0068, + "epoch": 27.0, + "grad_norm": 1.482495665550232, + "learning_rate": 3.4598115707345794e-05, + "loss": 0.0346, "step": 11390 }, { - "epoch": 34.02, - "grad_norm": 0.0012878773268312216, - "learning_rate": 2.1368322399250237e-06, - "loss": 0.0001, + "epoch": 27.0, + "grad_norm": 0.08011610060930252, + "learning_rate": 3.45797144118946e-05, + "loss": 0.0895, "step": 11400 }, { - "epoch": 34.02, - "grad_norm": 0.0017231220845133066, - "learning_rate": 2.089971883786317e-06, - "loss": 0.0002, + "epoch": 27.0, + "grad_norm": 0.025240659713745117, + "learning_rate": 3.45613131164434e-05, + "loss": 0.1737, "step": 11410 }, { - "epoch": 34.02, - "grad_norm": 0.007844923995435238, - "learning_rate": 2.0431115276476104e-06, - "loss": 0.0001, + "epoch": 27.0, + "grad_norm": 15.813284873962402, + "learning_rate": 3.45429118209922e-05, + "loss": 0.0534, "step": 11420 }, { - "epoch": 34.02, - "grad_norm": 0.001733616110868752, - "learning_rate": 1.9962511715089036e-06, - "loss": 0.0034, + "epoch": 27.0, + "grad_norm": 22.745393753051758, + "learning_rate": 3.4524510525541e-05, + "loss": 0.1105, "step": 11430 }, { - "epoch": 34.02, - "grad_norm": 0.002772507956251502, - "learning_rate": 1.9493908153701968e-06, - "loss": 0.0234, + "epoch": 27.0, + "grad_norm": 0.018844788894057274, + "learning_rate": 3.45061092300898e-05, + "loss": 0.0072, "step": 11440 }, { - "epoch": 34.02, - "grad_norm": 0.001203456544317305, - "learning_rate": 1.9025304592314903e-06, - "loss": 0.0001, + "epoch": 27.0, + "grad_norm": 0.08511705696582794, + "learning_rate": 3.44877079346386e-05, + "loss": 0.1052, "step": 11450 }, { - "epoch": 34.02, - "grad_norm": 0.004119969438761473, - "learning_rate": 1.8556701030927835e-06, - "loss": 0.0399, + "epoch": 27.0, + "grad_norm": 0.018431710079312325, + "learning_rate": 3.44693066391874e-05, + "loss": 0.0745, "step": 11460 }, { - "epoch": 34.02, - "grad_norm": 0.0023524747230112553, - "learning_rate": 1.8088097469540769e-06, - "loss": 0.0003, + "epoch": 27.0, + "grad_norm": 0.060771312564611435, + "learning_rate": 3.44509053437362e-05, + "loss": 0.026, "step": 11470 }, { - "epoch": 34.02, - "grad_norm": 0.0018690053839236498, - "learning_rate": 1.7619493908153703e-06, - "loss": 0.0001, + "epoch": 27.0, + "grad_norm": 0.01608353666961193, + "learning_rate": 3.4432504048285e-05, + "loss": 0.0576, "step": 11480 }, { - "epoch": 34.02, - "grad_norm": 0.0028986113611608744, - "learning_rate": 1.7150890346766636e-06, - "loss": 0.0371, + "epoch": 27.0, + "grad_norm": 0.0488639660179615, + "learning_rate": 3.44141027528338e-05, + "loss": 0.1017, "step": 11490 }, { - "epoch": 34.02, - "grad_norm": 0.0018392838537693024, - "learning_rate": 1.6682286785379568e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 28.09701156616211, + "learning_rate": 3.43957014573826e-05, + "loss": 0.0847, "step": 11500 }, { - "epoch": 34.02, - "grad_norm": 0.0016958696069195867, - "learning_rate": 1.6213683223992502e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.2560718357563019, + "learning_rate": 3.43773001619314e-05, + "loss": 0.0066, "step": 11510 }, { - "epoch": 34.03, - "grad_norm": 0.004104943014681339, - "learning_rate": 1.5745079662605435e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.11808411777019501, + "learning_rate": 3.43588988664802e-05, + "loss": 0.1041, "step": 11520 }, { - "epoch": 34.03, - "grad_norm": 0.008021462708711624, - "learning_rate": 1.527647610121837e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.3728163540363312, + "learning_rate": 3.4340497571029e-05, + "loss": 0.0721, "step": 11530 }, { - "epoch": 34.03, - "grad_norm": 5.87186861038208, - "learning_rate": 1.4807872539831303e-06, - "loss": 0.0109, + "epoch": 27.01, + "grad_norm": 6.730001449584961, + "learning_rate": 3.43220962755778e-05, + "loss": 0.0634, "step": 11540 }, { - "epoch": 34.03, - "grad_norm": 0.0011679594172164798, - "learning_rate": 1.4339268978444237e-06, - "loss": 0.0001, - "step": 11550 - }, - { - "epoch": 34.03, - "eval_accuracy": 0.8080229226361032, - "eval_loss": 1.3971121311187744, - "eval_runtime": 33.9156, - "eval_samples_per_second": 20.581, - "eval_steps_per_second": 1.74, + "epoch": 27.01, + "grad_norm": 63.36409378051758, + "learning_rate": 3.43036949801266e-05, + "loss": 0.2596, "step": 11550 }, { - "epoch": 35.0, - "grad_norm": 0.0017861544620245695, - "learning_rate": 1.387066541705717e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.0218398105353117, + "learning_rate": 3.42852936846754e-05, + "loss": 0.0739, "step": 11560 }, { - "epoch": 35.0, - "grad_norm": 0.0019135787151753902, - "learning_rate": 1.3402061855670102e-06, - "loss": 0.0002, + "epoch": 27.01, + "grad_norm": 7.553366661071777, + "learning_rate": 3.42668923892242e-05, + "loss": 0.167, "step": 11570 }, { - "epoch": 35.0, - "grad_norm": 0.04832134768366814, - "learning_rate": 1.2933458294283038e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.7259697914123535, + "learning_rate": 3.4248491093773e-05, + "loss": 0.1717, "step": 11580 }, { - "epoch": 35.0, - "grad_norm": 0.0034780928399413824, - "learning_rate": 1.2464854732895972e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.05467524379491806, + "learning_rate": 3.42300897983218e-05, + "loss": 0.043, "step": 11590 }, { - "epoch": 35.0, - "grad_norm": 0.001626980840228498, - "learning_rate": 1.1996251171508905e-06, - "loss": 0.0003, + "epoch": 27.01, + "grad_norm": 0.04555194452404976, + "learning_rate": 3.42116885028706e-05, + "loss": 0.0153, "step": 11600 }, { - "epoch": 35.01, - "grad_norm": 0.006935800425708294, - "learning_rate": 1.1527647610121837e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 7.373996734619141, + "learning_rate": 3.41932872074194e-05, + "loss": 0.1179, "step": 11610 }, { - "epoch": 35.01, - "grad_norm": 0.0010628902819007635, - "learning_rate": 1.105904404873477e-06, - "loss": 0.001, + "epoch": 27.01, + "grad_norm": 31.452836990356445, + "learning_rate": 3.41748859119682e-05, + "loss": 0.0755, "step": 11620 }, { - "epoch": 35.01, - "grad_norm": 0.0014588043559342623, - "learning_rate": 1.0590440487347704e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 9.720723152160645, + "learning_rate": 3.4156484616517003e-05, + "loss": 0.1386, "step": 11630 }, { - "epoch": 35.01, - "grad_norm": 0.007299583870917559, - "learning_rate": 1.0121836925960638e-06, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.06120923161506653, + "learning_rate": 3.4138083321065804e-05, + "loss": 0.167, "step": 11640 }, { - "epoch": 35.01, - "grad_norm": 0.002258384833112359, - "learning_rate": 9.65323336457357e-07, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.039387207478284836, + "learning_rate": 3.4119682025614604e-05, + "loss": 0.0332, "step": 11650 }, { - "epoch": 35.01, - "grad_norm": 61.81095504760742, - "learning_rate": 9.184629803186506e-07, - "loss": 0.0222, + "epoch": 27.01, + "grad_norm": 0.19560861587524414, + "learning_rate": 3.4101280730163404e-05, + "loss": 0.0708, "step": 11660 }, { - "epoch": 35.01, - "grad_norm": 0.0012000646675005555, - "learning_rate": 8.716026241799438e-07, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 2.266158103942871, + "learning_rate": 3.4082879434712204e-05, + "loss": 0.0311, "step": 11670 }, { - "epoch": 35.01, - "grad_norm": 0.01990874670445919, - "learning_rate": 8.247422680412372e-07, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.20093105733394623, + "learning_rate": 3.406447813926101e-05, + "loss": 0.1291, "step": 11680 }, { - "epoch": 35.01, - "grad_norm": 0.01420762948691845, - "learning_rate": 7.778819119025305e-07, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.83036869764328, + "learning_rate": 3.4046076843809804e-05, + "loss": 0.0734, "step": 11690 }, { - "epoch": 35.01, - "grad_norm": 0.002269094344228506, - "learning_rate": 7.310215557638238e-07, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 5.829738140106201, + "learning_rate": 3.4027675548358604e-05, + "loss": 0.0868, "step": 11700 }, { - "epoch": 35.01, - "grad_norm": 0.0038073186296969652, - "learning_rate": 6.841611996251172e-07, - "loss": 0.0181, + "epoch": 27.01, + "grad_norm": 7.233109474182129, + "learning_rate": 3.400927425290741e-05, + "loss": 0.1265, "step": 11710 }, { - "epoch": 35.01, - "grad_norm": 0.03407540172338486, - "learning_rate": 6.373008434864106e-07, - "loss": 0.029, + "epoch": 27.01, + "grad_norm": 0.12846483290195465, + "learning_rate": 3.3990872957456204e-05, + "loss": 0.0854, "step": 11720 }, { - "epoch": 35.02, - "grad_norm": 0.030750174075365067, - "learning_rate": 5.904404873477039e-07, - "loss": 0.0093, + "epoch": 27.01, + "grad_norm": 9.74392318725586, + "learning_rate": 3.3972471662005005e-05, + "loss": 0.1329, "step": 11730 }, { - "epoch": 35.02, - "grad_norm": 0.002124561695381999, - "learning_rate": 5.435801312089972e-07, - "loss": 0.0027, + "epoch": 27.01, + "grad_norm": 0.6316181421279907, + "learning_rate": 3.395407036655381e-05, + "loss": 0.0495, "step": 11740 }, { - "epoch": 35.02, - "grad_norm": 0.0012157908640801907, - "learning_rate": 4.967197750702906e-07, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 4.9387054443359375, + "learning_rate": 3.3935669071102605e-05, + "loss": 0.006, "step": 11750 }, { - "epoch": 35.02, - "grad_norm": 0.002194877015426755, - "learning_rate": 4.498594189315839e-07, - "loss": 0.0001, + "epoch": 27.01, + "grad_norm": 0.014866938814520836, + "learning_rate": 3.3917267775651405e-05, + "loss": 0.1256, "step": 11760 }, { - "epoch": 35.02, - "grad_norm": 0.003094845451414585, - "learning_rate": 4.0299906279287724e-07, - "loss": 0.0001, + "epoch": 27.01, + "eval_accuracy": 0.7173423423423423, + "eval_loss": 1.3965140581130981, + "eval_runtime": 38.9369, + "eval_samples_per_second": 22.806, + "eval_steps_per_second": 1.901, + "step": 11760 + }, + { + "epoch": 28.0, + "grad_norm": 0.08045655488967896, + "learning_rate": 3.389886648020021e-05, + "loss": 0.0437, "step": 11770 }, { - "epoch": 35.02, - "grad_norm": 0.001540425349958241, - "learning_rate": 3.561387066541706e-07, - "loss": 0.0001, + "epoch": 28.0, + "grad_norm": 0.04858655855059624, + "learning_rate": 3.388046518474901e-05, + "loss": 0.0011, "step": 11780 }, { - "epoch": 35.02, - "grad_norm": 0.0017908586887642741, - "learning_rate": 3.0927835051546394e-07, - "loss": 0.0001, + "epoch": 28.0, + "grad_norm": 9.89545726776123, + "learning_rate": 3.3862063889297805e-05, + "loss": 0.0796, "step": 11790 }, { - "epoch": 35.02, - "grad_norm": 0.007806339301168919, - "learning_rate": 2.6241799437675726e-07, - "loss": 0.0399, + "epoch": 28.0, + "grad_norm": 0.09138563275337219, + "learning_rate": 3.384366259384661e-05, + "loss": 0.0482, "step": 11800 }, { - "epoch": 35.02, - "grad_norm": 0.004247527569532394, - "learning_rate": 2.155576382380506e-07, - "loss": 0.034, + "epoch": 28.0, + "grad_norm": 0.07711490988731384, + "learning_rate": 3.382526129839541e-05, + "loss": 0.0255, "step": 11810 }, { - "epoch": 35.02, - "grad_norm": 0.0017347713001072407, - "learning_rate": 1.6869728209934398e-07, - "loss": 0.0005, + "epoch": 28.0, + "grad_norm": 21.119638442993164, + "learning_rate": 3.3806860002944206e-05, + "loss": 0.0481, "step": 11820 }, { - "epoch": 35.02, - "grad_norm": 0.001252649468369782, - "learning_rate": 1.218369259606373e-07, - "loss": 0.0001, + "epoch": 28.0, + "grad_norm": 0.01722894050180912, + "learning_rate": 3.378845870749301e-05, + "loss": 0.0201, "step": 11830 }, { - "epoch": 35.02, - "grad_norm": 0.0012991786934435368, - "learning_rate": 7.497656982193066e-08, - "loss": 0.0528, + "epoch": 28.0, + "grad_norm": 0.012293045409023762, + "learning_rate": 3.377005741204181e-05, + "loss": 0.1236, "step": 11840 }, { - "epoch": 35.03, - "grad_norm": 11.709962844848633, - "learning_rate": 2.8116213683223995e-08, - "loss": 0.0041, + "epoch": 28.0, + "grad_norm": 4.5801873207092285, + "learning_rate": 3.3751656116590606e-05, + "loss": 0.0025, "step": 11850 }, { - "epoch": 35.03, - "eval_accuracy": 0.8080229226361032, - "eval_loss": 1.3906208276748657, - "eval_runtime": 33.666, - "eval_samples_per_second": 20.733, - "eval_steps_per_second": 1.753, - "step": 11856 - }, - { - "epoch": 35.03, - "step": 11856, - "total_flos": 1.770184277095349e+20, - "train_loss": 0.18484977827027105, - "train_runtime": 12835.1405, - "train_samples_per_second": 11.085, - "train_steps_per_second": 0.924 - }, - { - "epoch": 35.03, - "eval_accuracy": 0.8209169054441261, - "eval_loss": 0.8522207736968994, - "eval_runtime": 35.5618, - "eval_samples_per_second": 19.628, - "eval_steps_per_second": 1.659, - "step": 11856 - }, - { - "epoch": 35.03, - "eval_accuracy": 0.8209169054441261, - "eval_loss": 0.8522207736968994, - "eval_runtime": 33.138, - "eval_samples_per_second": 21.063, - "eval_steps_per_second": 1.78, - "step": 11856 + "epoch": 28.0, + "grad_norm": 0.039160292595624924, + "learning_rate": 3.373325482113941e-05, + "loss": 0.0014, + "step": 11860 + }, + { + "epoch": 28.0, + "grad_norm": 0.021740267053246498, + "learning_rate": 3.371485352568821e-05, + "loss": 0.0528, + "step": 11870 + }, + { + "epoch": 28.0, + "grad_norm": 0.004420125856995583, + "learning_rate": 3.3696452230237006e-05, + "loss": 0.1623, + "step": 11880 + }, + { + "epoch": 28.0, + "grad_norm": 2.0642523765563965, + "learning_rate": 3.3678050934785806e-05, + "loss": 0.0021, + "step": 11890 + }, + { + "epoch": 28.0, + "grad_norm": 13.541433334350586, + "learning_rate": 3.365964963933461e-05, + "loss": 0.1623, + "step": 11900 + }, + { + "epoch": 28.0, + "grad_norm": 0.06738744676113129, + "learning_rate": 3.3641248343883413e-05, + "loss": 0.0012, + "step": 11910 + }, + { + "epoch": 28.01, + "grad_norm": 0.05531314015388489, + "learning_rate": 3.362284704843221e-05, + "loss": 0.1722, + "step": 11920 + }, + { + "epoch": 28.01, + "grad_norm": 12.824578285217285, + "learning_rate": 3.3604445752981014e-05, + "loss": 0.1559, + "step": 11930 + }, + { + "epoch": 28.01, + "grad_norm": 0.024552879855036736, + "learning_rate": 3.3586044457529814e-05, + "loss": 0.0032, + "step": 11940 + }, + { + "epoch": 28.01, + "grad_norm": 10.314522743225098, + "learning_rate": 3.356764316207861e-05, + "loss": 0.0975, + "step": 11950 + }, + { + "epoch": 28.01, + "grad_norm": 50.956687927246094, + "learning_rate": 3.3549241866627414e-05, + "loss": 0.1327, + "step": 11960 + }, + { + "epoch": 28.01, + "grad_norm": 10.152609825134277, + "learning_rate": 3.3530840571176214e-05, + "loss": 0.0645, + "step": 11970 + }, + { + "epoch": 28.01, + "grad_norm": 53.54362106323242, + "learning_rate": 3.351243927572501e-05, + "loss": 0.0216, + "step": 11980 + }, + { + "epoch": 28.01, + "grad_norm": 0.24693238735198975, + "learning_rate": 3.3494037980273814e-05, + "loss": 0.0092, + "step": 11990 + }, + { + "epoch": 28.01, + "grad_norm": 39.524871826171875, + "learning_rate": 3.3475636684822614e-05, + "loss": 0.0326, + "step": 12000 + }, + { + "epoch": 28.01, + "grad_norm": 2.962416172027588, + "learning_rate": 3.345723538937141e-05, + "loss": 0.0028, + "step": 12010 + }, + { + "epoch": 28.01, + "grad_norm": 0.015500775538384914, + "learning_rate": 3.3438834093920215e-05, + "loss": 0.1865, + "step": 12020 + }, + { + "epoch": 28.01, + "grad_norm": 50.48593521118164, + "learning_rate": 3.3420432798469015e-05, + "loss": 0.116, + "step": 12030 + }, + { + "epoch": 28.01, + "grad_norm": 0.811002254486084, + "learning_rate": 3.3402031503017815e-05, + "loss": 0.0727, + "step": 12040 + }, + { + "epoch": 28.01, + "grad_norm": 56.46418380737305, + "learning_rate": 3.3383630207566615e-05, + "loss": 0.0967, + "step": 12050 + }, + { + "epoch": 28.01, + "grad_norm": 0.04430406540632248, + "learning_rate": 3.3365228912115415e-05, + "loss": 0.0665, + "step": 12060 + }, + { + "epoch": 28.01, + "grad_norm": 0.013381538912653923, + "learning_rate": 3.3346827616664215e-05, + "loss": 0.0414, + "step": 12070 + }, + { + "epoch": 28.01, + "grad_norm": 0.0382387675344944, + "learning_rate": 3.3328426321213015e-05, + "loss": 0.0819, + "step": 12080 + }, + { + "epoch": 28.01, + "grad_norm": 0.07482504844665527, + "learning_rate": 3.3310025025761816e-05, + "loss": 0.0756, + "step": 12090 + }, + { + "epoch": 28.01, + "grad_norm": 35.36876678466797, + "learning_rate": 3.3291623730310616e-05, + "loss": 0.1254, + "step": 12100 + }, + { + "epoch": 28.01, + "grad_norm": 0.0059246160089969635, + "learning_rate": 3.3273222434859416e-05, + "loss": 0.1452, + "step": 12110 + }, + { + "epoch": 28.01, + "grad_norm": 8.87775707244873, + "learning_rate": 3.3254821139408216e-05, + "loss": 0.1301, + "step": 12120 + }, + { + "epoch": 28.01, + "grad_norm": 0.0224022027105093, + "learning_rate": 3.3236419843957016e-05, + "loss": 0.04, + "step": 12130 + }, + { + "epoch": 28.01, + "grad_norm": 0.08252600580453873, + "learning_rate": 3.3218018548505816e-05, + "loss": 0.0537, + "step": 12140 + }, + { + "epoch": 28.01, + "grad_norm": 0.013457234017550945, + "learning_rate": 3.3199617253054616e-05, + "loss": 0.0159, + "step": 12150 + }, + { + "epoch": 28.01, + "grad_norm": 0.1613243669271469, + "learning_rate": 3.3181215957603416e-05, + "loss": 0.1108, + "step": 12160 + }, + { + "epoch": 28.01, + "grad_norm": 0.0809352844953537, + "learning_rate": 3.3162814662152216e-05, + "loss": 0.001, + "step": 12170 + }, + { + "epoch": 28.01, + "grad_norm": 0.011667724698781967, + "learning_rate": 3.3144413366701017e-05, + "loss": 0.0021, + "step": 12180 + }, + { + "epoch": 28.01, + "eval_accuracy": 0.7342342342342343, + "eval_loss": 1.4513802528381348, + "eval_runtime": 38.794, + "eval_samples_per_second": 22.89, + "eval_steps_per_second": 1.908, + "step": 12180 + }, + { + "epoch": 29.0, + "grad_norm": 0.02104813978075981, + "learning_rate": 3.312601207124982e-05, + "loss": 0.0928, + "step": 12190 + }, + { + "epoch": 29.0, + "grad_norm": 0.006996306590735912, + "learning_rate": 3.310761077579862e-05, + "loss": 0.0851, + "step": 12200 + }, + { + "epoch": 29.0, + "grad_norm": 0.006665319669991732, + "learning_rate": 3.308920948034742e-05, + "loss": 0.0012, + "step": 12210 + }, + { + "epoch": 29.0, + "grad_norm": 17.009796142578125, + "learning_rate": 3.307080818489622e-05, + "loss": 0.1042, + "step": 12220 + }, + { + "epoch": 29.0, + "grad_norm": 0.015308617614209652, + "learning_rate": 3.305240688944502e-05, + "loss": 0.0524, + "step": 12230 + }, + { + "epoch": 29.0, + "grad_norm": 12.653825759887695, + "learning_rate": 3.303400559399382e-05, + "loss": 0.1284, + "step": 12240 + }, + { + "epoch": 29.0, + "grad_norm": 44.44422149658203, + "learning_rate": 3.301560429854262e-05, + "loss": 0.0961, + "step": 12250 + }, + { + "epoch": 29.0, + "grad_norm": 0.007318226154893637, + "learning_rate": 3.299720300309142e-05, + "loss": 0.0011, + "step": 12260 + }, + { + "epoch": 29.0, + "grad_norm": 0.03102783113718033, + "learning_rate": 3.297880170764022e-05, + "loss": 0.0264, + "step": 12270 + }, + { + "epoch": 29.0, + "grad_norm": 0.03847644105553627, + "learning_rate": 3.296040041218902e-05, + "loss": 0.1429, + "step": 12280 + }, + { + "epoch": 29.0, + "grad_norm": 13.650267601013184, + "learning_rate": 3.294199911673782e-05, + "loss": 0.1052, + "step": 12290 + }, + { + "epoch": 29.0, + "grad_norm": 0.36820557713508606, + "learning_rate": 3.2923597821286625e-05, + "loss": 0.0654, + "step": 12300 + }, + { + "epoch": 29.0, + "grad_norm": 0.013632736168801785, + "learning_rate": 3.290519652583542e-05, + "loss": 0.0014, + "step": 12310 + }, + { + "epoch": 29.0, + "grad_norm": 0.00829467736184597, + "learning_rate": 3.288679523038422e-05, + "loss": 0.0232, + "step": 12320 + }, + { + "epoch": 29.0, + "grad_norm": 0.01030084490776062, + "learning_rate": 3.2868393934933025e-05, + "loss": 0.0835, + "step": 12330 + }, + { + "epoch": 29.01, + "grad_norm": 0.01338445208966732, + "learning_rate": 3.284999263948182e-05, + "loss": 0.0216, + "step": 12340 + }, + { + "epoch": 29.01, + "grad_norm": 0.33869656920433044, + "learning_rate": 3.283159134403062e-05, + "loss": 0.0056, + "step": 12350 + }, + { + "epoch": 29.01, + "grad_norm": 0.016546163707971573, + "learning_rate": 3.2813190048579425e-05, + "loss": 0.0764, + "step": 12360 + }, + { + "epoch": 29.01, + "grad_norm": 4.044371128082275, + "learning_rate": 3.279478875312822e-05, + "loss": 0.0997, + "step": 12370 + }, + { + "epoch": 29.01, + "grad_norm": 0.11344069987535477, + "learning_rate": 3.277638745767702e-05, + "loss": 0.0927, + "step": 12380 + }, + { + "epoch": 29.01, + "grad_norm": 0.09442989528179169, + "learning_rate": 3.2757986162225826e-05, + "loss": 0.0375, + "step": 12390 + }, + { + "epoch": 29.01, + "grad_norm": 0.032961271703243256, + "learning_rate": 3.2739584866774626e-05, + "loss": 0.0655, + "step": 12400 + }, + { + "epoch": 29.01, + "grad_norm": 29.774293899536133, + "learning_rate": 3.272118357132342e-05, + "loss": 0.0449, + "step": 12410 + }, + { + "epoch": 29.01, + "grad_norm": 28.634647369384766, + "learning_rate": 3.2702782275872226e-05, + "loss": 0.1557, + "step": 12420 + }, + { + "epoch": 29.01, + "grad_norm": 0.052351102232933044, + "learning_rate": 3.2684380980421026e-05, + "loss": 0.0192, + "step": 12430 + }, + { + "epoch": 29.01, + "grad_norm": 0.08391708880662918, + "learning_rate": 3.266597968496982e-05, + "loss": 0.0334, + "step": 12440 + }, + { + "epoch": 29.01, + "grad_norm": 0.0222425889223814, + "learning_rate": 3.2647578389518626e-05, + "loss": 0.0608, + "step": 12450 + }, + { + "epoch": 29.01, + "grad_norm": 0.010909227654337883, + "learning_rate": 3.2629177094067427e-05, + "loss": 0.0445, + "step": 12460 + }, + { + "epoch": 29.01, + "grad_norm": 3.7934606075286865, + "learning_rate": 3.261077579861622e-05, + "loss": 0.0854, + "step": 12470 + }, + { + "epoch": 29.01, + "grad_norm": 5.625136852264404, + "learning_rate": 3.259237450316503e-05, + "loss": 0.0471, + "step": 12480 + }, + { + "epoch": 29.01, + "grad_norm": 30.222606658935547, + "learning_rate": 3.257397320771383e-05, + "loss": 0.1509, + "step": 12490 + }, + { + "epoch": 29.01, + "grad_norm": 0.051470424979925156, + "learning_rate": 3.255557191226262e-05, + "loss": 0.1511, + "step": 12500 + }, + { + "epoch": 29.01, + "grad_norm": 4.000987529754639, + "learning_rate": 3.253717061681143e-05, + "loss": 0.0246, + "step": 12510 + }, + { + "epoch": 29.01, + "grad_norm": 63.03067398071289, + "learning_rate": 3.251876932136023e-05, + "loss": 0.1366, + "step": 12520 + }, + { + "epoch": 29.01, + "grad_norm": 0.11710133403539658, + "learning_rate": 3.250036802590903e-05, + "loss": 0.001, + "step": 12530 + }, + { + "epoch": 29.01, + "grad_norm": 0.9972437620162964, + "learning_rate": 3.248196673045783e-05, + "loss": 0.08, + "step": 12540 + }, + { + "epoch": 29.01, + "grad_norm": 7.652525901794434, + "learning_rate": 3.246356543500663e-05, + "loss": 0.1765, + "step": 12550 + }, + { + "epoch": 29.01, + "grad_norm": 0.347403883934021, + "learning_rate": 3.244516413955543e-05, + "loss": 0.0018, + "step": 12560 + }, + { + "epoch": 29.01, + "grad_norm": 0.07959982007741928, + "learning_rate": 3.242676284410423e-05, + "loss": 0.0426, + "step": 12570 + }, + { + "epoch": 29.01, + "grad_norm": 0.03433597460389137, + "learning_rate": 3.240836154865303e-05, + "loss": 0.1113, + "step": 12580 + }, + { + "epoch": 29.01, + "grad_norm": 0.1946258246898651, + "learning_rate": 3.238996025320183e-05, + "loss": 0.1299, + "step": 12590 + }, + { + "epoch": 29.01, + "grad_norm": 0.2794837951660156, + "learning_rate": 3.237155895775063e-05, + "loss": 0.0476, + "step": 12600 + }, + { + "epoch": 29.01, + "eval_accuracy": 0.7173423423423423, + "eval_loss": 1.291523814201355, + "eval_runtime": 38.9534, + "eval_samples_per_second": 22.796, + "eval_steps_per_second": 1.9, + "step": 12600 + }, + { + "epoch": 30.0, + "grad_norm": 32.00685501098633, + "learning_rate": 3.235315766229943e-05, + "loss": 0.1156, + "step": 12610 + }, + { + "epoch": 30.0, + "grad_norm": 0.07065641134977341, + "learning_rate": 3.233475636684823e-05, + "loss": 0.0397, + "step": 12620 + }, + { + "epoch": 30.0, + "grad_norm": 3.7180702686309814, + "learning_rate": 3.231635507139703e-05, + "loss": 0.0469, + "step": 12630 + }, + { + "epoch": 30.0, + "grad_norm": 0.03295394033193588, + "learning_rate": 3.229795377594583e-05, + "loss": 0.0384, + "step": 12640 + }, + { + "epoch": 30.0, + "grad_norm": 0.0369311086833477, + "learning_rate": 3.227955248049463e-05, + "loss": 0.0088, + "step": 12650 + }, + { + "epoch": 30.0, + "grad_norm": 0.24643197655677795, + "learning_rate": 3.226115118504343e-05, + "loss": 0.0652, + "step": 12660 + }, + { + "epoch": 30.0, + "grad_norm": 0.02824876271188259, + "learning_rate": 3.224274988959223e-05, + "loss": 0.0169, + "step": 12670 + }, + { + "epoch": 30.0, + "grad_norm": 5.246930122375488, + "learning_rate": 3.222434859414103e-05, + "loss": 0.0051, + "step": 12680 + }, + { + "epoch": 30.0, + "grad_norm": 47.156028747558594, + "learning_rate": 3.220594729868983e-05, + "loss": 0.0966, + "step": 12690 + }, + { + "epoch": 30.0, + "grad_norm": 0.023476136848330498, + "learning_rate": 3.218754600323863e-05, + "loss": 0.1579, + "step": 12700 + }, + { + "epoch": 30.0, + "grad_norm": 0.12407675385475159, + "learning_rate": 3.216914470778743e-05, + "loss": 0.048, + "step": 12710 + }, + { + "epoch": 30.0, + "grad_norm": 0.08669500052928925, + "learning_rate": 3.215074341233623e-05, + "loss": 0.0019, + "step": 12720 + }, + { + "epoch": 30.0, + "grad_norm": 37.406314849853516, + "learning_rate": 3.213234211688503e-05, + "loss": 0.1033, + "step": 12730 + }, + { + "epoch": 30.0, + "grad_norm": 38.6237678527832, + "learning_rate": 3.211394082143383e-05, + "loss": 0.11, + "step": 12740 + }, + { + "epoch": 30.0, + "grad_norm": 0.009670387022197247, + "learning_rate": 3.209553952598263e-05, + "loss": 0.0324, + "step": 12750 + }, + { + "epoch": 30.01, + "grad_norm": 0.011206231079995632, + "learning_rate": 3.207713823053143e-05, + "loss": 0.1248, + "step": 12760 + }, + { + "epoch": 30.01, + "grad_norm": 0.17277628183364868, + "learning_rate": 3.205873693508023e-05, + "loss": 0.0164, + "step": 12770 + }, + { + "epoch": 30.01, + "grad_norm": 75.500244140625, + "learning_rate": 3.204033563962903e-05, + "loss": 0.1048, + "step": 12780 + }, + { + "epoch": 30.01, + "grad_norm": 0.032317329198122025, + "learning_rate": 3.202193434417784e-05, + "loss": 0.0273, + "step": 12790 + }, + { + "epoch": 30.01, + "grad_norm": 0.012517527677118778, + "learning_rate": 3.200353304872663e-05, + "loss": 0.0304, + "step": 12800 + }, + { + "epoch": 30.01, + "grad_norm": 0.05787045508623123, + "learning_rate": 3.198513175327543e-05, + "loss": 0.0156, + "step": 12810 + }, + { + "epoch": 30.01, + "grad_norm": 0.5614012479782104, + "learning_rate": 3.196673045782424e-05, + "loss": 0.1343, + "step": 12820 + }, + { + "epoch": 30.01, + "grad_norm": 0.007453802041709423, + "learning_rate": 3.194832916237303e-05, + "loss": 0.0983, + "step": 12830 + }, + { + "epoch": 30.01, + "grad_norm": 0.10857084393501282, + "learning_rate": 3.192992786692183e-05, + "loss": 0.0024, + "step": 12840 + }, + { + "epoch": 30.01, + "grad_norm": 0.0047862897627055645, + "learning_rate": 3.191152657147064e-05, + "loss": 0.1283, + "step": 12850 + }, + { + "epoch": 30.01, + "grad_norm": 0.009203149937093258, + "learning_rate": 3.189312527601943e-05, + "loss": 0.0203, + "step": 12860 + }, + { + "epoch": 30.01, + "grad_norm": 0.26198357343673706, + "learning_rate": 3.187472398056823e-05, + "loss": 0.1059, + "step": 12870 + }, + { + "epoch": 30.01, + "grad_norm": 0.0081553990021348, + "learning_rate": 3.185632268511704e-05, + "loss": 0.1133, + "step": 12880 + }, + { + "epoch": 30.01, + "grad_norm": 16.972984313964844, + "learning_rate": 3.183792138966583e-05, + "loss": 0.193, + "step": 12890 + }, + { + "epoch": 30.01, + "grad_norm": 0.27453407645225525, + "learning_rate": 3.181952009421463e-05, + "loss": 0.0293, + "step": 12900 + }, + { + "epoch": 30.01, + "grad_norm": 20.03423500061035, + "learning_rate": 3.180111879876343e-05, + "loss": 0.0772, + "step": 12910 + }, + { + "epoch": 30.01, + "grad_norm": 0.0040716310031712055, + "learning_rate": 3.178271750331224e-05, + "loss": 0.0139, + "step": 12920 + }, + { + "epoch": 30.01, + "grad_norm": 10.097931861877441, + "learning_rate": 3.176431620786103e-05, + "loss": 0.0693, + "step": 12930 + }, + { + "epoch": 30.01, + "grad_norm": 0.01247051265090704, + "learning_rate": 3.174591491240983e-05, + "loss": 0.103, + "step": 12940 + }, + { + "epoch": 30.01, + "grad_norm": 0.022959919646382332, + "learning_rate": 3.172751361695864e-05, + "loss": 0.0341, + "step": 12950 + }, + { + "epoch": 30.01, + "grad_norm": 0.004824052099138498, + "learning_rate": 3.170911232150743e-05, + "loss": 0.0056, + "step": 12960 + }, + { + "epoch": 30.01, + "grad_norm": 12.092653274536133, + "learning_rate": 3.169071102605623e-05, + "loss": 0.1353, + "step": 12970 + }, + { + "epoch": 30.01, + "grad_norm": 0.925576388835907, + "learning_rate": 3.167230973060504e-05, + "loss": 0.0969, + "step": 12980 + }, + { + "epoch": 30.01, + "grad_norm": 0.07548154145479202, + "learning_rate": 3.165390843515383e-05, + "loss": 0.1926, + "step": 12990 + }, + { + "epoch": 30.01, + "grad_norm": 0.11369650810956955, + "learning_rate": 3.163550713970263e-05, + "loss": 0.1268, + "step": 13000 + }, + { + "epoch": 30.01, + "grad_norm": 9.48705005645752, + "learning_rate": 3.161710584425144e-05, + "loss": 0.1238, + "step": 13010 + }, + { + "epoch": 30.01, + "grad_norm": 0.051245737820863724, + "learning_rate": 3.159870454880024e-05, + "loss": 0.0065, + "step": 13020 + }, + { + "epoch": 30.01, + "eval_accuracy": 0.7094594594594594, + "eval_loss": 1.3396903276443481, + "eval_runtime": 39.2399, + "eval_samples_per_second": 22.63, + "eval_steps_per_second": 1.886, + "step": 13020 + }, + { + "epoch": 31.0, + "grad_norm": 0.041375719010829926, + "learning_rate": 3.158030325334903e-05, + "loss": 0.0015, + "step": 13030 + }, + { + "epoch": 31.0, + "grad_norm": 13.484585762023926, + "learning_rate": 3.156190195789784e-05, + "loss": 0.0868, + "step": 13040 + }, + { + "epoch": 31.0, + "grad_norm": 16.3665714263916, + "learning_rate": 3.154350066244664e-05, + "loss": 0.0683, + "step": 13050 + }, + { + "epoch": 31.0, + "grad_norm": 18.90866470336914, + "learning_rate": 3.1525099366995434e-05, + "loss": 0.1037, + "step": 13060 + }, + { + "epoch": 31.0, + "grad_norm": 14.579414367675781, + "learning_rate": 3.150669807154424e-05, + "loss": 0.1559, + "step": 13070 + }, + { + "epoch": 31.0, + "grad_norm": 4.739386558532715, + "learning_rate": 3.148829677609304e-05, + "loss": 0.0373, + "step": 13080 + }, + { + "epoch": 31.0, + "grad_norm": 0.026588771492242813, + "learning_rate": 3.1469895480641834e-05, + "loss": 0.0876, + "step": 13090 + }, + { + "epoch": 31.0, + "grad_norm": 49.45380401611328, + "learning_rate": 3.145149418519064e-05, + "loss": 0.0323, + "step": 13100 + }, + { + "epoch": 31.0, + "grad_norm": 0.015482367016375065, + "learning_rate": 3.143309288973944e-05, + "loss": 0.0893, + "step": 13110 + }, + { + "epoch": 31.0, + "grad_norm": 0.029974903911352158, + "learning_rate": 3.1414691594288234e-05, + "loss": 0.0616, + "step": 13120 + }, + { + "epoch": 31.0, + "grad_norm": 0.11971423774957657, + "learning_rate": 3.139629029883704e-05, + "loss": 0.026, + "step": 13130 + }, + { + "epoch": 31.0, + "grad_norm": 24.187334060668945, + "learning_rate": 3.137788900338584e-05, + "loss": 0.0959, + "step": 13140 + }, + { + "epoch": 31.0, + "grad_norm": 0.07652156054973602, + "learning_rate": 3.135948770793464e-05, + "loss": 0.0014, + "step": 13150 + }, + { + "epoch": 31.0, + "grad_norm": 0.26165226101875305, + "learning_rate": 3.134108641248344e-05, + "loss": 0.0068, + "step": 13160 + }, + { + "epoch": 31.0, + "grad_norm": 0.27254414558410645, + "learning_rate": 3.132268511703224e-05, + "loss": 0.0008, + "step": 13170 + }, + { + "epoch": 31.01, + "grad_norm": 10.051579475402832, + "learning_rate": 3.130428382158104e-05, + "loss": 0.1076, + "step": 13180 + }, + { + "epoch": 31.01, + "grad_norm": 0.21670132875442505, + "learning_rate": 3.128588252612984e-05, + "loss": 0.0178, + "step": 13190 + }, + { + "epoch": 31.01, + "grad_norm": 7.604001522064209, + "learning_rate": 3.126748123067864e-05, + "loss": 0.0622, + "step": 13200 + }, + { + "epoch": 31.01, + "grad_norm": 0.004798478446900845, + "learning_rate": 3.124907993522744e-05, + "loss": 0.04, + "step": 13210 + }, + { + "epoch": 31.01, + "grad_norm": 0.05649665370583534, + "learning_rate": 3.123067863977624e-05, + "loss": 0.0325, + "step": 13220 + }, + { + "epoch": 31.01, + "grad_norm": 0.013989850878715515, + "learning_rate": 3.121227734432504e-05, + "loss": 0.0669, + "step": 13230 + }, + { + "epoch": 31.01, + "grad_norm": 0.2536843717098236, + "learning_rate": 3.119387604887384e-05, + "loss": 0.0344, + "step": 13240 + }, + { + "epoch": 31.01, + "grad_norm": 18.08637809753418, + "learning_rate": 3.117547475342264e-05, + "loss": 0.1827, + "step": 13250 + }, + { + "epoch": 31.01, + "grad_norm": 0.4526243209838867, + "learning_rate": 3.115707345797144e-05, + "loss": 0.0668, + "step": 13260 + }, + { + "epoch": 31.01, + "grad_norm": 26.46347999572754, + "learning_rate": 3.113867216252024e-05, + "loss": 0.0317, + "step": 13270 + }, + { + "epoch": 31.01, + "grad_norm": 10.450908660888672, + "learning_rate": 3.112027086706904e-05, + "loss": 0.1146, + "step": 13280 + }, + { + "epoch": 31.01, + "grad_norm": 15.456535339355469, + "learning_rate": 3.110186957161784e-05, + "loss": 0.0594, + "step": 13290 + }, + { + "epoch": 31.01, + "grad_norm": 0.1086319163441658, + "learning_rate": 3.108346827616664e-05, + "loss": 0.0602, + "step": 13300 + }, + { + "epoch": 31.01, + "grad_norm": 14.730328559875488, + "learning_rate": 3.106506698071544e-05, + "loss": 0.106, + "step": 13310 + }, + { + "epoch": 31.01, + "grad_norm": 37.09258270263672, + "learning_rate": 3.104666568526424e-05, + "loss": 0.0091, + "step": 13320 + }, + { + "epoch": 31.01, + "grad_norm": 1.7202033996582031, + "learning_rate": 3.1028264389813043e-05, + "loss": 0.1634, + "step": 13330 + }, + { + "epoch": 31.01, + "grad_norm": 11.892850875854492, + "learning_rate": 3.1009863094361844e-05, + "loss": 0.0566, + "step": 13340 + }, + { + "epoch": 31.01, + "grad_norm": 0.0069226413033902645, + "learning_rate": 3.0991461798910644e-05, + "loss": 0.0477, + "step": 13350 + }, + { + "epoch": 31.01, + "grad_norm": 46.1098518371582, + "learning_rate": 3.0973060503459444e-05, + "loss": 0.0911, + "step": 13360 + }, + { + "epoch": 31.01, + "grad_norm": 0.013168955221772194, + "learning_rate": 3.0954659208008244e-05, + "loss": 0.0538, + "step": 13370 + }, + { + "epoch": 31.01, + "grad_norm": 27.440155029296875, + "learning_rate": 3.0936257912557044e-05, + "loss": 0.1327, + "step": 13380 + }, + { + "epoch": 31.01, + "grad_norm": 0.0305141843855381, + "learning_rate": 3.0917856617105844e-05, + "loss": 0.039, + "step": 13390 + }, + { + "epoch": 31.01, + "grad_norm": 0.005779525265097618, + "learning_rate": 3.0899455321654644e-05, + "loss": 0.0019, + "step": 13400 + }, + { + "epoch": 31.01, + "grad_norm": 0.060245249420404434, + "learning_rate": 3.088105402620345e-05, + "loss": 0.0031, + "step": 13410 + }, + { + "epoch": 31.01, + "grad_norm": 0.0227971151471138, + "learning_rate": 3.0862652730752244e-05, + "loss": 0.0631, + "step": 13420 + }, + { + "epoch": 31.01, + "grad_norm": 0.02739185467362404, + "learning_rate": 3.0844251435301045e-05, + "loss": 0.0543, + "step": 13430 + }, + { + "epoch": 31.01, + "grad_norm": 0.001662073889747262, + "learning_rate": 3.082585013984985e-05, + "loss": 0.0435, + "step": 13440 + }, + { + "epoch": 31.01, + "eval_accuracy": 0.6948198198198198, + "eval_loss": 1.8911927938461304, + "eval_runtime": 38.7968, + "eval_samples_per_second": 22.888, + "eval_steps_per_second": 1.907, + "step": 13440 + }, + { + "epoch": 32.0, + "grad_norm": 0.0038868181873112917, + "learning_rate": 3.0807448844398645e-05, + "loss": 0.1149, + "step": 13450 + }, + { + "epoch": 32.0, + "grad_norm": 0.19289876520633698, + "learning_rate": 3.0789047548947445e-05, + "loss": 0.1026, + "step": 13460 + }, + { + "epoch": 32.0, + "grad_norm": 0.35097235441207886, + "learning_rate": 3.077064625349625e-05, + "loss": 0.0466, + "step": 13470 + }, + { + "epoch": 32.0, + "grad_norm": 0.012586521916091442, + "learning_rate": 3.0752244958045045e-05, + "loss": 0.0783, + "step": 13480 + }, + { + "epoch": 32.0, + "grad_norm": 0.3049808442592621, + "learning_rate": 3.0733843662593845e-05, + "loss": 0.1405, + "step": 13490 + }, + { + "epoch": 32.0, + "grad_norm": 0.13303309679031372, + "learning_rate": 3.071544236714265e-05, + "loss": 0.0215, + "step": 13500 + }, + { + "epoch": 32.0, + "grad_norm": 0.060418274253606796, + "learning_rate": 3.0697041071691445e-05, + "loss": 0.0711, + "step": 13510 + }, + { + "epoch": 32.0, + "grad_norm": 0.021137768402695656, + "learning_rate": 3.0678639776240246e-05, + "loss": 0.0407, + "step": 13520 + }, + { + "epoch": 32.0, + "grad_norm": 0.027525225654244423, + "learning_rate": 3.066023848078905e-05, + "loss": 0.0626, + "step": 13530 + }, + { + "epoch": 32.0, + "grad_norm": 32.222984313964844, + "learning_rate": 3.064183718533785e-05, + "loss": 0.1088, + "step": 13540 + }, + { + "epoch": 32.0, + "grad_norm": 6.560800075531006, + "learning_rate": 3.0623435889886646e-05, + "loss": 0.0797, + "step": 13550 + }, + { + "epoch": 32.0, + "grad_norm": 18.688661575317383, + "learning_rate": 3.060503459443545e-05, + "loss": 0.0968, + "step": 13560 + }, + { + "epoch": 32.0, + "grad_norm": 0.02211933769285679, + "learning_rate": 3.058663329898425e-05, + "loss": 0.1444, + "step": 13570 + }, + { + "epoch": 32.0, + "grad_norm": 0.02764921449124813, + "learning_rate": 3.0568232003533046e-05, + "loss": 0.0071, + "step": 13580 + }, + { + "epoch": 32.0, + "grad_norm": 0.03323595970869064, + "learning_rate": 3.054983070808185e-05, + "loss": 0.0033, + "step": 13590 + }, + { + "epoch": 32.01, + "grad_norm": 21.21465301513672, + "learning_rate": 3.053142941263065e-05, + "loss": 0.0496, + "step": 13600 + }, + { + "epoch": 32.01, + "grad_norm": 4.1312432289123535, + "learning_rate": 3.051302811717945e-05, + "loss": 0.0679, + "step": 13610 + }, + { + "epoch": 32.01, + "grad_norm": 0.027629682794213295, + "learning_rate": 3.049462682172825e-05, + "loss": 0.0377, + "step": 13620 + }, + { + "epoch": 32.01, + "grad_norm": 0.06411249935626984, + "learning_rate": 3.0476225526277054e-05, + "loss": 0.0027, + "step": 13630 + }, + { + "epoch": 32.01, + "grad_norm": 28.367021560668945, + "learning_rate": 3.0457824230825854e-05, + "loss": 0.1851, + "step": 13640 + }, + { + "epoch": 32.01, + "grad_norm": 0.14021526277065277, + "learning_rate": 3.043942293537465e-05, + "loss": 0.0068, + "step": 13650 + }, + { + "epoch": 32.01, + "grad_norm": 0.00497056171298027, + "learning_rate": 3.0421021639923454e-05, + "loss": 0.0096, + "step": 13660 + }, + { + "epoch": 32.01, + "grad_norm": 0.09903844445943832, + "learning_rate": 3.0402620344472254e-05, + "loss": 0.0247, + "step": 13670 + }, + { + "epoch": 32.01, + "grad_norm": 0.24359776079654694, + "learning_rate": 3.038421904902105e-05, + "loss": 0.035, + "step": 13680 + }, + { + "epoch": 32.01, + "grad_norm": 17.976119995117188, + "learning_rate": 3.0365817753569854e-05, + "loss": 0.1399, + "step": 13690 + }, + { + "epoch": 32.01, + "grad_norm": 0.016763564199209213, + "learning_rate": 3.0347416458118654e-05, + "loss": 0.0012, + "step": 13700 + }, + { + "epoch": 32.01, + "grad_norm": 0.03081514686346054, + "learning_rate": 3.032901516266745e-05, + "loss": 0.0606, + "step": 13710 + }, + { + "epoch": 32.01, + "grad_norm": 0.020975248888134956, + "learning_rate": 3.0310613867216255e-05, + "loss": 0.1219, + "step": 13720 + }, + { + "epoch": 32.01, + "grad_norm": 0.08395440131425858, + "learning_rate": 3.0292212571765055e-05, + "loss": 0.1052, + "step": 13730 + }, + { + "epoch": 32.01, + "grad_norm": 0.04376498609781265, + "learning_rate": 3.027381127631385e-05, + "loss": 0.1467, + "step": 13740 + }, + { + "epoch": 32.01, + "grad_norm": 0.012876118533313274, + "learning_rate": 3.0255409980862655e-05, + "loss": 0.0342, + "step": 13750 + }, + { + "epoch": 32.01, + "grad_norm": 0.1162232756614685, + "learning_rate": 3.0237008685411455e-05, + "loss": 0.1335, + "step": 13760 + }, + { + "epoch": 32.01, + "grad_norm": 0.05354901775717735, + "learning_rate": 3.021860738996026e-05, + "loss": 0.1076, + "step": 13770 + }, + { + "epoch": 32.01, + "grad_norm": 0.008597791194915771, + "learning_rate": 3.0200206094509055e-05, + "loss": 0.0528, + "step": 13780 + }, + { + "epoch": 32.01, + "grad_norm": 0.40106403827667236, + "learning_rate": 3.0181804799057855e-05, + "loss": 0.0025, + "step": 13790 + }, + { + "epoch": 32.01, + "grad_norm": 0.02502295933663845, + "learning_rate": 3.016340350360666e-05, + "loss": 0.1028, + "step": 13800 + }, + { + "epoch": 32.01, + "grad_norm": 0.009747982025146484, + "learning_rate": 3.0145002208155452e-05, + "loss": 0.0218, + "step": 13810 + }, + { + "epoch": 32.01, + "grad_norm": 0.27103859186172485, + "learning_rate": 3.0126600912704256e-05, + "loss": 0.0131, + "step": 13820 + }, + { + "epoch": 32.01, + "grad_norm": 0.07821822911500931, + "learning_rate": 3.010819961725306e-05, + "loss": 0.0508, + "step": 13830 + }, + { + "epoch": 32.01, + "grad_norm": 7.904542446136475, + "learning_rate": 3.0089798321801853e-05, + "loss": 0.0756, + "step": 13840 + }, + { + "epoch": 32.01, + "grad_norm": 0.14019513130187988, + "learning_rate": 3.0071397026350656e-05, + "loss": 0.1796, + "step": 13850 + }, + { + "epoch": 32.01, + "grad_norm": 0.00757236173376441, + "learning_rate": 3.005299573089946e-05, + "loss": 0.0268, + "step": 13860 + }, + { + "epoch": 32.01, + "eval_accuracy": 0.7286036036036037, + "eval_loss": 1.5766730308532715, + "eval_runtime": 39.4733, + "eval_samples_per_second": 22.496, + "eval_steps_per_second": 1.875, + "step": 13860 + }, + { + "epoch": 33.0, + "grad_norm": 0.5436367392539978, + "learning_rate": 3.0034594435448253e-05, + "loss": 0.052, + "step": 13870 + }, + { + "epoch": 33.0, + "grad_norm": 0.02723160944879055, + "learning_rate": 3.0016193139997057e-05, + "loss": 0.0698, + "step": 13880 + }, + { + "epoch": 33.0, + "grad_norm": 0.005910804029554129, + "learning_rate": 2.999779184454586e-05, + "loss": 0.0185, + "step": 13890 + }, + { + "epoch": 33.0, + "grad_norm": 5.659780025482178, + "learning_rate": 2.997939054909466e-05, + "loss": 0.1311, + "step": 13900 + }, + { + "epoch": 33.0, + "grad_norm": 9.268265724182129, + "learning_rate": 2.9960989253643457e-05, + "loss": 0.089, + "step": 13910 + }, + { + "epoch": 33.0, + "grad_norm": 0.08459486067295074, + "learning_rate": 2.9942587958192257e-05, + "loss": 0.0104, + "step": 13920 + }, + { + "epoch": 33.0, + "grad_norm": 0.17442312836647034, + "learning_rate": 2.992418666274106e-05, + "loss": 0.0581, + "step": 13930 + }, + { + "epoch": 33.0, + "grad_norm": 0.04593927040696144, + "learning_rate": 2.9905785367289857e-05, + "loss": 0.0009, + "step": 13940 + }, + { + "epoch": 33.0, + "grad_norm": 0.023078270256519318, + "learning_rate": 2.9887384071838657e-05, + "loss": 0.0013, + "step": 13950 + }, + { + "epoch": 33.0, + "grad_norm": 0.07796052098274231, + "learning_rate": 2.986898277638746e-05, + "loss": 0.0006, + "step": 13960 + }, + { + "epoch": 33.0, + "grad_norm": 0.8386691212654114, + "learning_rate": 2.9850581480936258e-05, + "loss": 0.0464, + "step": 13970 + }, + { + "epoch": 33.0, + "grad_norm": 0.004704775754362345, + "learning_rate": 2.9832180185485058e-05, + "loss": 0.0561, + "step": 13980 + }, + { + "epoch": 33.0, + "grad_norm": 16.554256439208984, + "learning_rate": 2.981377889003386e-05, + "loss": 0.0981, + "step": 13990 + }, + { + "epoch": 33.0, + "grad_norm": 0.007485406938940287, + "learning_rate": 2.9795377594582658e-05, + "loss": 0.0493, + "step": 14000 + }, + { + "epoch": 33.0, + "grad_norm": 0.03169386461377144, + "learning_rate": 2.9776976299131458e-05, + "loss": 0.0035, + "step": 14010 + }, + { + "epoch": 33.01, + "grad_norm": 0.021181074902415276, + "learning_rate": 2.975857500368026e-05, + "loss": 0.0241, + "step": 14020 + }, + { + "epoch": 33.01, + "grad_norm": 0.13125194609165192, + "learning_rate": 2.974017370822906e-05, + "loss": 0.0269, + "step": 14030 + }, + { + "epoch": 33.01, + "grad_norm": 60.82809829711914, + "learning_rate": 2.972177241277786e-05, + "loss": 0.1368, + "step": 14040 + }, + { + "epoch": 33.01, + "grad_norm": 1.089142084121704, + "learning_rate": 2.9703371117326662e-05, + "loss": 0.0073, + "step": 14050 + }, + { + "epoch": 33.01, + "grad_norm": 13.938457489013672, + "learning_rate": 2.9684969821875462e-05, + "loss": 0.0849, + "step": 14060 + }, + { + "epoch": 33.01, + "grad_norm": 0.70406574010849, + "learning_rate": 2.966656852642426e-05, + "loss": 0.1026, + "step": 14070 + }, + { + "epoch": 33.01, + "grad_norm": 0.01301854383200407, + "learning_rate": 2.9648167230973062e-05, + "loss": 0.0848, + "step": 14080 + }, + { + "epoch": 33.01, + "grad_norm": 0.006891491822898388, + "learning_rate": 2.9629765935521862e-05, + "loss": 0.0399, + "step": 14090 + }, + { + "epoch": 33.01, + "grad_norm": 0.010800345800817013, + "learning_rate": 2.961136464007066e-05, + "loss": 0.0005, + "step": 14100 + }, + { + "epoch": 33.01, + "grad_norm": 0.006483331322669983, + "learning_rate": 2.9592963344619463e-05, + "loss": 0.1803, + "step": 14110 + }, + { + "epoch": 33.01, + "grad_norm": 0.0052506220526993275, + "learning_rate": 2.9574562049168263e-05, + "loss": 0.0681, + "step": 14120 + }, + { + "epoch": 33.01, + "grad_norm": 6.714112758636475, + "learning_rate": 2.955616075371706e-05, + "loss": 0.1035, + "step": 14130 + }, + { + "epoch": 33.01, + "grad_norm": 6.365170955657959, + "learning_rate": 2.9537759458265863e-05, + "loss": 0.1068, + "step": 14140 + }, + { + "epoch": 33.01, + "grad_norm": 0.1516963392496109, + "learning_rate": 2.9519358162814663e-05, + "loss": 0.0267, + "step": 14150 + }, + { + "epoch": 33.01, + "grad_norm": 0.44562458992004395, + "learning_rate": 2.9500956867363467e-05, + "loss": 0.0887, + "step": 14160 + }, + { + "epoch": 33.01, + "grad_norm": 0.04399920254945755, + "learning_rate": 2.9482555571912263e-05, + "loss": 0.0365, + "step": 14170 + }, + { + "epoch": 33.01, + "grad_norm": 0.004122023470699787, + "learning_rate": 2.9464154276461063e-05, + "loss": 0.0552, + "step": 14180 + }, + { + "epoch": 33.01, + "grad_norm": 38.76673126220703, + "learning_rate": 2.9445752981009867e-05, + "loss": 0.0167, + "step": 14190 + }, + { + "epoch": 33.01, + "grad_norm": 16.6954288482666, + "learning_rate": 2.9427351685558664e-05, + "loss": 0.1445, + "step": 14200 + }, + { + "epoch": 33.01, + "grad_norm": 0.03708453103899956, + "learning_rate": 2.9408950390107464e-05, + "loss": 0.0057, + "step": 14210 + }, + { + "epoch": 33.01, + "grad_norm": 0.042344264686107635, + "learning_rate": 2.9390549094656267e-05, + "loss": 0.0747, + "step": 14220 + }, + { + "epoch": 33.01, + "grad_norm": 31.488584518432617, + "learning_rate": 2.9372147799205064e-05, + "loss": 0.1425, + "step": 14230 + }, + { + "epoch": 33.01, + "grad_norm": 81.25188446044922, + "learning_rate": 2.9353746503753864e-05, + "loss": 0.0618, + "step": 14240 + }, + { + "epoch": 33.01, + "grad_norm": 0.0070849936455488205, + "learning_rate": 2.9335345208302668e-05, + "loss": 0.07, + "step": 14250 + }, + { + "epoch": 33.01, + "grad_norm": 0.09240752458572388, + "learning_rate": 2.9316943912851468e-05, + "loss": 0.0872, + "step": 14260 + }, + { + "epoch": 33.01, + "grad_norm": 0.10065799206495285, + "learning_rate": 2.9298542617400264e-05, + "loss": 0.0012, + "step": 14270 + }, + { + "epoch": 33.01, + "grad_norm": 0.11566521972417831, + "learning_rate": 2.9280141321949068e-05, + "loss": 0.0487, + "step": 14280 + }, + { + "epoch": 33.01, + "eval_accuracy": 0.6948198198198198, + "eval_loss": 1.64386785030365, + "eval_runtime": 40.8874, + "eval_samples_per_second": 21.718, + "eval_steps_per_second": 1.81, + "step": 14280 + }, + { + "epoch": 34.0, + "grad_norm": 0.012225592508912086, + "learning_rate": 2.9261740026497868e-05, + "loss": 0.002, + "step": 14290 + }, + { + "epoch": 34.0, + "grad_norm": 0.0045622168108820915, + "learning_rate": 2.9243338731046665e-05, + "loss": 0.0388, + "step": 14300 + }, + { + "epoch": 34.0, + "grad_norm": 5.046571731567383, + "learning_rate": 2.9224937435595468e-05, + "loss": 0.0706, + "step": 14310 + }, + { + "epoch": 34.0, + "grad_norm": 9.082304000854492, + "learning_rate": 2.920653614014427e-05, + "loss": 0.018, + "step": 14320 + }, + { + "epoch": 34.0, + "grad_norm": 18.434141159057617, + "learning_rate": 2.9188134844693065e-05, + "loss": 0.0919, + "step": 14330 + }, + { + "epoch": 34.0, + "grad_norm": 21.874008178710938, + "learning_rate": 2.916973354924187e-05, + "loss": 0.0768, + "step": 14340 + }, + { + "epoch": 34.0, + "grad_norm": 0.020802170038223267, + "learning_rate": 2.915133225379067e-05, + "loss": 0.0008, + "step": 14350 + }, + { + "epoch": 34.0, + "grad_norm": 0.00489756790921092, + "learning_rate": 2.9132930958339465e-05, + "loss": 0.0451, + "step": 14360 + }, + { + "epoch": 34.0, + "grad_norm": 0.21378767490386963, + "learning_rate": 2.911452966288827e-05, + "loss": 0.0487, + "step": 14370 + }, + { + "epoch": 34.0, + "grad_norm": 0.009158837608993053, + "learning_rate": 2.909612836743707e-05, + "loss": 0.0993, + "step": 14380 + }, + { + "epoch": 34.0, + "grad_norm": 0.006322511006146669, + "learning_rate": 2.9077727071985873e-05, + "loss": 0.0008, + "step": 14390 + }, + { + "epoch": 34.0, + "grad_norm": 0.017560819163918495, + "learning_rate": 2.905932577653467e-05, + "loss": 0.0038, + "step": 14400 + }, + { + "epoch": 34.0, + "grad_norm": 0.01258725207298994, + "learning_rate": 2.904092448108347e-05, + "loss": 0.0009, + "step": 14410 + }, + { + "epoch": 34.0, + "grad_norm": 0.04801618680357933, + "learning_rate": 2.9022523185632273e-05, + "loss": 0.0177, + "step": 14420 + }, + { + "epoch": 34.0, + "grad_norm": 0.12019483745098114, + "learning_rate": 2.900412189018107e-05, + "loss": 0.0011, + "step": 14430 + }, + { + "epoch": 34.01, + "grad_norm": 0.029142582789063454, + "learning_rate": 2.898572059472987e-05, + "loss": 0.0014, + "step": 14440 + }, + { + "epoch": 34.01, + "grad_norm": 0.02893805503845215, + "learning_rate": 2.8967319299278673e-05, + "loss": 0.0474, + "step": 14450 + }, + { + "epoch": 34.01, + "grad_norm": 0.022320715710520744, + "learning_rate": 2.894891800382747e-05, + "loss": 0.0351, + "step": 14460 + }, + { + "epoch": 34.01, + "grad_norm": 0.006042890250682831, + "learning_rate": 2.893051670837627e-05, + "loss": 0.0643, + "step": 14470 + }, + { + "epoch": 34.01, + "grad_norm": 0.5463034510612488, + "learning_rate": 2.8912115412925074e-05, + "loss": 0.1444, + "step": 14480 + }, + { + "epoch": 34.01, + "grad_norm": 0.030179716646671295, + "learning_rate": 2.889371411747387e-05, + "loss": 0.0242, + "step": 14490 + }, + { + "epoch": 34.01, + "grad_norm": 0.007446791976690292, + "learning_rate": 2.887531282202267e-05, + "loss": 0.0257, + "step": 14500 + }, + { + "epoch": 34.01, + "grad_norm": 1.6489183902740479, + "learning_rate": 2.8856911526571474e-05, + "loss": 0.0958, + "step": 14510 + }, + { + "epoch": 34.01, + "grad_norm": 0.004721594974398613, + "learning_rate": 2.8838510231120274e-05, + "loss": 0.0324, + "step": 14520 + }, + { + "epoch": 34.01, + "grad_norm": 0.061093464493751526, + "learning_rate": 2.882010893566907e-05, + "loss": 0.0235, + "step": 14530 + }, + { + "epoch": 34.01, + "grad_norm": 0.0076067266054451466, + "learning_rate": 2.8801707640217874e-05, + "loss": 0.072, + "step": 14540 + }, + { + "epoch": 34.01, + "grad_norm": 0.032138094305992126, + "learning_rate": 2.8783306344766674e-05, + "loss": 0.0008, + "step": 14550 + }, + { + "epoch": 34.01, + "grad_norm": 0.07343176752328873, + "learning_rate": 2.876490504931547e-05, + "loss": 0.0391, + "step": 14560 + }, + { + "epoch": 34.01, + "grad_norm": 0.003957556094974279, + "learning_rate": 2.8746503753864275e-05, + "loss": 0.0057, + "step": 14570 + }, + { + "epoch": 34.01, + "grad_norm": 4.1355366706848145, + "learning_rate": 2.8728102458413075e-05, + "loss": 0.0859, + "step": 14580 + }, + { + "epoch": 34.01, + "grad_norm": 0.026744043454527855, + "learning_rate": 2.870970116296187e-05, + "loss": 0.0972, + "step": 14590 + }, + { + "epoch": 34.01, + "grad_norm": 0.02687433548271656, + "learning_rate": 2.8691299867510675e-05, + "loss": 0.0012, + "step": 14600 + }, + { + "epoch": 34.01, + "grad_norm": 1.3930257558822632, + "learning_rate": 2.8672898572059475e-05, + "loss": 0.1244, + "step": 14610 + }, + { + "epoch": 34.01, + "grad_norm": 0.3148985505104065, + "learning_rate": 2.8654497276608272e-05, + "loss": 0.1685, + "step": 14620 + }, + { + "epoch": 34.01, + "grad_norm": 0.009437570348381996, + "learning_rate": 2.8636095981157075e-05, + "loss": 0.0461, + "step": 14630 + }, + { + "epoch": 34.01, + "grad_norm": 36.728065490722656, + "learning_rate": 2.8617694685705875e-05, + "loss": 0.0151, + "step": 14640 + }, + { + "epoch": 34.01, + "grad_norm": 0.02354242280125618, + "learning_rate": 2.859929339025468e-05, + "loss": 0.0082, + "step": 14650 + }, + { + "epoch": 34.01, + "grad_norm": 0.450647234916687, + "learning_rate": 2.8580892094803476e-05, + "loss": 0.0035, + "step": 14660 + }, + { + "epoch": 34.01, + "grad_norm": 0.03615148365497589, + "learning_rate": 2.8562490799352276e-05, + "loss": 0.0476, + "step": 14670 + }, + { + "epoch": 34.01, + "grad_norm": 0.051935531198978424, + "learning_rate": 2.854408950390108e-05, + "loss": 0.001, + "step": 14680 + }, + { + "epoch": 34.01, + "grad_norm": 0.05409989133477211, + "learning_rate": 2.8525688208449876e-05, + "loss": 0.0746, + "step": 14690 + }, + { + "epoch": 34.01, + "grad_norm": 0.02356979064643383, + "learning_rate": 2.8507286912998676e-05, + "loss": 0.0448, + "step": 14700 + }, + { + "epoch": 34.01, + "eval_accuracy": 0.7353603603603603, + "eval_loss": 1.5989632606506348, + "eval_runtime": 40.4825, + "eval_samples_per_second": 21.935, + "eval_steps_per_second": 1.828, + "step": 14700 + }, + { + "epoch": 35.0, + "grad_norm": 0.008124900050461292, + "learning_rate": 2.848888561754748e-05, + "loss": 0.049, + "step": 14710 + }, + { + "epoch": 35.0, + "grad_norm": 0.017850523814558983, + "learning_rate": 2.8470484322096276e-05, + "loss": 0.0283, + "step": 14720 + }, + { + "epoch": 35.0, + "grad_norm": 0.013829112984240055, + "learning_rate": 2.8452083026645077e-05, + "loss": 0.0005, + "step": 14730 + }, + { + "epoch": 35.0, + "grad_norm": 0.21093538403511047, + "learning_rate": 2.843368173119388e-05, + "loss": 0.0046, + "step": 14740 + }, + { + "epoch": 35.0, + "grad_norm": 30.419967651367188, + "learning_rate": 2.8415280435742673e-05, + "loss": 0.0616, + "step": 14750 + }, + { + "epoch": 35.0, + "grad_norm": 5.482494354248047, + "learning_rate": 2.8396879140291477e-05, + "loss": 0.196, + "step": 14760 + }, + { + "epoch": 35.0, + "grad_norm": 2.091125726699829, + "learning_rate": 2.837847784484028e-05, + "loss": 0.001, + "step": 14770 + }, + { + "epoch": 35.0, + "grad_norm": 0.0029685271438211203, + "learning_rate": 2.836007654938908e-05, + "loss": 0.0625, + "step": 14780 + }, + { + "epoch": 35.0, + "grad_norm": 0.013809144496917725, + "learning_rate": 2.8341675253937877e-05, + "loss": 0.0004, + "step": 14790 + }, + { + "epoch": 35.0, + "grad_norm": 0.007634790614247322, + "learning_rate": 2.832327395848668e-05, + "loss": 0.0817, + "step": 14800 + }, + { + "epoch": 35.0, + "grad_norm": 4.974092960357666, + "learning_rate": 2.830487266303548e-05, + "loss": 0.0542, + "step": 14810 + }, + { + "epoch": 35.0, + "grad_norm": 0.005965742748230696, + "learning_rate": 2.8286471367584278e-05, + "loss": 0.0003, + "step": 14820 + }, + { + "epoch": 35.0, + "grad_norm": 0.0047448077239096165, + "learning_rate": 2.8268070072133078e-05, + "loss": 0.0247, + "step": 14830 + }, + { + "epoch": 35.0, + "grad_norm": 0.0023388988338410854, + "learning_rate": 2.824966877668188e-05, + "loss": 0.0048, + "step": 14840 + }, + { + "epoch": 35.0, + "grad_norm": 0.0013745896285399795, + "learning_rate": 2.8231267481230678e-05, + "loss": 0.0077, + "step": 14850 + }, + { + "epoch": 35.01, + "grad_norm": 6.498810768127441, + "learning_rate": 2.8212866185779478e-05, + "loss": 0.0423, + "step": 14860 + }, + { + "epoch": 35.01, + "grad_norm": 0.02234739437699318, + "learning_rate": 2.819446489032828e-05, + "loss": 0.0649, + "step": 14870 + }, + { + "epoch": 35.01, + "grad_norm": 1.1391817331314087, + "learning_rate": 2.8176063594877085e-05, + "loss": 0.0073, + "step": 14880 + }, + { + "epoch": 35.01, + "grad_norm": 0.019465837627649307, + "learning_rate": 2.815766229942588e-05, + "loss": 0.0364, + "step": 14890 + }, + { + "epoch": 35.01, + "grad_norm": 0.005256633274257183, + "learning_rate": 2.8139261003974682e-05, + "loss": 0.1411, + "step": 14900 + }, + { + "epoch": 35.01, + "grad_norm": 34.344825744628906, + "learning_rate": 2.8120859708523485e-05, + "loss": 0.007, + "step": 14910 + }, + { + "epoch": 35.01, + "grad_norm": 0.020804625004529953, + "learning_rate": 2.810245841307228e-05, + "loss": 0.0598, + "step": 14920 + }, + { + "epoch": 35.01, + "grad_norm": 0.02393057756125927, + "learning_rate": 2.8084057117621082e-05, + "loss": 0.1846, + "step": 14930 + }, + { + "epoch": 35.01, + "grad_norm": 27.570369720458984, + "learning_rate": 2.8065655822169882e-05, + "loss": 0.1569, + "step": 14940 + }, + { + "epoch": 35.01, + "grad_norm": 31.2330322265625, + "learning_rate": 2.804725452671868e-05, + "loss": 0.0561, + "step": 14950 + }, + { + "epoch": 35.01, + "grad_norm": 0.04531894251704216, + "learning_rate": 2.8028853231267483e-05, + "loss": 0.1036, + "step": 14960 + }, + { + "epoch": 35.01, + "grad_norm": 0.06247282400727272, + "learning_rate": 2.8010451935816283e-05, + "loss": 0.123, + "step": 14970 + }, + { + "epoch": 35.01, + "grad_norm": 0.027552228420972824, + "learning_rate": 2.799205064036508e-05, + "loss": 0.0291, + "step": 14980 + }, + { + "epoch": 35.01, + "grad_norm": 0.035376742482185364, + "learning_rate": 2.7973649344913883e-05, + "loss": 0.0458, + "step": 14990 + }, + { + "epoch": 35.01, + "grad_norm": 0.054985884577035904, + "learning_rate": 2.7955248049462683e-05, + "loss": 0.0864, + "step": 15000 + }, + { + "epoch": 35.01, + "grad_norm": 0.0469009093940258, + "learning_rate": 2.7936846754011487e-05, + "loss": 0.0442, + "step": 15010 + }, + { + "epoch": 35.01, + "grad_norm": 0.2176922708749771, + "learning_rate": 2.7918445458560283e-05, + "loss": 0.0718, + "step": 15020 + }, + { + "epoch": 35.01, + "grad_norm": 6.875114917755127, + "learning_rate": 2.7900044163109083e-05, + "loss": 0.0042, + "step": 15030 + }, + { + "epoch": 35.01, + "grad_norm": 59.076351165771484, + "learning_rate": 2.7881642867657887e-05, + "loss": 0.0111, + "step": 15040 + }, + { + "epoch": 35.01, + "grad_norm": 0.07590507715940475, + "learning_rate": 2.7863241572206684e-05, + "loss": 0.0776, + "step": 15050 + }, + { + "epoch": 35.01, + "grad_norm": 39.765769958496094, + "learning_rate": 2.7844840276755484e-05, + "loss": 0.0745, + "step": 15060 + }, + { + "epoch": 35.01, + "grad_norm": 8.698269844055176, + "learning_rate": 2.7826438981304287e-05, + "loss": 0.1075, + "step": 15070 + }, + { + "epoch": 35.01, + "grad_norm": 0.08205238729715347, + "learning_rate": 2.7808037685853084e-05, + "loss": 0.0927, + "step": 15080 + }, + { + "epoch": 35.01, + "grad_norm": 0.0548090860247612, + "learning_rate": 2.7789636390401884e-05, + "loss": 0.0759, + "step": 15090 + }, + { + "epoch": 35.01, + "grad_norm": 0.2036578506231308, + "learning_rate": 2.7771235094950688e-05, + "loss": 0.0013, + "step": 15100 + }, + { + "epoch": 35.01, + "grad_norm": 0.007210288662463427, + "learning_rate": 2.7752833799499484e-05, + "loss": 0.0409, + "step": 15110 + }, + { + "epoch": 35.01, + "grad_norm": 0.8128947615623474, + "learning_rate": 2.7734432504048284e-05, + "loss": 0.0166, + "step": 15120 + }, + { + "epoch": 35.01, + "eval_accuracy": 0.7466216216216216, + "eval_loss": 1.3866125345230103, + "eval_runtime": 40.3674, + "eval_samples_per_second": 21.998, + "eval_steps_per_second": 1.833, + "step": 15120 + }, + { + "epoch": 36.0, + "grad_norm": 5.199777126312256, + "learning_rate": 2.7716031208597088e-05, + "loss": 0.0392, + "step": 15130 + }, + { + "epoch": 36.0, + "grad_norm": 0.014085162431001663, + "learning_rate": 2.7697629913145888e-05, + "loss": 0.0007, + "step": 15140 + }, + { + "epoch": 36.0, + "grad_norm": 0.006830631755292416, + "learning_rate": 2.7679228617694685e-05, + "loss": 0.0252, + "step": 15150 + }, + { + "epoch": 36.0, + "grad_norm": 0.01971651241183281, + "learning_rate": 2.7660827322243488e-05, + "loss": 0.0514, + "step": 15160 + }, + { + "epoch": 36.0, + "grad_norm": 0.06991241872310638, + "learning_rate": 2.764242602679229e-05, + "loss": 0.0034, + "step": 15170 + }, + { + "epoch": 36.0, + "grad_norm": 6.701659202575684, + "learning_rate": 2.7624024731341085e-05, + "loss": 0.0145, + "step": 15180 + }, + { + "epoch": 36.0, + "grad_norm": 0.02505069226026535, + "learning_rate": 2.760562343588989e-05, + "loss": 0.004, + "step": 15190 + }, + { + "epoch": 36.0, + "grad_norm": 0.0011081969132646918, + "learning_rate": 2.758722214043869e-05, + "loss": 0.0478, + "step": 15200 + }, + { + "epoch": 36.0, + "grad_norm": 0.22539083659648895, + "learning_rate": 2.7568820844987485e-05, + "loss": 0.001, + "step": 15210 + }, + { + "epoch": 36.0, + "grad_norm": 45.33222198486328, + "learning_rate": 2.755041954953629e-05, + "loss": 0.0577, + "step": 15220 + }, + { + "epoch": 36.0, + "grad_norm": 22.034912109375, + "learning_rate": 2.753201825408509e-05, + "loss": 0.0064, + "step": 15230 + }, + { + "epoch": 36.0, + "grad_norm": 0.002172585343942046, + "learning_rate": 2.7513616958633886e-05, + "loss": 0.0289, + "step": 15240 + }, + { + "epoch": 36.0, + "grad_norm": 0.3061286211013794, + "learning_rate": 2.749521566318269e-05, + "loss": 0.0004, + "step": 15250 + }, + { + "epoch": 36.0, + "grad_norm": 0.004251683130860329, + "learning_rate": 2.747681436773149e-05, + "loss": 0.146, + "step": 15260 + }, + { + "epoch": 36.0, + "grad_norm": 0.0011500741820782423, + "learning_rate": 2.7458413072280293e-05, + "loss": 0.0003, + "step": 15270 + }, + { + "epoch": 36.01, + "grad_norm": 0.009844346903264523, + "learning_rate": 2.744001177682909e-05, + "loss": 0.0534, + "step": 15280 + }, + { + "epoch": 36.01, + "grad_norm": 0.0010252447100356221, + "learning_rate": 2.742161048137789e-05, + "loss": 0.178, + "step": 15290 + }, + { + "epoch": 36.01, + "grad_norm": 0.0019560528453439474, + "learning_rate": 2.7403209185926693e-05, + "loss": 0.0295, + "step": 15300 + }, + { + "epoch": 36.01, + "grad_norm": 42.15266036987305, + "learning_rate": 2.738480789047549e-05, + "loss": 0.0355, + "step": 15310 + }, + { + "epoch": 36.01, + "grad_norm": 0.0029344751965254545, + "learning_rate": 2.736640659502429e-05, + "loss": 0.0258, + "step": 15320 + }, + { + "epoch": 36.01, + "grad_norm": 0.007348980288952589, + "learning_rate": 2.7348005299573094e-05, + "loss": 0.0184, + "step": 15330 + }, + { + "epoch": 36.01, + "grad_norm": 0.003109491430222988, + "learning_rate": 2.732960400412189e-05, + "loss": 0.0027, + "step": 15340 + }, + { + "epoch": 36.01, + "grad_norm": 2.0378973484039307, + "learning_rate": 2.731120270867069e-05, + "loss": 0.0007, + "step": 15350 + }, + { + "epoch": 36.01, + "grad_norm": 0.07484769821166992, + "learning_rate": 2.7292801413219494e-05, + "loss": 0.0012, + "step": 15360 + }, + { + "epoch": 36.01, + "grad_norm": 0.5420640110969543, + "learning_rate": 2.727440011776829e-05, + "loss": 0.044, + "step": 15370 + }, + { + "epoch": 36.01, + "grad_norm": 0.0010783092584460974, + "learning_rate": 2.725599882231709e-05, + "loss": 0.0591, + "step": 15380 + }, + { + "epoch": 36.01, + "grad_norm": 0.005851478781551123, + "learning_rate": 2.7237597526865894e-05, + "loss": 0.0002, + "step": 15390 + }, + { + "epoch": 36.01, + "grad_norm": 24.51445960998535, + "learning_rate": 2.7219196231414694e-05, + "loss": 0.0758, + "step": 15400 + }, + { + "epoch": 36.01, + "grad_norm": 59.16766357421875, + "learning_rate": 2.720079493596349e-05, + "loss": 0.0366, + "step": 15410 + }, + { + "epoch": 36.01, + "grad_norm": 0.007826605811715126, + "learning_rate": 2.7182393640512295e-05, + "loss": 0.0034, + "step": 15420 + }, + { + "epoch": 36.01, + "grad_norm": 0.11950548738241196, + "learning_rate": 2.7163992345061095e-05, + "loss": 0.1212, + "step": 15430 + }, + { + "epoch": 36.01, + "grad_norm": 0.668392539024353, + "learning_rate": 2.714559104960989e-05, + "loss": 0.0063, + "step": 15440 + }, + { + "epoch": 36.01, + "grad_norm": 49.121849060058594, + "learning_rate": 2.7127189754158695e-05, + "loss": 0.0223, + "step": 15450 + }, + { + "epoch": 36.01, + "grad_norm": 0.2821270227432251, + "learning_rate": 2.7108788458707495e-05, + "loss": 0.0005, + "step": 15460 + }, + { + "epoch": 36.01, + "grad_norm": 0.003207216504961252, + "learning_rate": 2.7090387163256292e-05, + "loss": 0.1271, + "step": 15470 + }, + { + "epoch": 36.01, + "grad_norm": 0.23750409483909607, + "learning_rate": 2.7071985867805095e-05, + "loss": 0.0904, + "step": 15480 + }, + { + "epoch": 36.01, + "grad_norm": 10.629240036010742, + "learning_rate": 2.7053584572353895e-05, + "loss": 0.0721, + "step": 15490 + }, + { + "epoch": 36.01, + "grad_norm": 0.029718786478042603, + "learning_rate": 2.7035183276902692e-05, + "loss": 0.0314, + "step": 15500 + }, + { + "epoch": 36.01, + "grad_norm": 0.00921417772769928, + "learning_rate": 2.7016781981451496e-05, + "loss": 0.0583, + "step": 15510 + }, + { + "epoch": 36.01, + "grad_norm": 0.024288874119520187, + "learning_rate": 2.6998380686000296e-05, + "loss": 0.0412, + "step": 15520 + }, + { + "epoch": 36.01, + "grad_norm": 0.01890076883137226, + "learning_rate": 2.69799793905491e-05, + "loss": 0.0013, + "step": 15530 + }, + { + "epoch": 36.01, + "grad_norm": 0.002824255032464862, + "learning_rate": 2.6961578095097896e-05, + "loss": 0.1029, + "step": 15540 + }, + { + "epoch": 36.01, + "eval_accuracy": 0.7105855855855856, + "eval_loss": 1.7426668405532837, + "eval_runtime": 39.0783, + "eval_samples_per_second": 22.724, + "eval_steps_per_second": 1.894, + "step": 15540 + }, + { + "epoch": 37.0, + "grad_norm": 0.017560964450240135, + "learning_rate": 2.6943176799646696e-05, + "loss": 0.0951, + "step": 15550 + }, + { + "epoch": 37.0, + "grad_norm": 0.04531846195459366, + "learning_rate": 2.69247755041955e-05, + "loss": 0.0723, + "step": 15560 + }, + { + "epoch": 37.0, + "grad_norm": 1.8820828199386597, + "learning_rate": 2.6906374208744296e-05, + "loss": 0.0335, + "step": 15570 + }, + { + "epoch": 37.0, + "grad_norm": 0.0036666542291641235, + "learning_rate": 2.6887972913293096e-05, + "loss": 0.0375, + "step": 15580 + }, + { + "epoch": 37.0, + "grad_norm": 0.002468695631250739, + "learning_rate": 2.68695716178419e-05, + "loss": 0.001, + "step": 15590 + }, + { + "epoch": 37.0, + "grad_norm": 0.207871675491333, + "learning_rate": 2.6851170322390697e-05, + "loss": 0.1084, + "step": 15600 + }, + { + "epoch": 37.0, + "grad_norm": 0.026301635429263115, + "learning_rate": 2.6832769026939497e-05, + "loss": 0.0007, + "step": 15610 + }, + { + "epoch": 37.0, + "grad_norm": 0.005130626726895571, + "learning_rate": 2.68143677314883e-05, + "loss": 0.1154, + "step": 15620 + }, + { + "epoch": 37.0, + "grad_norm": 0.1096784695982933, + "learning_rate": 2.67959664360371e-05, + "loss": 0.112, + "step": 15630 + }, + { + "epoch": 37.0, + "grad_norm": 0.00986840482801199, + "learning_rate": 2.6777565140585897e-05, + "loss": 0.0784, + "step": 15640 + }, + { + "epoch": 37.0, + "grad_norm": 0.677042543888092, + "learning_rate": 2.67591638451347e-05, + "loss": 0.0483, + "step": 15650 + }, + { + "epoch": 37.0, + "grad_norm": 0.00216664164327085, + "learning_rate": 2.67407625496835e-05, + "loss": 0.0014, + "step": 15660 + }, + { + "epoch": 37.0, + "grad_norm": 0.06850877404212952, + "learning_rate": 2.6722361254232298e-05, + "loss": 0.0023, + "step": 15670 + }, + { + "epoch": 37.0, + "grad_norm": 0.0250447578728199, + "learning_rate": 2.67039599587811e-05, + "loss": 0.0511, + "step": 15680 + }, + { + "epoch": 37.0, + "grad_norm": 0.02025407738983631, + "learning_rate": 2.66855586633299e-05, + "loss": 0.0628, + "step": 15690 + }, + { + "epoch": 37.01, + "grad_norm": 0.00478848721832037, + "learning_rate": 2.6667157367878698e-05, + "loss": 0.0897, + "step": 15700 + }, + { + "epoch": 37.01, + "grad_norm": 0.13232554495334625, + "learning_rate": 2.66487560724275e-05, + "loss": 0.0195, + "step": 15710 + }, + { + "epoch": 37.01, + "grad_norm": 0.03150768205523491, + "learning_rate": 2.66303547769763e-05, + "loss": 0.0366, + "step": 15720 + }, + { + "epoch": 37.01, + "grad_norm": 0.05836179479956627, + "learning_rate": 2.6611953481525098e-05, + "loss": 0.0235, + "step": 15730 + }, + { + "epoch": 37.01, + "grad_norm": 0.003542052349075675, + "learning_rate": 2.6593552186073902e-05, + "loss": 0.1149, + "step": 15740 + }, + { + "epoch": 37.01, + "grad_norm": 0.0033408894669264555, + "learning_rate": 2.6575150890622702e-05, + "loss": 0.0015, + "step": 15750 + }, + { + "epoch": 37.01, + "grad_norm": 0.00453279260545969, + "learning_rate": 2.6556749595171505e-05, + "loss": 0.001, + "step": 15760 + }, + { + "epoch": 37.01, + "grad_norm": 0.22416509687900543, + "learning_rate": 2.65383482997203e-05, + "loss": 0.0005, + "step": 15770 + }, + { + "epoch": 37.01, + "grad_norm": 0.07529207319021225, + "learning_rate": 2.6519947004269102e-05, + "loss": 0.0583, + "step": 15780 + }, + { + "epoch": 37.01, + "grad_norm": 90.49588012695312, + "learning_rate": 2.6501545708817906e-05, + "loss": 0.0168, + "step": 15790 + }, + { + "epoch": 37.01, + "grad_norm": 0.5191478133201599, + "learning_rate": 2.64831444133667e-05, + "loss": 0.0957, + "step": 15800 + }, + { + "epoch": 37.01, + "grad_norm": 0.003834774950519204, + "learning_rate": 2.6464743117915503e-05, + "loss": 0.0245, + "step": 15810 + }, + { + "epoch": 37.01, + "grad_norm": 0.020836833864450455, + "learning_rate": 2.6446341822464306e-05, + "loss": 0.0311, + "step": 15820 + }, + { + "epoch": 37.01, + "grad_norm": 31.80251693725586, + "learning_rate": 2.64279405270131e-05, + "loss": 0.1184, + "step": 15830 + }, + { + "epoch": 37.01, + "grad_norm": 0.004135098308324814, + "learning_rate": 2.6409539231561903e-05, + "loss": 0.146, + "step": 15840 + }, + { + "epoch": 37.01, + "grad_norm": 0.04643867164850235, + "learning_rate": 2.6391137936110706e-05, + "loss": 0.0443, + "step": 15850 + }, + { + "epoch": 37.01, + "grad_norm": 1.761141300201416, + "learning_rate": 2.63727366406595e-05, + "loss": 0.1349, + "step": 15860 + }, + { + "epoch": 37.01, + "grad_norm": 0.007804957218468189, + "learning_rate": 2.6354335345208303e-05, + "loss": 0.0093, + "step": 15870 + }, + { + "epoch": 37.01, + "grad_norm": 0.010941618122160435, + "learning_rate": 2.6335934049757103e-05, + "loss": 0.0335, + "step": 15880 + }, + { + "epoch": 37.01, + "grad_norm": 0.005491418763995171, + "learning_rate": 2.6317532754305907e-05, + "loss": 0.0461, + "step": 15890 + }, + { + "epoch": 37.01, + "grad_norm": 0.017632165923714638, + "learning_rate": 2.6299131458854704e-05, + "loss": 0.0007, + "step": 15900 + }, + { + "epoch": 37.01, + "grad_norm": 0.04650917276740074, + "learning_rate": 2.6280730163403504e-05, + "loss": 0.1037, + "step": 15910 + }, + { + "epoch": 37.01, + "grad_norm": 17.576862335205078, + "learning_rate": 2.6262328867952307e-05, + "loss": 0.0725, + "step": 15920 + }, + { + "epoch": 37.01, + "grad_norm": 0.09621303528547287, + "learning_rate": 2.6243927572501104e-05, + "loss": 0.0178, + "step": 15930 + }, + { + "epoch": 37.01, + "grad_norm": 10.42127799987793, + "learning_rate": 2.6225526277049904e-05, + "loss": 0.2358, + "step": 15940 + }, + { + "epoch": 37.01, + "grad_norm": 0.18157176673412323, + "learning_rate": 2.6207124981598708e-05, + "loss": 0.0535, + "step": 15950 + }, + { + "epoch": 37.01, + "grad_norm": 0.05556317791342735, + "learning_rate": 2.6188723686147504e-05, + "loss": 0.0678, + "step": 15960 + }, + { + "epoch": 37.01, + "eval_accuracy": 0.7364864864864865, + "eval_loss": 1.419447422027588, + "eval_runtime": 39.1526, + "eval_samples_per_second": 22.68, + "eval_steps_per_second": 1.89, + "step": 15960 + }, + { + "epoch": 38.0, + "grad_norm": 3.7612640857696533, + "learning_rate": 2.6170322390696304e-05, + "loss": 0.0745, + "step": 15970 + }, + { + "epoch": 38.0, + "grad_norm": 0.08191471546888351, + "learning_rate": 2.6151921095245108e-05, + "loss": 0.0784, + "step": 15980 + }, + { + "epoch": 38.0, + "grad_norm": 0.015262553468346596, + "learning_rate": 2.6133519799793905e-05, + "loss": 0.0007, + "step": 15990 + }, + { + "epoch": 38.0, + "grad_norm": 0.006925337016582489, + "learning_rate": 2.6115118504342705e-05, + "loss": 0.0034, + "step": 16000 + }, + { + "epoch": 38.0, + "grad_norm": 0.012639672495424747, + "learning_rate": 2.6096717208891508e-05, + "loss": 0.0025, + "step": 16010 + }, + { + "epoch": 38.0, + "grad_norm": 2.9875874519348145, + "learning_rate": 2.607831591344031e-05, + "loss": 0.0021, + "step": 16020 + }, + { + "epoch": 38.0, + "grad_norm": 0.013625150546431541, + "learning_rate": 2.6059914617989105e-05, + "loss": 0.0067, + "step": 16030 + }, + { + "epoch": 38.0, + "grad_norm": 5.456194877624512, + "learning_rate": 2.604151332253791e-05, + "loss": 0.0573, + "step": 16040 + }, + { + "epoch": 38.0, + "grad_norm": 25.647403717041016, + "learning_rate": 2.602311202708671e-05, + "loss": 0.0657, + "step": 16050 + }, + { + "epoch": 38.0, + "grad_norm": 0.0463079996407032, + "learning_rate": 2.6004710731635505e-05, + "loss": 0.0213, + "step": 16060 + }, + { + "epoch": 38.0, + "grad_norm": 0.02664658986032009, + "learning_rate": 2.598630943618431e-05, + "loss": 0.0016, + "step": 16070 + }, + { + "epoch": 38.0, + "grad_norm": 1.7586325407028198, + "learning_rate": 2.596790814073311e-05, + "loss": 0.0569, + "step": 16080 + }, + { + "epoch": 38.0, + "grad_norm": 0.015104389749467373, + "learning_rate": 2.5949506845281906e-05, + "loss": 0.1198, + "step": 16090 + }, + { + "epoch": 38.0, + "grad_norm": 0.009908678941428661, + "learning_rate": 2.593110554983071e-05, + "loss": 0.1008, + "step": 16100 + }, + { + "epoch": 38.0, + "grad_norm": 23.33299446105957, + "learning_rate": 2.591270425437951e-05, + "loss": 0.0304, + "step": 16110 + }, + { + "epoch": 38.01, + "grad_norm": 0.06382476538419724, + "learning_rate": 2.5894302958928306e-05, + "loss": 0.1119, + "step": 16120 + }, + { + "epoch": 38.01, + "grad_norm": 0.05750289559364319, + "learning_rate": 2.587590166347711e-05, + "loss": 0.0004, + "step": 16130 + }, + { + "epoch": 38.01, + "grad_norm": 0.06110772490501404, + "learning_rate": 2.585750036802591e-05, + "loss": 0.0007, + "step": 16140 + }, + { + "epoch": 38.01, + "grad_norm": 0.0023573378566652536, + "learning_rate": 2.5839099072574713e-05, + "loss": 0.031, + "step": 16150 + }, + { + "epoch": 38.01, + "grad_norm": 0.015254752710461617, + "learning_rate": 2.582069777712351e-05, + "loss": 0.0157, + "step": 16160 + }, + { + "epoch": 38.01, + "grad_norm": 0.05136784166097641, + "learning_rate": 2.580229648167231e-05, + "loss": 0.0594, + "step": 16170 + }, + { + "epoch": 38.01, + "grad_norm": 0.004741484299302101, + "learning_rate": 2.5783895186221114e-05, + "loss": 0.0053, + "step": 16180 + }, + { + "epoch": 38.01, + "grad_norm": 0.027769001200795174, + "learning_rate": 2.576549389076991e-05, + "loss": 0.0471, + "step": 16190 + }, + { + "epoch": 38.01, + "grad_norm": 0.0037545578088611364, + "learning_rate": 2.574709259531871e-05, + "loss": 0.0274, + "step": 16200 + }, + { + "epoch": 38.01, + "grad_norm": 2.7802109718322754, + "learning_rate": 2.5728691299867514e-05, + "loss": 0.0447, + "step": 16210 + }, + { + "epoch": 38.01, + "grad_norm": 0.006928629241883755, + "learning_rate": 2.571029000441631e-05, + "loss": 0.0005, + "step": 16220 + }, + { + "epoch": 38.01, + "grad_norm": 0.029513835906982422, + "learning_rate": 2.569188870896511e-05, + "loss": 0.0168, + "step": 16230 + }, + { + "epoch": 38.01, + "grad_norm": 0.002607174916192889, + "learning_rate": 2.5673487413513914e-05, + "loss": 0.0006, + "step": 16240 + }, + { + "epoch": 38.01, + "grad_norm": 0.034030377864837646, + "learning_rate": 2.5655086118062714e-05, + "loss": 0.0126, + "step": 16250 + }, + { + "epoch": 38.01, + "grad_norm": 2.631910562515259, + "learning_rate": 2.563668482261151e-05, + "loss": 0.019, + "step": 16260 + }, + { + "epoch": 38.01, + "grad_norm": 0.061333347111940384, + "learning_rate": 2.5618283527160315e-05, + "loss": 0.0005, + "step": 16270 + }, + { + "epoch": 38.01, + "grad_norm": 0.33661890029907227, + "learning_rate": 2.5599882231709115e-05, + "loss": 0.0435, + "step": 16280 + }, + { + "epoch": 38.01, + "grad_norm": 0.032611507922410965, + "learning_rate": 2.558148093625791e-05, + "loss": 0.0224, + "step": 16290 + }, + { + "epoch": 38.01, + "grad_norm": 0.0037321383133530617, + "learning_rate": 2.5563079640806715e-05, + "loss": 0.0165, + "step": 16300 + }, + { + "epoch": 38.01, + "grad_norm": 124.54664611816406, + "learning_rate": 2.5544678345355515e-05, + "loss": 0.0152, + "step": 16310 + }, + { + "epoch": 38.01, + "grad_norm": 0.03767447546124458, + "learning_rate": 2.5526277049904312e-05, + "loss": 0.0946, + "step": 16320 + }, + { + "epoch": 38.01, + "grad_norm": 0.0054799229837954044, + "learning_rate": 2.5507875754453115e-05, + "loss": 0.1248, + "step": 16330 + }, + { + "epoch": 38.01, + "grad_norm": 0.002836138242855668, + "learning_rate": 2.5489474459001915e-05, + "loss": 0.0002, + "step": 16340 + }, + { + "epoch": 38.01, + "grad_norm": 4.855489730834961, + "learning_rate": 2.5471073163550712e-05, + "loss": 0.063, + "step": 16350 + }, + { + "epoch": 38.01, + "grad_norm": 0.013726359233260155, + "learning_rate": 2.5452671868099516e-05, + "loss": 0.027, + "step": 16360 + }, + { + "epoch": 38.01, + "grad_norm": 0.7091811299324036, + "learning_rate": 2.5434270572648316e-05, + "loss": 0.1472, + "step": 16370 + }, + { + "epoch": 38.01, + "grad_norm": 0.004487201105803251, + "learning_rate": 2.541586927719712e-05, + "loss": 0.0007, + "step": 16380 + }, + { + "epoch": 38.01, + "eval_accuracy": 0.7072072072072072, + "eval_loss": 1.9136948585510254, + "eval_runtime": 38.9937, + "eval_samples_per_second": 22.773, + "eval_steps_per_second": 1.898, + "step": 16380 + }, + { + "epoch": 39.0, + "grad_norm": 9.85066032409668, + "learning_rate": 2.5397467981745916e-05, + "loss": 0.0349, + "step": 16390 + }, + { + "epoch": 39.0, + "grad_norm": 0.1230677142739296, + "learning_rate": 2.5379066686294716e-05, + "loss": 0.0003, + "step": 16400 + }, + { + "epoch": 39.0, + "grad_norm": 0.010495364665985107, + "learning_rate": 2.536066539084352e-05, + "loss": 0.0453, + "step": 16410 + }, + { + "epoch": 39.0, + "grad_norm": 0.0022531235590577126, + "learning_rate": 2.5342264095392316e-05, + "loss": 0.0454, + "step": 16420 + }, + { + "epoch": 39.0, + "grad_norm": 46.67967987060547, + "learning_rate": 2.5323862799941116e-05, + "loss": 0.1218, + "step": 16430 + }, + { + "epoch": 39.0, + "grad_norm": 0.027552777901291847, + "learning_rate": 2.530546150448992e-05, + "loss": 0.0181, + "step": 16440 + }, + { + "epoch": 39.0, + "grad_norm": 0.18322528898715973, + "learning_rate": 2.5287060209038717e-05, + "loss": 0.0005, + "step": 16450 + }, + { + "epoch": 39.0, + "grad_norm": 1.810534119606018, + "learning_rate": 2.5268658913587517e-05, + "loss": 0.0027, + "step": 16460 + }, + { + "epoch": 39.0, + "grad_norm": 0.05370220169425011, + "learning_rate": 2.525025761813632e-05, + "loss": 0.0217, + "step": 16470 + }, + { + "epoch": 39.0, + "grad_norm": 0.008379046805202961, + "learning_rate": 2.5231856322685117e-05, + "loss": 0.0405, + "step": 16480 + }, + { + "epoch": 39.0, + "grad_norm": 0.004804358817636967, + "learning_rate": 2.5213455027233917e-05, + "loss": 0.0448, + "step": 16490 + }, + { + "epoch": 39.0, + "grad_norm": 0.009872007183730602, + "learning_rate": 2.519505373178272e-05, + "loss": 0.0058, + "step": 16500 + }, + { + "epoch": 39.0, + "grad_norm": 0.0027543141040951014, + "learning_rate": 2.517665243633152e-05, + "loss": 0.0384, + "step": 16510 + }, + { + "epoch": 39.0, + "grad_norm": 0.0008711799746379256, + "learning_rate": 2.5158251140880317e-05, + "loss": 0.0007, + "step": 16520 + }, + { + "epoch": 39.0, + "grad_norm": 3.9537038803100586, + "learning_rate": 2.513984984542912e-05, + "loss": 0.0887, + "step": 16530 + }, + { + "epoch": 39.01, + "grad_norm": 0.0022325078025460243, + "learning_rate": 2.512144854997792e-05, + "loss": 0.0005, + "step": 16540 + }, + { + "epoch": 39.01, + "grad_norm": 0.014836194925010204, + "learning_rate": 2.5103047254526718e-05, + "loss": 0.0028, + "step": 16550 + }, + { + "epoch": 39.01, + "grad_norm": 0.3858121335506439, + "learning_rate": 2.508464595907552e-05, + "loss": 0.0892, + "step": 16560 + }, + { + "epoch": 39.01, + "grad_norm": 0.01734367199242115, + "learning_rate": 2.506624466362432e-05, + "loss": 0.0148, + "step": 16570 + }, + { + "epoch": 39.01, + "grad_norm": 0.00928126834332943, + "learning_rate": 2.5047843368173118e-05, + "loss": 0.0977, + "step": 16580 + }, + { + "epoch": 39.01, + "grad_norm": 0.005387528333812952, + "learning_rate": 2.502944207272192e-05, + "loss": 0.0325, + "step": 16590 + }, + { + "epoch": 39.01, + "grad_norm": 11.179068565368652, + "learning_rate": 2.5011040777270722e-05, + "loss": 0.0623, + "step": 16600 + }, + { + "epoch": 39.01, + "grad_norm": 0.06299767643213272, + "learning_rate": 2.4992639481819522e-05, + "loss": 0.0917, + "step": 16610 + }, + { + "epoch": 39.01, + "grad_norm": 0.013811836019158363, + "learning_rate": 2.4974238186368322e-05, + "loss": 0.0981, + "step": 16620 + }, + { + "epoch": 39.01, + "grad_norm": 0.037350479513406754, + "learning_rate": 2.4955836890917122e-05, + "loss": 0.0119, + "step": 16630 + }, + { + "epoch": 39.01, + "grad_norm": 0.34366223216056824, + "learning_rate": 2.4937435595465922e-05, + "loss": 0.0972, + "step": 16640 + }, + { + "epoch": 39.01, + "grad_norm": 0.2959058880805969, + "learning_rate": 2.4919034300014722e-05, + "loss": 0.0052, + "step": 16650 + }, + { + "epoch": 39.01, + "grad_norm": 0.041184067726135254, + "learning_rate": 2.4900633004563522e-05, + "loss": 0.0447, + "step": 16660 + }, + { + "epoch": 39.01, + "grad_norm": 0.00951891764998436, + "learning_rate": 2.4882231709112323e-05, + "loss": 0.007, + "step": 16670 + }, + { + "epoch": 39.01, + "grad_norm": 0.020858224481344223, + "learning_rate": 2.4863830413661123e-05, + "loss": 0.0467, + "step": 16680 + }, + { + "epoch": 39.01, + "grad_norm": 0.05100645124912262, + "learning_rate": 2.4845429118209923e-05, + "loss": 0.0731, + "step": 16690 + }, + { + "epoch": 39.01, + "grad_norm": 0.005188668146729469, + "learning_rate": 2.4827027822758723e-05, + "loss": 0.0587, + "step": 16700 + }, + { + "epoch": 39.01, + "grad_norm": 0.008620868436992168, + "learning_rate": 2.4808626527307523e-05, + "loss": 0.0281, + "step": 16710 + }, + { + "epoch": 39.01, + "grad_norm": 0.014651943929493427, + "learning_rate": 2.4790225231856323e-05, + "loss": 0.0893, + "step": 16720 + }, + { + "epoch": 39.01, + "grad_norm": 0.18780621886253357, + "learning_rate": 2.4771823936405127e-05, + "loss": 0.0099, + "step": 16730 + }, + { + "epoch": 39.01, + "grad_norm": 0.015461204573512077, + "learning_rate": 2.4753422640953923e-05, + "loss": 0.0778, + "step": 16740 + }, + { + "epoch": 39.01, + "grad_norm": 0.245300754904747, + "learning_rate": 2.4735021345502724e-05, + "loss": 0.0006, + "step": 16750 + }, + { + "epoch": 39.01, + "grad_norm": 0.094014972448349, + "learning_rate": 2.4716620050051527e-05, + "loss": 0.0009, + "step": 16760 + }, + { + "epoch": 39.01, + "grad_norm": 0.003981316927820444, + "learning_rate": 2.4698218754600324e-05, + "loss": 0.0896, + "step": 16770 + }, + { + "epoch": 39.01, + "grad_norm": 3.2442426681518555, + "learning_rate": 2.4679817459149127e-05, + "loss": 0.0009, + "step": 16780 + }, + { + "epoch": 39.01, + "grad_norm": 0.0036344637628644705, + "learning_rate": 2.4661416163697927e-05, + "loss": 0.0018, + "step": 16790 + }, + { + "epoch": 39.01, + "grad_norm": 0.002372809685766697, + "learning_rate": 2.4643014868246724e-05, + "loss": 0.0602, + "step": 16800 + }, + { + "epoch": 39.01, + "eval_accuracy": 0.7308558558558559, + "eval_loss": 1.617972731590271, + "eval_runtime": 38.5744, + "eval_samples_per_second": 23.02, + "eval_steps_per_second": 1.918, + "step": 16800 + }, + { + "epoch": 40.0, + "grad_norm": 0.004174708854407072, + "learning_rate": 2.4624613572795528e-05, + "loss": 0.0004, + "step": 16810 + }, + { + "epoch": 40.0, + "grad_norm": 0.009840855374932289, + "learning_rate": 2.4606212277344324e-05, + "loss": 0.0135, + "step": 16820 + }, + { + "epoch": 40.0, + "grad_norm": 0.013369137421250343, + "learning_rate": 2.4587810981893124e-05, + "loss": 0.0318, + "step": 16830 + }, + { + "epoch": 40.0, + "grad_norm": 0.0507052019238472, + "learning_rate": 2.4569409686441928e-05, + "loss": 0.0004, + "step": 16840 + }, + { + "epoch": 40.0, + "grad_norm": 0.04035002738237381, + "learning_rate": 2.4551008390990725e-05, + "loss": 0.0084, + "step": 16850 + }, + { + "epoch": 40.0, + "grad_norm": 0.11788014322519302, + "learning_rate": 2.4532607095539528e-05, + "loss": 0.0253, + "step": 16860 + }, + { + "epoch": 40.0, + "grad_norm": 0.003910732455551624, + "learning_rate": 2.451420580008833e-05, + "loss": 0.0002, + "step": 16870 + }, + { + "epoch": 40.0, + "grad_norm": 0.7661402225494385, + "learning_rate": 2.4495804504637125e-05, + "loss": 0.0299, + "step": 16880 + }, + { + "epoch": 40.0, + "grad_norm": 0.026837226003408432, + "learning_rate": 2.447740320918593e-05, + "loss": 0.0003, + "step": 16890 + }, + { + "epoch": 40.0, + "grad_norm": 13.870619773864746, + "learning_rate": 2.445900191373473e-05, + "loss": 0.0911, + "step": 16900 + }, + { + "epoch": 40.0, + "grad_norm": 0.41090813279151917, + "learning_rate": 2.444060061828353e-05, + "loss": 0.0368, + "step": 16910 + }, + { + "epoch": 40.0, + "grad_norm": 0.0015127554070204496, + "learning_rate": 2.442219932283233e-05, + "loss": 0.0339, + "step": 16920 + }, + { + "epoch": 40.0, + "grad_norm": 0.05826074630022049, + "learning_rate": 2.440379802738113e-05, + "loss": 0.0242, + "step": 16930 + }, + { + "epoch": 40.0, + "grad_norm": 0.01771511137485504, + "learning_rate": 2.438539673192993e-05, + "loss": 0.0498, + "step": 16940 + }, + { + "epoch": 40.0, + "grad_norm": 0.03588107228279114, + "learning_rate": 2.436699543647873e-05, + "loss": 0.0668, + "step": 16950 + }, + { + "epoch": 40.01, + "grad_norm": 0.004284579772502184, + "learning_rate": 2.434859414102753e-05, + "loss": 0.0873, + "step": 16960 + }, + { + "epoch": 40.01, + "grad_norm": 13.748348236083984, + "learning_rate": 2.433019284557633e-05, + "loss": 0.0038, + "step": 16970 + }, + { + "epoch": 40.01, + "grad_norm": 0.025629336014389992, + "learning_rate": 2.431179155012513e-05, + "loss": 0.001, + "step": 16980 + }, + { + "epoch": 40.01, + "grad_norm": 0.0758618637919426, + "learning_rate": 2.429339025467393e-05, + "loss": 0.0006, + "step": 16990 + }, + { + "epoch": 40.01, + "grad_norm": 0.004732039291411638, + "learning_rate": 2.427498895922273e-05, + "loss": 0.054, + "step": 17000 + }, + { + "epoch": 40.01, + "grad_norm": 0.05720449239015579, + "learning_rate": 2.425658766377153e-05, + "loss": 0.0007, + "step": 17010 + }, + { + "epoch": 40.01, + "grad_norm": 0.14128656685352325, + "learning_rate": 2.423818636832033e-05, + "loss": 0.0004, + "step": 17020 + }, + { + "epoch": 40.01, + "grad_norm": 0.0036420163232833147, + "learning_rate": 2.421978507286913e-05, + "loss": 0.034, + "step": 17030 + }, + { + "epoch": 40.01, + "grad_norm": 0.09513210505247116, + "learning_rate": 2.4201383777417934e-05, + "loss": 0.0915, + "step": 17040 + }, + { + "epoch": 40.01, + "grad_norm": 0.025200609117746353, + "learning_rate": 2.418298248196673e-05, + "loss": 0.1074, + "step": 17050 + }, + { + "epoch": 40.01, + "grad_norm": 65.16971588134766, + "learning_rate": 2.416458118651553e-05, + "loss": 0.0952, + "step": 17060 + }, + { + "epoch": 40.01, + "grad_norm": 0.010291090235114098, + "learning_rate": 2.4146179891064334e-05, + "loss": 0.0003, + "step": 17070 + }, + { + "epoch": 40.01, + "grad_norm": 0.7548648715019226, + "learning_rate": 2.412777859561313e-05, + "loss": 0.0522, + "step": 17080 + }, + { + "epoch": 40.01, + "grad_norm": 0.024852802976965904, + "learning_rate": 2.4109377300161934e-05, + "loss": 0.0224, + "step": 17090 + }, + { + "epoch": 40.01, + "grad_norm": 10.804015159606934, + "learning_rate": 2.4090976004710734e-05, + "loss": 0.1196, + "step": 17100 + }, + { + "epoch": 40.01, + "grad_norm": 19.513050079345703, + "learning_rate": 2.407257470925953e-05, + "loss": 0.0707, + "step": 17110 + }, + { + "epoch": 40.01, + "grad_norm": 0.014289339073002338, + "learning_rate": 2.4054173413808335e-05, + "loss": 0.0865, + "step": 17120 + }, + { + "epoch": 40.01, + "grad_norm": 37.88726806640625, + "learning_rate": 2.4035772118357135e-05, + "loss": 0.0773, + "step": 17130 + }, + { + "epoch": 40.01, + "grad_norm": 0.6281788349151611, + "learning_rate": 2.401737082290593e-05, + "loss": 0.0007, + "step": 17140 + }, + { + "epoch": 40.01, + "grad_norm": 0.09738222509622574, + "learning_rate": 2.3998969527454735e-05, + "loss": 0.0008, + "step": 17150 + }, + { + "epoch": 40.01, + "grad_norm": 0.0028530398849397898, + "learning_rate": 2.3980568232003535e-05, + "loss": 0.0301, + "step": 17160 + }, + { + "epoch": 40.01, + "grad_norm": 11.567769050598145, + "learning_rate": 2.3962166936552335e-05, + "loss": 0.0078, + "step": 17170 + }, + { + "epoch": 40.01, + "grad_norm": 0.010698237456381321, + "learning_rate": 2.3943765641101135e-05, + "loss": 0.0003, + "step": 17180 + }, + { + "epoch": 40.01, + "grad_norm": 0.0026003606617450714, + "learning_rate": 2.3925364345649935e-05, + "loss": 0.0827, + "step": 17190 + }, + { + "epoch": 40.01, + "grad_norm": 0.004792836960405111, + "learning_rate": 2.3906963050198736e-05, + "loss": 0.0963, + "step": 17200 + }, + { + "epoch": 40.01, + "grad_norm": 0.004160434473305941, + "learning_rate": 2.3888561754747536e-05, + "loss": 0.0004, + "step": 17210 + }, + { + "epoch": 40.01, + "grad_norm": 0.0015712743625044823, + "learning_rate": 2.3870160459296336e-05, + "loss": 0.0977, + "step": 17220 + }, + { + "epoch": 40.01, + "eval_accuracy": 0.7353603603603603, + "eval_loss": 1.5709514617919922, + "eval_runtime": 188.7609, + "eval_samples_per_second": 4.704, + "eval_steps_per_second": 0.392, + "step": 17220 + }, + { + "epoch": 41.0, + "grad_norm": 6.6910247802734375, + "learning_rate": 2.3851759163845136e-05, + "loss": 0.0501, + "step": 17230 + }, + { + "epoch": 41.0, + "grad_norm": 0.0936957523226738, + "learning_rate": 2.3833357868393936e-05, + "loss": 0.0745, + "step": 17240 + }, + { + "epoch": 41.0, + "grad_norm": 44.87736892700195, + "learning_rate": 2.3814956572942736e-05, + "loss": 0.0959, + "step": 17250 + }, + { + "epoch": 41.0, + "grad_norm": 0.6046668291091919, + "learning_rate": 2.3796555277491536e-05, + "loss": 0.0053, + "step": 17260 + }, + { + "epoch": 41.0, + "grad_norm": 0.00754895992577076, + "learning_rate": 2.3778153982040336e-05, + "loss": 0.0051, + "step": 17270 + }, + { + "epoch": 41.0, + "grad_norm": 0.04283663257956505, + "learning_rate": 2.3759752686589136e-05, + "loss": 0.1071, + "step": 17280 + }, + { + "epoch": 41.0, + "grad_norm": 0.02367532253265381, + "learning_rate": 2.3741351391137937e-05, + "loss": 0.0285, + "step": 17290 + }, + { + "epoch": 41.0, + "grad_norm": 0.08702404052019119, + "learning_rate": 2.372295009568674e-05, + "loss": 0.0055, + "step": 17300 + }, + { + "epoch": 41.0, + "grad_norm": 0.0050528873689472675, + "learning_rate": 2.3704548800235537e-05, + "loss": 0.0553, + "step": 17310 + }, + { + "epoch": 41.0, + "grad_norm": 0.03083922155201435, + "learning_rate": 2.3686147504784337e-05, + "loss": 0.0396, + "step": 17320 + }, + { + "epoch": 41.0, + "grad_norm": 0.008288837969303131, + "learning_rate": 2.3667746209333137e-05, + "loss": 0.012, + "step": 17330 + }, + { + "epoch": 41.0, + "grad_norm": 1.1453498601913452, + "learning_rate": 2.3649344913881937e-05, + "loss": 0.0006, + "step": 17340 + }, + { + "epoch": 41.0, + "grad_norm": 0.010078281164169312, + "learning_rate": 2.363094361843074e-05, + "loss": 0.0316, + "step": 17350 + }, + { + "epoch": 41.0, + "grad_norm": 18.37259864807129, + "learning_rate": 2.3612542322979537e-05, + "loss": 0.0638, + "step": 17360 + }, + { + "epoch": 41.0, + "grad_norm": 0.00207115919329226, + "learning_rate": 2.3594141027528337e-05, + "loss": 0.0002, + "step": 17370 + }, + { + "epoch": 41.01, + "grad_norm": 0.03447471559047699, + "learning_rate": 2.357573973207714e-05, + "loss": 0.047, + "step": 17380 + }, + { + "epoch": 41.01, + "grad_norm": 0.02097044140100479, + "learning_rate": 2.3557338436625938e-05, + "loss": 0.0003, + "step": 17390 + }, + { + "epoch": 41.01, + "grad_norm": 0.001427238341420889, + "learning_rate": 2.353893714117474e-05, + "loss": 0.0122, + "step": 17400 + }, + { + "epoch": 41.01, + "grad_norm": 0.0013924982631579041, + "learning_rate": 2.352053584572354e-05, + "loss": 0.0134, + "step": 17410 + }, + { + "epoch": 41.01, + "grad_norm": 0.22463291883468628, + "learning_rate": 2.3502134550272338e-05, + "loss": 0.0424, + "step": 17420 + }, + { + "epoch": 41.01, + "grad_norm": 0.1324460357427597, + "learning_rate": 2.348373325482114e-05, + "loss": 0.0004, + "step": 17430 + }, + { + "epoch": 41.01, + "grad_norm": 0.015974344685673714, + "learning_rate": 2.346533195936994e-05, + "loss": 0.022, + "step": 17440 + }, + { + "epoch": 41.01, + "grad_norm": 0.0521487221121788, + "learning_rate": 2.344693066391874e-05, + "loss": 0.0183, + "step": 17450 + }, + { + "epoch": 41.01, + "grad_norm": 1.8123937845230103, + "learning_rate": 2.3428529368467542e-05, + "loss": 0.0009, + "step": 17460 + }, + { + "epoch": 41.01, + "grad_norm": 0.06810711324214935, + "learning_rate": 2.3410128073016342e-05, + "loss": 0.0933, + "step": 17470 + }, + { + "epoch": 41.01, + "grad_norm": 0.24335838854312897, + "learning_rate": 2.3391726777565142e-05, + "loss": 0.0208, + "step": 17480 + }, + { + "epoch": 41.01, + "grad_norm": 0.029186120256781578, + "learning_rate": 2.3373325482113942e-05, + "loss": 0.0566, + "step": 17490 + }, + { + "epoch": 41.01, + "grad_norm": 7.900701999664307, + "learning_rate": 2.3354924186662742e-05, + "loss": 0.0158, + "step": 17500 + }, + { + "epoch": 41.01, + "grad_norm": 0.007547201123088598, + "learning_rate": 2.3336522891211542e-05, + "loss": 0.1573, + "step": 17510 + }, + { + "epoch": 41.01, + "grad_norm": 0.07081840187311172, + "learning_rate": 2.3318121595760343e-05, + "loss": 0.011, + "step": 17520 + }, + { + "epoch": 41.01, + "grad_norm": 3.3901169300079346, + "learning_rate": 2.3299720300309143e-05, + "loss": 0.0747, + "step": 17530 + }, + { + "epoch": 41.01, + "grad_norm": 15.250762939453125, + "learning_rate": 2.3281319004857943e-05, + "loss": 0.0037, + "step": 17540 + }, + { + "epoch": 41.01, + "grad_norm": 0.00895916298031807, + "learning_rate": 2.3262917709406743e-05, + "loss": 0.0028, + "step": 17550 + }, + { + "epoch": 41.01, + "grad_norm": 0.008652539923787117, + "learning_rate": 2.3244516413955543e-05, + "loss": 0.0022, + "step": 17560 + }, + { + "epoch": 41.01, + "grad_norm": 0.004491760861128569, + "learning_rate": 2.3226115118504343e-05, + "loss": 0.051, + "step": 17570 + }, + { + "epoch": 41.01, + "grad_norm": 0.0023870787117630243, + "learning_rate": 2.3207713823053143e-05, + "loss": 0.1203, + "step": 17580 + }, + { + "epoch": 41.01, + "grad_norm": 0.007122043985873461, + "learning_rate": 2.3189312527601943e-05, + "loss": 0.0005, + "step": 17590 + }, + { + "epoch": 41.01, + "grad_norm": 0.003632687497884035, + "learning_rate": 2.3170911232150744e-05, + "loss": 0.0662, + "step": 17600 + }, + { + "epoch": 41.01, + "grad_norm": 5.689450263977051, + "learning_rate": 2.3152509936699547e-05, + "loss": 0.0521, + "step": 17610 + }, + { + "epoch": 41.01, + "grad_norm": 12.24816608428955, + "learning_rate": 2.3134108641248344e-05, + "loss": 0.0515, + "step": 17620 + }, + { + "epoch": 41.01, + "grad_norm": 10.62009334564209, + "learning_rate": 2.3115707345797144e-05, + "loss": 0.0531, + "step": 17630 + }, + { + "epoch": 41.01, + "grad_norm": 0.02156691811978817, + "learning_rate": 2.3097306050345947e-05, + "loss": 0.0606, + "step": 17640 + }, + { + "epoch": 41.01, + "eval_accuracy": 0.7342342342342343, + "eval_loss": 1.3908066749572754, + "eval_runtime": 193.3149, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 0.383, + "step": 17640 + }, + { + "epoch": 42.0, + "grad_norm": 0.056538455188274384, + "learning_rate": 2.3078904754894744e-05, + "loss": 0.0007, + "step": 17650 + }, + { + "epoch": 42.0, + "grad_norm": 0.012973904609680176, + "learning_rate": 2.3060503459443548e-05, + "loss": 0.0648, + "step": 17660 + }, + { + "epoch": 42.0, + "grad_norm": 0.43850526213645935, + "learning_rate": 2.3042102163992348e-05, + "loss": 0.0018, + "step": 17670 + }, + { + "epoch": 42.0, + "grad_norm": 12.316707611083984, + "learning_rate": 2.3023700868541144e-05, + "loss": 0.0728, + "step": 17680 + }, + { + "epoch": 42.0, + "grad_norm": 0.034915756434202194, + "learning_rate": 2.3005299573089948e-05, + "loss": 0.0126, + "step": 17690 + }, + { + "epoch": 42.0, + "grad_norm": 14.699057579040527, + "learning_rate": 2.2986898277638748e-05, + "loss": 0.0525, + "step": 17700 + }, + { + "epoch": 42.0, + "grad_norm": 0.06873279809951782, + "learning_rate": 2.2968496982187548e-05, + "loss": 0.0026, + "step": 17710 + }, + { + "epoch": 42.0, + "grad_norm": 0.032149434089660645, + "learning_rate": 2.2950095686736348e-05, + "loss": 0.0307, + "step": 17720 + }, + { + "epoch": 42.0, + "grad_norm": 0.002019342966377735, + "learning_rate": 2.293169439128515e-05, + "loss": 0.0006, + "step": 17730 + }, + { + "epoch": 42.0, + "grad_norm": 0.028612077236175537, + "learning_rate": 2.291329309583395e-05, + "loss": 0.0774, + "step": 17740 + }, + { + "epoch": 42.0, + "grad_norm": 0.005914296023547649, + "learning_rate": 2.289489180038275e-05, + "loss": 0.0439, + "step": 17750 + }, + { + "epoch": 42.0, + "grad_norm": 0.006604051683098078, + "learning_rate": 2.2876490504931545e-05, + "loss": 0.0078, + "step": 17760 + }, + { + "epoch": 42.0, + "grad_norm": 0.3514362573623657, + "learning_rate": 2.285808920948035e-05, + "loss": 0.0004, + "step": 17770 + }, + { + "epoch": 42.0, + "grad_norm": 0.6133325099945068, + "learning_rate": 2.283968791402915e-05, + "loss": 0.002, + "step": 17780 + }, + { + "epoch": 42.0, + "grad_norm": 34.64471435546875, + "learning_rate": 2.282128661857795e-05, + "loss": 0.1041, + "step": 17790 + }, + { + "epoch": 42.01, + "grad_norm": 0.014041568152606487, + "learning_rate": 2.280288532312675e-05, + "loss": 0.0008, + "step": 17800 + }, + { + "epoch": 42.01, + "grad_norm": 0.03229415416717529, + "learning_rate": 2.278448402767555e-05, + "loss": 0.036, + "step": 17810 + }, + { + "epoch": 42.01, + "grad_norm": 0.004131761845201254, + "learning_rate": 2.276608273222435e-05, + "loss": 0.0136, + "step": 17820 + }, + { + "epoch": 42.01, + "grad_norm": 0.006716595031321049, + "learning_rate": 2.274768143677315e-05, + "loss": 0.0645, + "step": 17830 + }, + { + "epoch": 42.01, + "grad_norm": 10.16481876373291, + "learning_rate": 2.2729280141321953e-05, + "loss": 0.0539, + "step": 17840 + }, + { + "epoch": 42.01, + "grad_norm": 0.02316008321940899, + "learning_rate": 2.271087884587075e-05, + "loss": 0.0512, + "step": 17850 + }, + { + "epoch": 42.01, + "grad_norm": 0.31423234939575195, + "learning_rate": 2.269247755041955e-05, + "loss": 0.0007, + "step": 17860 + }, + { + "epoch": 42.01, + "grad_norm": 0.010419754311442375, + "learning_rate": 2.267407625496835e-05, + "loss": 0.0003, + "step": 17870 + }, + { + "epoch": 42.01, + "grad_norm": 0.003867323510348797, + "learning_rate": 2.265567495951715e-05, + "loss": 0.001, + "step": 17880 + }, + { + "epoch": 42.01, + "grad_norm": 0.007264145649969578, + "learning_rate": 2.263727366406595e-05, + "loss": 0.0104, + "step": 17890 + }, + { + "epoch": 42.01, + "grad_norm": 0.06619902700185776, + "learning_rate": 2.261887236861475e-05, + "loss": 0.0032, + "step": 17900 + }, + { + "epoch": 42.01, + "grad_norm": 0.0021775520872324705, + "learning_rate": 2.260047107316355e-05, + "loss": 0.001, + "step": 17910 + }, + { + "epoch": 42.01, + "grad_norm": 0.032563529908657074, + "learning_rate": 2.2582069777712354e-05, + "loss": 0.0634, + "step": 17920 + }, + { + "epoch": 42.01, + "grad_norm": 0.010027103126049042, + "learning_rate": 2.256366848226115e-05, + "loss": 0.0072, + "step": 17930 + }, + { + "epoch": 42.01, + "grad_norm": 0.003468131646513939, + "learning_rate": 2.254526718680995e-05, + "loss": 0.0004, + "step": 17940 + }, + { + "epoch": 42.01, + "grad_norm": 0.0016712818760424852, + "learning_rate": 2.2526865891358754e-05, + "loss": 0.0016, + "step": 17950 + }, + { + "epoch": 42.01, + "grad_norm": 0.004305675625801086, + "learning_rate": 2.250846459590755e-05, + "loss": 0.0005, + "step": 17960 + }, + { + "epoch": 42.01, + "grad_norm": 0.0015782952541485429, + "learning_rate": 2.2490063300456355e-05, + "loss": 0.0002, + "step": 17970 + }, + { + "epoch": 42.01, + "grad_norm": 0.024773990735411644, + "learning_rate": 2.2471662005005155e-05, + "loss": 0.0228, + "step": 17980 + }, + { + "epoch": 42.01, + "grad_norm": 0.0037998452316969633, + "learning_rate": 2.245326070955395e-05, + "loss": 0.0525, + "step": 17990 + }, + { + "epoch": 42.01, + "grad_norm": 0.003529587760567665, + "learning_rate": 2.2434859414102755e-05, + "loss": 0.0001, + "step": 18000 + }, + { + "epoch": 42.01, + "grad_norm": 0.008061232976615429, + "learning_rate": 2.2416458118651555e-05, + "loss": 0.0019, + "step": 18010 + }, + { + "epoch": 42.01, + "grad_norm": 0.007496473845094442, + "learning_rate": 2.2398056823200355e-05, + "loss": 0.0001, + "step": 18020 + }, + { + "epoch": 42.01, + "grad_norm": 0.01914142817258835, + "learning_rate": 2.2379655527749155e-05, + "loss": 0.0379, + "step": 18030 + }, + { + "epoch": 42.01, + "grad_norm": 0.08634476363658905, + "learning_rate": 2.2361254232297955e-05, + "loss": 0.1079, + "step": 18040 + }, + { + "epoch": 42.01, + "grad_norm": 0.006580962333828211, + "learning_rate": 2.2342852936846755e-05, + "loss": 0.0002, + "step": 18050 + }, + { + "epoch": 42.01, + "grad_norm": 0.005276253912597895, + "learning_rate": 2.2324451641395556e-05, + "loss": 0.1046, + "step": 18060 + }, + { + "epoch": 42.01, + "eval_accuracy": 0.7252252252252253, + "eval_loss": 1.7845572233200073, + "eval_runtime": 190.8962, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.388, + "step": 18060 + }, + { + "epoch": 43.0, + "grad_norm": 0.0017791197169572115, + "learning_rate": 2.2306050345944356e-05, + "loss": 0.0002, + "step": 18070 + }, + { + "epoch": 43.0, + "grad_norm": 0.03276696428656578, + "learning_rate": 2.2287649050493156e-05, + "loss": 0.0003, + "step": 18080 + }, + { + "epoch": 43.0, + "grad_norm": 0.01086380984634161, + "learning_rate": 2.2269247755041956e-05, + "loss": 0.0005, + "step": 18090 + }, + { + "epoch": 43.0, + "grad_norm": 0.006562290713191032, + "learning_rate": 2.2250846459590756e-05, + "loss": 0.0252, + "step": 18100 + }, + { + "epoch": 43.0, + "grad_norm": 0.0010953620076179504, + "learning_rate": 2.2232445164139556e-05, + "loss": 0.0096, + "step": 18110 + }, + { + "epoch": 43.0, + "grad_norm": 0.0018935714615508914, + "learning_rate": 2.2214043868688356e-05, + "loss": 0.0011, + "step": 18120 + }, + { + "epoch": 43.0, + "grad_norm": 0.005813705734908581, + "learning_rate": 2.2195642573237156e-05, + "loss": 0.1357, + "step": 18130 + }, + { + "epoch": 43.0, + "grad_norm": 0.004509144462645054, + "learning_rate": 2.2177241277785957e-05, + "loss": 0.0003, + "step": 18140 + }, + { + "epoch": 43.0, + "grad_norm": 0.08074437826871872, + "learning_rate": 2.215883998233476e-05, + "loss": 0.0007, + "step": 18150 + }, + { + "epoch": 43.0, + "grad_norm": 0.002948348643258214, + "learning_rate": 2.2140438686883557e-05, + "loss": 0.0001, + "step": 18160 + }, + { + "epoch": 43.0, + "grad_norm": 0.006361374631524086, + "learning_rate": 2.2122037391432357e-05, + "loss": 0.0031, + "step": 18170 + }, + { + "epoch": 43.0, + "grad_norm": 9.16525936126709, + "learning_rate": 2.210363609598116e-05, + "loss": 0.0998, + "step": 18180 + }, + { + "epoch": 43.0, + "grad_norm": 19.75349998474121, + "learning_rate": 2.2085234800529957e-05, + "loss": 0.0581, + "step": 18190 + }, + { + "epoch": 43.0, + "grad_norm": 0.005757891573011875, + "learning_rate": 2.2066833505078757e-05, + "loss": 0.0717, + "step": 18200 + }, + { + "epoch": 43.0, + "grad_norm": 0.002932116389274597, + "learning_rate": 2.204843220962756e-05, + "loss": 0.1433, + "step": 18210 + }, + { + "epoch": 43.01, + "grad_norm": 0.030524814501404762, + "learning_rate": 2.2030030914176357e-05, + "loss": 0.0004, + "step": 18220 + }, + { + "epoch": 43.01, + "grad_norm": 0.13268819451332092, + "learning_rate": 2.201162961872516e-05, + "loss": 0.0009, + "step": 18230 + }, + { + "epoch": 43.01, + "grad_norm": 21.9083309173584, + "learning_rate": 2.199322832327396e-05, + "loss": 0.0293, + "step": 18240 + }, + { + "epoch": 43.01, + "grad_norm": 0.02300228551030159, + "learning_rate": 2.1974827027822758e-05, + "loss": 0.0245, + "step": 18250 + }, + { + "epoch": 43.01, + "grad_norm": 0.0050278110429644585, + "learning_rate": 2.195642573237156e-05, + "loss": 0.0001, + "step": 18260 + }, + { + "epoch": 43.01, + "grad_norm": 0.04535319283604622, + "learning_rate": 2.1938024436920358e-05, + "loss": 0.0991, + "step": 18270 + }, + { + "epoch": 43.01, + "grad_norm": 0.025395981967449188, + "learning_rate": 2.191962314146916e-05, + "loss": 0.0361, + "step": 18280 + }, + { + "epoch": 43.01, + "grad_norm": 0.09500641375780106, + "learning_rate": 2.190122184601796e-05, + "loss": 0.0028, + "step": 18290 + }, + { + "epoch": 43.01, + "grad_norm": 0.02161112241446972, + "learning_rate": 2.188282055056676e-05, + "loss": 0.0089, + "step": 18300 + }, + { + "epoch": 43.01, + "grad_norm": 0.03070366010069847, + "learning_rate": 2.1864419255115562e-05, + "loss": 0.1413, + "step": 18310 + }, + { + "epoch": 43.01, + "grad_norm": 0.0025216133799403906, + "learning_rate": 2.1846017959664362e-05, + "loss": 0.0003, + "step": 18320 + }, + { + "epoch": 43.01, + "grad_norm": 0.544791042804718, + "learning_rate": 2.1827616664213162e-05, + "loss": 0.0926, + "step": 18330 + }, + { + "epoch": 43.01, + "grad_norm": 0.08401042222976685, + "learning_rate": 2.1809215368761962e-05, + "loss": 0.0004, + "step": 18340 + }, + { + "epoch": 43.01, + "grad_norm": 0.05331547558307648, + "learning_rate": 2.1790814073310762e-05, + "loss": 0.057, + "step": 18350 + }, + { + "epoch": 43.01, + "grad_norm": 0.07863025367259979, + "learning_rate": 2.1772412777859562e-05, + "loss": 0.0571, + "step": 18360 + }, + { + "epoch": 43.01, + "grad_norm": 0.03373830392956734, + "learning_rate": 2.1754011482408363e-05, + "loss": 0.0863, + "step": 18370 + }, + { + "epoch": 43.01, + "grad_norm": 0.0050715371035039425, + "learning_rate": 2.1735610186957163e-05, + "loss": 0.0542, + "step": 18380 + }, + { + "epoch": 43.01, + "grad_norm": 0.019844966009259224, + "learning_rate": 2.1717208891505963e-05, + "loss": 0.033, + "step": 18390 + }, + { + "epoch": 43.01, + "grad_norm": 12.274739265441895, + "learning_rate": 2.1698807596054763e-05, + "loss": 0.0837, + "step": 18400 + }, + { + "epoch": 43.01, + "grad_norm": 0.0111334677785635, + "learning_rate": 2.1680406300603563e-05, + "loss": 0.0326, + "step": 18410 + }, + { + "epoch": 43.01, + "grad_norm": 0.011171177960932255, + "learning_rate": 2.1662005005152363e-05, + "loss": 0.0048, + "step": 18420 + }, + { + "epoch": 43.01, + "grad_norm": 2.7167227268218994, + "learning_rate": 2.1643603709701163e-05, + "loss": 0.0808, + "step": 18430 + }, + { + "epoch": 43.01, + "grad_norm": 0.005512547213584185, + "learning_rate": 2.1625202414249963e-05, + "loss": 0.0954, + "step": 18440 + }, + { + "epoch": 43.01, + "grad_norm": 0.008709631860256195, + "learning_rate": 2.1606801118798763e-05, + "loss": 0.0004, + "step": 18450 + }, + { + "epoch": 43.01, + "grad_norm": 24.202104568481445, + "learning_rate": 2.1588399823347567e-05, + "loss": 0.059, + "step": 18460 + }, + { + "epoch": 43.01, + "grad_norm": 0.03742791339755058, + "learning_rate": 2.1569998527896364e-05, + "loss": 0.0772, + "step": 18470 + }, + { + "epoch": 43.01, + "grad_norm": 0.00550016388297081, + "learning_rate": 2.1551597232445164e-05, + "loss": 0.0004, + "step": 18480 + }, + { + "epoch": 43.01, + "eval_accuracy": 0.7240990990990991, + "eval_loss": 1.6395900249481201, + "eval_runtime": 109.1588, + "eval_samples_per_second": 8.135, + "eval_steps_per_second": 0.678, + "step": 18480 + }, + { + "epoch": 44.0, + "grad_norm": 0.14148157835006714, + "learning_rate": 2.1533195936993967e-05, + "loss": 0.0578, + "step": 18490 + }, + { + "epoch": 44.0, + "grad_norm": 30.328266143798828, + "learning_rate": 2.1514794641542764e-05, + "loss": 0.1007, + "step": 18500 + }, + { + "epoch": 44.0, + "grad_norm": 0.004108451772481203, + "learning_rate": 2.1496393346091564e-05, + "loss": 0.0004, + "step": 18510 + }, + { + "epoch": 44.0, + "grad_norm": 0.011362356133759022, + "learning_rate": 2.1477992050640368e-05, + "loss": 0.0333, + "step": 18520 + }, + { + "epoch": 44.0, + "grad_norm": 0.02009459026157856, + "learning_rate": 2.1459590755189164e-05, + "loss": 0.0486, + "step": 18530 + }, + { + "epoch": 44.0, + "grad_norm": 0.031281691044569016, + "learning_rate": 2.1441189459737968e-05, + "loss": 0.0411, + "step": 18540 + }, + { + "epoch": 44.0, + "grad_norm": 0.006339728366583586, + "learning_rate": 2.1422788164286768e-05, + "loss": 0.0003, + "step": 18550 + }, + { + "epoch": 44.0, + "grad_norm": 0.00324187777005136, + "learning_rate": 2.1404386868835565e-05, + "loss": 0.0004, + "step": 18560 + }, + { + "epoch": 44.0, + "grad_norm": 47.98183822631836, + "learning_rate": 2.1385985573384368e-05, + "loss": 0.1047, + "step": 18570 + }, + { + "epoch": 44.0, + "grad_norm": 6.298222064971924, + "learning_rate": 2.136758427793317e-05, + "loss": 0.0573, + "step": 18580 + }, + { + "epoch": 44.0, + "grad_norm": 10.068635940551758, + "learning_rate": 2.134918298248197e-05, + "loss": 0.0494, + "step": 18590 + }, + { + "epoch": 44.0, + "grad_norm": 0.14003126323223114, + "learning_rate": 2.133078168703077e-05, + "loss": 0.0006, + "step": 18600 + }, + { + "epoch": 44.0, + "grad_norm": 0.020095407962799072, + "learning_rate": 2.131238039157957e-05, + "loss": 0.0159, + "step": 18610 + }, + { + "epoch": 44.0, + "grad_norm": 0.0022556742187589407, + "learning_rate": 2.129397909612837e-05, + "loss": 0.0012, + "step": 18620 + }, + { + "epoch": 44.0, + "grad_norm": 0.02804502658545971, + "learning_rate": 2.127557780067717e-05, + "loss": 0.0357, + "step": 18630 + }, + { + "epoch": 44.01, + "grad_norm": 0.009413721971213818, + "learning_rate": 2.125717650522597e-05, + "loss": 0.0004, + "step": 18640 + }, + { + "epoch": 44.01, + "grad_norm": 0.008668404072523117, + "learning_rate": 2.123877520977477e-05, + "loss": 0.0004, + "step": 18650 + }, + { + "epoch": 44.01, + "grad_norm": 0.006598074920475483, + "learning_rate": 2.122037391432357e-05, + "loss": 0.0004, + "step": 18660 + }, + { + "epoch": 44.01, + "grad_norm": 0.005593809299170971, + "learning_rate": 2.120197261887237e-05, + "loss": 0.038, + "step": 18670 + }, + { + "epoch": 44.01, + "grad_norm": 0.013844887726008892, + "learning_rate": 2.118357132342117e-05, + "loss": 0.0556, + "step": 18680 + }, + { + "epoch": 44.01, + "grad_norm": 0.032117415219545364, + "learning_rate": 2.116517002796997e-05, + "loss": 0.0303, + "step": 18690 + }, + { + "epoch": 44.01, + "grad_norm": 0.04788897559046745, + "learning_rate": 2.114676873251877e-05, + "loss": 0.0002, + "step": 18700 + }, + { + "epoch": 44.01, + "grad_norm": 0.0030235203448683023, + "learning_rate": 2.112836743706757e-05, + "loss": 0.0002, + "step": 18710 + }, + { + "epoch": 44.01, + "grad_norm": 0.00274168630130589, + "learning_rate": 2.1109966141616373e-05, + "loss": 0.0342, + "step": 18720 + }, + { + "epoch": 44.01, + "grad_norm": 3.2250986099243164, + "learning_rate": 2.109156484616517e-05, + "loss": 0.0497, + "step": 18730 + }, + { + "epoch": 44.01, + "grad_norm": 0.0157657228410244, + "learning_rate": 2.107316355071397e-05, + "loss": 0.1054, + "step": 18740 + }, + { + "epoch": 44.01, + "grad_norm": 0.0087531553581357, + "learning_rate": 2.1054762255262774e-05, + "loss": 0.0018, + "step": 18750 + }, + { + "epoch": 44.01, + "grad_norm": 0.17030996084213257, + "learning_rate": 2.103636095981157e-05, + "loss": 0.0336, + "step": 18760 + }, + { + "epoch": 44.01, + "grad_norm": 0.07173646986484528, + "learning_rate": 2.1017959664360374e-05, + "loss": 0.0003, + "step": 18770 + }, + { + "epoch": 44.01, + "grad_norm": 0.025418242439627647, + "learning_rate": 2.0999558368909174e-05, + "loss": 0.0097, + "step": 18780 + }, + { + "epoch": 44.01, + "grad_norm": 15.252046585083008, + "learning_rate": 2.098115707345797e-05, + "loss": 0.0543, + "step": 18790 + }, + { + "epoch": 44.01, + "grad_norm": 0.01682462729513645, + "learning_rate": 2.0962755778006774e-05, + "loss": 0.0023, + "step": 18800 + }, + { + "epoch": 44.01, + "grad_norm": 0.1344674676656723, + "learning_rate": 2.094435448255557e-05, + "loss": 0.0003, + "step": 18810 + }, + { + "epoch": 44.01, + "grad_norm": 0.01934729889035225, + "learning_rate": 2.092595318710437e-05, + "loss": 0.0003, + "step": 18820 + }, + { + "epoch": 44.01, + "grad_norm": 0.0010090465657413006, + "learning_rate": 2.0907551891653175e-05, + "loss": 0.0477, + "step": 18830 + }, + { + "epoch": 44.01, + "grad_norm": 0.005481057800352573, + "learning_rate": 2.088915059620197e-05, + "loss": 0.0017, + "step": 18840 + }, + { + "epoch": 44.01, + "grad_norm": 0.12130451202392578, + "learning_rate": 2.0870749300750775e-05, + "loss": 0.1554, + "step": 18850 + }, + { + "epoch": 44.01, + "grad_norm": 0.010078057646751404, + "learning_rate": 2.0852348005299575e-05, + "loss": 0.0126, + "step": 18860 + }, + { + "epoch": 44.01, + "grad_norm": 0.0075241439044475555, + "learning_rate": 2.0833946709848372e-05, + "loss": 0.0003, + "step": 18870 + }, + { + "epoch": 44.01, + "grad_norm": 0.03676333650946617, + "learning_rate": 2.0815545414397175e-05, + "loss": 0.0013, + "step": 18880 + }, + { + "epoch": 44.01, + "grad_norm": 0.003307629842311144, + "learning_rate": 2.0797144118945975e-05, + "loss": 0.0662, + "step": 18890 + }, + { + "epoch": 44.01, + "grad_norm": 0.13727083802223206, + "learning_rate": 2.0778742823494775e-05, + "loss": 0.0881, + "step": 18900 + }, + { + "epoch": 44.01, + "eval_accuracy": 0.7195945945945946, + "eval_loss": 1.6206213235855103, + "eval_runtime": 113.0431, + "eval_samples_per_second": 7.855, + "eval_steps_per_second": 0.655, + "step": 18900 + }, + { + "epoch": 45.0, + "grad_norm": 7.187080383300781, + "learning_rate": 2.0760341528043576e-05, + "loss": 0.0406, + "step": 18910 + }, + { + "epoch": 45.0, + "grad_norm": 1.3414313793182373, + "learning_rate": 2.0741940232592376e-05, + "loss": 0.0366, + "step": 18920 + }, + { + "epoch": 45.0, + "grad_norm": 4.305115222930908, + "learning_rate": 2.0723538937141176e-05, + "loss": 0.0339, + "step": 18930 + }, + { + "epoch": 45.0, + "grad_norm": 0.024366533383727074, + "learning_rate": 2.0705137641689976e-05, + "loss": 0.0021, + "step": 18940 + }, + { + "epoch": 45.0, + "grad_norm": 0.06154268607497215, + "learning_rate": 2.0686736346238776e-05, + "loss": 0.0437, + "step": 18950 + }, + { + "epoch": 45.0, + "grad_norm": 0.012441672384738922, + "learning_rate": 2.0668335050787576e-05, + "loss": 0.0033, + "step": 18960 + }, + { + "epoch": 45.0, + "grad_norm": 0.008273870684206486, + "learning_rate": 2.0649933755336376e-05, + "loss": 0.0003, + "step": 18970 + }, + { + "epoch": 45.0, + "grad_norm": 0.00964616984128952, + "learning_rate": 2.0631532459885176e-05, + "loss": 0.0002, + "step": 18980 + }, + { + "epoch": 45.0, + "grad_norm": 0.004747298080474138, + "learning_rate": 2.0613131164433977e-05, + "loss": 0.0002, + "step": 18990 + }, + { + "epoch": 45.0, + "grad_norm": 0.30449992418289185, + "learning_rate": 2.0594729868982777e-05, + "loss": 0.0005, + "step": 19000 + }, + { + "epoch": 45.0, + "grad_norm": 0.049932073801755905, + "learning_rate": 2.0576328573531577e-05, + "loss": 0.1286, + "step": 19010 + }, + { + "epoch": 45.0, + "grad_norm": 0.0017162609146907926, + "learning_rate": 2.0557927278080377e-05, + "loss": 0.0004, + "step": 19020 + }, + { + "epoch": 45.0, + "grad_norm": 0.0063346978276968, + "learning_rate": 2.053952598262918e-05, + "loss": 0.0256, + "step": 19030 + }, + { + "epoch": 45.0, + "grad_norm": 0.18762005865573883, + "learning_rate": 2.0521124687177977e-05, + "loss": 0.0018, + "step": 19040 + }, + { + "epoch": 45.0, + "grad_norm": 0.01104031503200531, + "learning_rate": 2.0502723391726777e-05, + "loss": 0.0636, + "step": 19050 + }, + { + "epoch": 45.01, + "grad_norm": 0.006080237217247486, + "learning_rate": 2.048432209627558e-05, + "loss": 0.001, + "step": 19060 + }, + { + "epoch": 45.01, + "grad_norm": 0.013174448162317276, + "learning_rate": 2.0465920800824377e-05, + "loss": 0.0052, + "step": 19070 + }, + { + "epoch": 45.01, + "grad_norm": 0.006308967713266611, + "learning_rate": 2.044751950537318e-05, + "loss": 0.0027, + "step": 19080 + }, + { + "epoch": 45.01, + "grad_norm": 0.018024206161499023, + "learning_rate": 2.042911820992198e-05, + "loss": 0.0003, + "step": 19090 + }, + { + "epoch": 45.01, + "grad_norm": 0.07368378341197968, + "learning_rate": 2.0410716914470778e-05, + "loss": 0.0002, + "step": 19100 + }, + { + "epoch": 45.01, + "grad_norm": 0.09977413713932037, + "learning_rate": 2.039231561901958e-05, + "loss": 0.0295, + "step": 19110 + }, + { + "epoch": 45.01, + "grad_norm": 0.03205341845750809, + "learning_rate": 2.037391432356838e-05, + "loss": 0.0007, + "step": 19120 + }, + { + "epoch": 45.01, + "grad_norm": 0.0013329902430996299, + "learning_rate": 2.0355513028117178e-05, + "loss": 0.0739, + "step": 19130 + }, + { + "epoch": 45.01, + "grad_norm": 0.014855876564979553, + "learning_rate": 2.033711173266598e-05, + "loss": 0.0005, + "step": 19140 + }, + { + "epoch": 45.01, + "grad_norm": 0.25314536690711975, + "learning_rate": 2.0318710437214782e-05, + "loss": 0.0372, + "step": 19150 + }, + { + "epoch": 45.01, + "grad_norm": 0.004318530671298504, + "learning_rate": 2.0300309141763582e-05, + "loss": 0.002, + "step": 19160 + }, + { + "epoch": 45.01, + "grad_norm": 0.0036497735418379307, + "learning_rate": 2.0281907846312382e-05, + "loss": 0.0032, + "step": 19170 + }, + { + "epoch": 45.01, + "grad_norm": 3.6807165145874023, + "learning_rate": 2.0263506550861182e-05, + "loss": 0.1226, + "step": 19180 + }, + { + "epoch": 45.01, + "grad_norm": 0.06369734555482864, + "learning_rate": 2.0245105255409982e-05, + "loss": 0.0439, + "step": 19190 + }, + { + "epoch": 45.01, + "grad_norm": 0.6539937257766724, + "learning_rate": 2.0226703959958782e-05, + "loss": 0.0295, + "step": 19200 + }, + { + "epoch": 45.01, + "grad_norm": 0.0202985480427742, + "learning_rate": 2.0208302664507582e-05, + "loss": 0.003, + "step": 19210 + }, + { + "epoch": 45.01, + "grad_norm": 0.010312013328075409, + "learning_rate": 2.0189901369056383e-05, + "loss": 0.0003, + "step": 19220 + }, + { + "epoch": 45.01, + "grad_norm": 1.6498684883117676, + "learning_rate": 2.0171500073605183e-05, + "loss": 0.0075, + "step": 19230 + }, + { + "epoch": 45.01, + "grad_norm": 0.006631897762417793, + "learning_rate": 2.0153098778153983e-05, + "loss": 0.0618, + "step": 19240 + }, + { + "epoch": 45.01, + "grad_norm": 0.02790352888405323, + "learning_rate": 2.0134697482702783e-05, + "loss": 0.0006, + "step": 19250 + }, + { + "epoch": 45.01, + "grad_norm": 0.013174889609217644, + "learning_rate": 2.0116296187251586e-05, + "loss": 0.0065, + "step": 19260 + }, + { + "epoch": 45.01, + "grad_norm": 0.003927392885088921, + "learning_rate": 2.0097894891800383e-05, + "loss": 0.022, + "step": 19270 + }, + { + "epoch": 45.01, + "grad_norm": 0.004552062135189772, + "learning_rate": 2.0079493596349183e-05, + "loss": 0.0006, + "step": 19280 + }, + { + "epoch": 45.01, + "grad_norm": 0.005525296088308096, + "learning_rate": 2.0061092300897987e-05, + "loss": 0.0004, + "step": 19290 + }, + { + "epoch": 45.01, + "grad_norm": 0.011126107536256313, + "learning_rate": 2.0042691005446783e-05, + "loss": 0.0321, + "step": 19300 + }, + { + "epoch": 45.01, + "grad_norm": 0.011895339004695415, + "learning_rate": 2.0024289709995584e-05, + "loss": 0.0009, + "step": 19310 + }, + { + "epoch": 45.01, + "grad_norm": 8.476682662963867, + "learning_rate": 2.0005888414544384e-05, + "loss": 0.0934, + "step": 19320 + }, + { + "epoch": 45.01, + "eval_accuracy": 0.7319819819819819, + "eval_loss": 1.6994054317474365, + "eval_runtime": 105.9213, + "eval_samples_per_second": 8.384, + "eval_steps_per_second": 0.699, + "step": 19320 + }, + { + "epoch": 46.0, + "grad_norm": 0.019197309389710426, + "learning_rate": 1.9987487119093184e-05, + "loss": 0.049, + "step": 19330 + }, + { + "epoch": 46.0, + "grad_norm": 0.0017492288025096059, + "learning_rate": 1.9969085823641987e-05, + "loss": 0.0658, + "step": 19340 + }, + { + "epoch": 46.0, + "grad_norm": 0.0031795764807611704, + "learning_rate": 1.9950684528190784e-05, + "loss": 0.0003, + "step": 19350 + }, + { + "epoch": 46.0, + "grad_norm": 0.01530479546636343, + "learning_rate": 1.9932283232739584e-05, + "loss": 0.0531, + "step": 19360 + }, + { + "epoch": 46.0, + "grad_norm": 0.04168014973402023, + "learning_rate": 1.9913881937288388e-05, + "loss": 0.0503, + "step": 19370 + }, + { + "epoch": 46.0, + "grad_norm": 0.08437560498714447, + "learning_rate": 1.9895480641837184e-05, + "loss": 0.0356, + "step": 19380 + }, + { + "epoch": 46.0, + "grad_norm": 0.010686563327908516, + "learning_rate": 1.9877079346385988e-05, + "loss": 0.0048, + "step": 19390 + }, + { + "epoch": 46.0, + "grad_norm": 0.3396773636341095, + "learning_rate": 1.9858678050934788e-05, + "loss": 0.0385, + "step": 19400 + }, + { + "epoch": 46.0, + "grad_norm": 0.06184344366192818, + "learning_rate": 1.9840276755483585e-05, + "loss": 0.0581, + "step": 19410 + }, + { + "epoch": 46.0, + "grad_norm": 0.01603071019053459, + "learning_rate": 1.9821875460032388e-05, + "loss": 0.0802, + "step": 19420 + }, + { + "epoch": 46.0, + "grad_norm": 0.007481284439563751, + "learning_rate": 1.980347416458119e-05, + "loss": 0.0004, + "step": 19430 + }, + { + "epoch": 46.0, + "grad_norm": 0.026033438742160797, + "learning_rate": 1.9785072869129985e-05, + "loss": 0.0027, + "step": 19440 + }, + { + "epoch": 46.0, + "grad_norm": 0.5898566246032715, + "learning_rate": 1.976667157367879e-05, + "loss": 0.0004, + "step": 19450 + }, + { + "epoch": 46.0, + "grad_norm": 0.003322466742247343, + "learning_rate": 1.974827027822759e-05, + "loss": 0.0003, + "step": 19460 + }, + { + "epoch": 46.0, + "grad_norm": 0.014266034588217735, + "learning_rate": 1.972986898277639e-05, + "loss": 0.0004, + "step": 19470 + }, + { + "epoch": 46.01, + "grad_norm": 0.005200342275202274, + "learning_rate": 1.971146768732519e-05, + "loss": 0.0002, + "step": 19480 + }, + { + "epoch": 46.01, + "grad_norm": 0.009922517463564873, + "learning_rate": 1.969306639187399e-05, + "loss": 0.0009, + "step": 19490 + }, + { + "epoch": 46.01, + "grad_norm": 0.011363714933395386, + "learning_rate": 1.967466509642279e-05, + "loss": 0.0295, + "step": 19500 + }, + { + "epoch": 46.01, + "grad_norm": 63.66260528564453, + "learning_rate": 1.965626380097159e-05, + "loss": 0.0362, + "step": 19510 + }, + { + "epoch": 46.01, + "grad_norm": 0.03335092216730118, + "learning_rate": 1.963786250552039e-05, + "loss": 0.0003, + "step": 19520 + }, + { + "epoch": 46.01, + "grad_norm": 0.0011501980479806662, + "learning_rate": 1.961946121006919e-05, + "loss": 0.0075, + "step": 19530 + }, + { + "epoch": 46.01, + "grad_norm": 0.00867602787911892, + "learning_rate": 1.960105991461799e-05, + "loss": 0.0001, + "step": 19540 + }, + { + "epoch": 46.01, + "grad_norm": 0.0034504039213061333, + "learning_rate": 1.958265861916679e-05, + "loss": 0.0002, + "step": 19550 + }, + { + "epoch": 46.01, + "grad_norm": 0.001728889998048544, + "learning_rate": 1.956425732371559e-05, + "loss": 0.0039, + "step": 19560 + }, + { + "epoch": 46.01, + "grad_norm": 0.0017645972548052669, + "learning_rate": 1.9545856028264393e-05, + "loss": 0.0396, + "step": 19570 + }, + { + "epoch": 46.01, + "grad_norm": 0.002970011904835701, + "learning_rate": 1.952745473281319e-05, + "loss": 0.0248, + "step": 19580 + }, + { + "epoch": 46.01, + "grad_norm": 0.01166481152176857, + "learning_rate": 1.950905343736199e-05, + "loss": 0.0003, + "step": 19590 + }, + { + "epoch": 46.01, + "grad_norm": 0.030790084972977638, + "learning_rate": 1.9490652141910794e-05, + "loss": 0.0669, + "step": 19600 + }, + { + "epoch": 46.01, + "grad_norm": 0.0027878263499587774, + "learning_rate": 1.947225084645959e-05, + "loss": 0.0031, + "step": 19610 + }, + { + "epoch": 46.01, + "grad_norm": 0.0010275563690811396, + "learning_rate": 1.945384955100839e-05, + "loss": 0.0001, + "step": 19620 + }, + { + "epoch": 46.01, + "grad_norm": 0.004202474374324083, + "learning_rate": 1.9435448255557194e-05, + "loss": 0.0266, + "step": 19630 + }, + { + "epoch": 46.01, + "grad_norm": 1.82235586643219, + "learning_rate": 1.941704696010599e-05, + "loss": 0.0292, + "step": 19640 + }, + { + "epoch": 46.01, + "grad_norm": 0.044389884918928146, + "learning_rate": 1.9398645664654794e-05, + "loss": 0.0566, + "step": 19650 + }, + { + "epoch": 46.01, + "grad_norm": 0.029817450791597366, + "learning_rate": 1.9380244369203594e-05, + "loss": 0.0018, + "step": 19660 + }, + { + "epoch": 46.01, + "grad_norm": 0.0017704556230455637, + "learning_rate": 1.936184307375239e-05, + "loss": 0.0367, + "step": 19670 + }, + { + "epoch": 46.01, + "grad_norm": 0.004369074944406748, + "learning_rate": 1.9343441778301195e-05, + "loss": 0.0005, + "step": 19680 + }, + { + "epoch": 46.01, + "grad_norm": 0.0013988850405439734, + "learning_rate": 1.9325040482849995e-05, + "loss": 0.0002, + "step": 19690 + }, + { + "epoch": 46.01, + "grad_norm": 0.0056244307197630405, + "learning_rate": 1.9306639187398795e-05, + "loss": 0.0001, + "step": 19700 + }, + { + "epoch": 46.01, + "grad_norm": 0.0017847019480541348, + "learning_rate": 1.9288237891947595e-05, + "loss": 0.0001, + "step": 19710 + }, + { + "epoch": 46.01, + "grad_norm": 0.038331713527441025, + "learning_rate": 1.9269836596496392e-05, + "loss": 0.0002, + "step": 19720 + }, + { + "epoch": 46.01, + "grad_norm": 0.0021360579412430525, + "learning_rate": 1.9251435301045195e-05, + "loss": 0.0001, + "step": 19730 + }, + { + "epoch": 46.01, + "grad_norm": 0.0037925804499536753, + "learning_rate": 1.9233034005593995e-05, + "loss": 0.0001, + "step": 19740 + }, + { + "epoch": 46.01, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.0068118572235107, + "eval_runtime": 115.1092, + "eval_samples_per_second": 7.714, + "eval_steps_per_second": 0.643, + "step": 19740 + }, + { + "epoch": 47.0, + "grad_norm": 88.71112060546875, + "learning_rate": 1.9214632710142792e-05, + "loss": 0.0471, + "step": 19750 + }, + { + "epoch": 47.0, + "grad_norm": 0.0014459657249972224, + "learning_rate": 1.9196231414691596e-05, + "loss": 0.0006, + "step": 19760 + }, + { + "epoch": 47.0, + "grad_norm": 0.0023679425939917564, + "learning_rate": 1.9177830119240396e-05, + "loss": 0.0557, + "step": 19770 + }, + { + "epoch": 47.0, + "grad_norm": 63.74143600463867, + "learning_rate": 1.9159428823789196e-05, + "loss": 0.0349, + "step": 19780 + }, + { + "epoch": 47.0, + "grad_norm": 0.000937833683565259, + "learning_rate": 1.9141027528337996e-05, + "loss": 0.0002, + "step": 19790 + }, + { + "epoch": 47.0, + "grad_norm": 0.0016346214106306434, + "learning_rate": 1.9122626232886796e-05, + "loss": 0.0003, + "step": 19800 + }, + { + "epoch": 47.0, + "grad_norm": 0.002341426908969879, + "learning_rate": 1.9104224937435596e-05, + "loss": 0.0043, + "step": 19810 + }, + { + "epoch": 47.0, + "grad_norm": 0.08337362110614777, + "learning_rate": 1.9085823641984396e-05, + "loss": 0.0002, + "step": 19820 + }, + { + "epoch": 47.0, + "grad_norm": 0.04266023263335228, + "learning_rate": 1.9067422346533196e-05, + "loss": 0.0002, + "step": 19830 + }, + { + "epoch": 47.0, + "grad_norm": 0.001009949017316103, + "learning_rate": 1.9049021051081996e-05, + "loss": 0.0001, + "step": 19840 + }, + { + "epoch": 47.0, + "grad_norm": 0.003605714999139309, + "learning_rate": 1.9030619755630797e-05, + "loss": 0.0004, + "step": 19850 + }, + { + "epoch": 47.0, + "grad_norm": 0.00593281164765358, + "learning_rate": 1.9012218460179597e-05, + "loss": 0.0338, + "step": 19860 + }, + { + "epoch": 47.0, + "grad_norm": 29.594966888427734, + "learning_rate": 1.8993817164728397e-05, + "loss": 0.0494, + "step": 19870 + }, + { + "epoch": 47.0, + "grad_norm": 0.005202536471188068, + "learning_rate": 1.89754158692772e-05, + "loss": 0.0797, + "step": 19880 + }, + { + "epoch": 47.0, + "grad_norm": 0.0021886222530156374, + "learning_rate": 1.8957014573825997e-05, + "loss": 0.0355, + "step": 19890 + }, + { + "epoch": 47.01, + "grad_norm": 0.0017879304941743612, + "learning_rate": 1.8938613278374797e-05, + "loss": 0.0598, + "step": 19900 + }, + { + "epoch": 47.01, + "grad_norm": 0.037195175886154175, + "learning_rate": 1.89202119829236e-05, + "loss": 0.0307, + "step": 19910 + }, + { + "epoch": 47.01, + "grad_norm": 0.002196249086409807, + "learning_rate": 1.8901810687472397e-05, + "loss": 0.0747, + "step": 19920 + }, + { + "epoch": 47.01, + "grad_norm": 0.3126240670681, + "learning_rate": 1.8883409392021198e-05, + "loss": 0.0001, + "step": 19930 + }, + { + "epoch": 47.01, + "grad_norm": 0.003922034986317158, + "learning_rate": 1.886500809657e-05, + "loss": 0.0002, + "step": 19940 + }, + { + "epoch": 47.01, + "grad_norm": 0.00338058453053236, + "learning_rate": 1.8846606801118798e-05, + "loss": 0.0572, + "step": 19950 + }, + { + "epoch": 47.01, + "grad_norm": 10.452116012573242, + "learning_rate": 1.88282055056676e-05, + "loss": 0.1096, + "step": 19960 + }, + { + "epoch": 47.01, + "grad_norm": 0.010865331627428532, + "learning_rate": 1.88098042102164e-05, + "loss": 0.0158, + "step": 19970 + }, + { + "epoch": 47.01, + "grad_norm": 2.9410135746002197, + "learning_rate": 1.8791402914765198e-05, + "loss": 0.0097, + "step": 19980 + }, + { + "epoch": 47.01, + "grad_norm": 103.67179870605469, + "learning_rate": 1.8773001619314e-05, + "loss": 0.1498, + "step": 19990 + }, + { + "epoch": 47.01, + "grad_norm": 0.005131885409355164, + "learning_rate": 1.8754600323862802e-05, + "loss": 0.0751, + "step": 20000 + }, + { + "epoch": 47.01, + "grad_norm": 0.003077390603721142, + "learning_rate": 1.8736199028411602e-05, + "loss": 0.0021, + "step": 20010 + }, + { + "epoch": 47.01, + "grad_norm": 0.014272770844399929, + "learning_rate": 1.8717797732960402e-05, + "loss": 0.0003, + "step": 20020 + }, + { + "epoch": 47.01, + "grad_norm": 0.05707676336169243, + "learning_rate": 1.8699396437509202e-05, + "loss": 0.118, + "step": 20030 + }, + { + "epoch": 47.01, + "grad_norm": 0.009387039579451084, + "learning_rate": 1.8680995142058002e-05, + "loss": 0.0003, + "step": 20040 + }, + { + "epoch": 47.01, + "grad_norm": 0.0015558353625237942, + "learning_rate": 1.8662593846606802e-05, + "loss": 0.0006, + "step": 20050 + }, + { + "epoch": 47.01, + "grad_norm": 0.0268620066344738, + "learning_rate": 1.8644192551155602e-05, + "loss": 0.026, + "step": 20060 + }, + { + "epoch": 47.01, + "grad_norm": 0.0657452940940857, + "learning_rate": 1.8625791255704403e-05, + "loss": 0.0004, + "step": 20070 + }, + { + "epoch": 47.01, + "grad_norm": 0.005338532850146294, + "learning_rate": 1.8607389960253203e-05, + "loss": 0.0014, + "step": 20080 + }, + { + "epoch": 47.01, + "grad_norm": 0.005034546833485365, + "learning_rate": 1.8588988664802003e-05, + "loss": 0.053, + "step": 20090 + }, + { + "epoch": 47.01, + "grad_norm": 0.002866822760552168, + "learning_rate": 1.8570587369350803e-05, + "loss": 0.0631, + "step": 20100 + }, + { + "epoch": 47.01, + "grad_norm": 0.008067389018833637, + "learning_rate": 1.8552186073899603e-05, + "loss": 0.0005, + "step": 20110 + }, + { + "epoch": 47.01, + "grad_norm": 81.31429290771484, + "learning_rate": 1.8533784778448403e-05, + "loss": 0.0132, + "step": 20120 + }, + { + "epoch": 47.01, + "grad_norm": 22.95867347717285, + "learning_rate": 1.8515383482997203e-05, + "loss": 0.0539, + "step": 20130 + }, + { + "epoch": 47.01, + "grad_norm": 2.4549527168273926, + "learning_rate": 1.8496982187546007e-05, + "loss": 0.1012, + "step": 20140 + }, + { + "epoch": 47.01, + "grad_norm": 0.003576475428417325, + "learning_rate": 1.8478580892094803e-05, + "loss": 0.0006, + "step": 20150 + }, + { + "epoch": 47.01, + "grad_norm": 0.0023642319720238447, + "learning_rate": 1.8460179596643604e-05, + "loss": 0.024, + "step": 20160 + }, + { + "epoch": 47.01, + "eval_accuracy": 0.7376126126126126, + "eval_loss": 1.534989595413208, + "eval_runtime": 67.2041, + "eval_samples_per_second": 13.213, + "eval_steps_per_second": 1.101, + "step": 20160 + }, + { + "epoch": 48.0, + "grad_norm": 0.37329983711242676, + "learning_rate": 1.8441778301192407e-05, + "loss": 0.0007, + "step": 20170 + }, + { + "epoch": 48.0, + "grad_norm": 0.17028795182704926, + "learning_rate": 1.8423377005741204e-05, + "loss": 0.0557, + "step": 20180 + }, + { + "epoch": 48.0, + "grad_norm": 0.010627706535160542, + "learning_rate": 1.8404975710290007e-05, + "loss": 0.0456, + "step": 20190 + }, + { + "epoch": 48.0, + "grad_norm": 0.0034769929479807615, + "learning_rate": 1.8386574414838807e-05, + "loss": 0.0014, + "step": 20200 + }, + { + "epoch": 48.0, + "grad_norm": 0.016666559502482414, + "learning_rate": 1.8368173119387604e-05, + "loss": 0.0011, + "step": 20210 + }, + { + "epoch": 48.0, + "grad_norm": 25.03653907775879, + "learning_rate": 1.8349771823936408e-05, + "loss": 0.0389, + "step": 20220 + }, + { + "epoch": 48.0, + "grad_norm": 0.004017295315861702, + "learning_rate": 1.8331370528485208e-05, + "loss": 0.0222, + "step": 20230 + }, + { + "epoch": 48.0, + "grad_norm": 0.1146978884935379, + "learning_rate": 1.8312969233034004e-05, + "loss": 0.0051, + "step": 20240 + }, + { + "epoch": 48.0, + "grad_norm": 0.036740757524967194, + "learning_rate": 1.8294567937582808e-05, + "loss": 0.051, + "step": 20250 + }, + { + "epoch": 48.0, + "grad_norm": 0.011806732974946499, + "learning_rate": 1.8276166642131605e-05, + "loss": 0.0082, + "step": 20260 + }, + { + "epoch": 48.0, + "grad_norm": 0.007759864907711744, + "learning_rate": 1.8257765346680408e-05, + "loss": 0.0133, + "step": 20270 + }, + { + "epoch": 48.0, + "grad_norm": 0.03835444524884224, + "learning_rate": 1.823936405122921e-05, + "loss": 0.0007, + "step": 20280 + }, + { + "epoch": 48.0, + "grad_norm": 0.0012061005691066384, + "learning_rate": 1.8220962755778005e-05, + "loss": 0.0247, + "step": 20290 + }, + { + "epoch": 48.0, + "grad_norm": 0.005193190183490515, + "learning_rate": 1.820256146032681e-05, + "loss": 0.0076, + "step": 20300 + }, + { + "epoch": 48.0, + "grad_norm": 0.002150206360965967, + "learning_rate": 1.818416016487561e-05, + "loss": 0.0001, + "step": 20310 + }, + { + "epoch": 48.01, + "grad_norm": 0.0028511222917586565, + "learning_rate": 1.816575886942441e-05, + "loss": 0.0002, + "step": 20320 + }, + { + "epoch": 48.01, + "grad_norm": 0.002456626622006297, + "learning_rate": 1.814735757397321e-05, + "loss": 0.0002, + "step": 20330 + }, + { + "epoch": 48.01, + "grad_norm": 0.034909721463918686, + "learning_rate": 1.812895627852201e-05, + "loss": 0.0001, + "step": 20340 + }, + { + "epoch": 48.01, + "grad_norm": 0.008975083939731121, + "learning_rate": 1.811055498307081e-05, + "loss": 0.0002, + "step": 20350 + }, + { + "epoch": 48.01, + "grad_norm": 0.0032727932557463646, + "learning_rate": 1.809215368761961e-05, + "loss": 0.0001, + "step": 20360 + }, + { + "epoch": 48.01, + "grad_norm": 0.005578655283898115, + "learning_rate": 1.807375239216841e-05, + "loss": 0.0484, + "step": 20370 + }, + { + "epoch": 48.01, + "grad_norm": 0.006554843857884407, + "learning_rate": 1.805535109671721e-05, + "loss": 0.0001, + "step": 20380 + }, + { + "epoch": 48.01, + "grad_norm": 0.0009955121204257011, + "learning_rate": 1.803694980126601e-05, + "loss": 0.0346, + "step": 20390 + }, + { + "epoch": 48.01, + "grad_norm": 0.008674697019159794, + "learning_rate": 1.801854850581481e-05, + "loss": 0.0001, + "step": 20400 + }, + { + "epoch": 48.01, + "grad_norm": 0.13496126234531403, + "learning_rate": 1.800014721036361e-05, + "loss": 0.0169, + "step": 20410 + }, + { + "epoch": 48.01, + "grad_norm": 0.00233973260037601, + "learning_rate": 1.798174591491241e-05, + "loss": 0.0003, + "step": 20420 + }, + { + "epoch": 48.01, + "grad_norm": 0.01351605448871851, + "learning_rate": 1.796334461946121e-05, + "loss": 0.0487, + "step": 20430 + }, + { + "epoch": 48.01, + "grad_norm": 0.003713756799697876, + "learning_rate": 1.794494332401001e-05, + "loss": 0.0001, + "step": 20440 + }, + { + "epoch": 48.01, + "grad_norm": 0.004229304380714893, + "learning_rate": 1.7926542028558814e-05, + "loss": 0.0558, + "step": 20450 + }, + { + "epoch": 48.01, + "grad_norm": 0.0006775284418836236, + "learning_rate": 1.790814073310761e-05, + "loss": 0.001, + "step": 20460 + }, + { + "epoch": 48.01, + "grad_norm": 0.0010827960213646293, + "learning_rate": 1.788973943765641e-05, + "loss": 0.0223, + "step": 20470 + }, + { + "epoch": 48.01, + "grad_norm": 0.0010313157690688968, + "learning_rate": 1.7871338142205214e-05, + "loss": 0.1121, + "step": 20480 + }, + { + "epoch": 48.01, + "grad_norm": 2.7552216053009033, + "learning_rate": 1.785293684675401e-05, + "loss": 0.0005, + "step": 20490 + }, + { + "epoch": 48.01, + "grad_norm": 0.016195788979530334, + "learning_rate": 1.7834535551302814e-05, + "loss": 0.0009, + "step": 20500 + }, + { + "epoch": 48.01, + "grad_norm": 0.0725574940443039, + "learning_rate": 1.7816134255851614e-05, + "loss": 0.0002, + "step": 20510 + }, + { + "epoch": 48.01, + "grad_norm": 0.0008012351463548839, + "learning_rate": 1.779773296040041e-05, + "loss": 0.0289, + "step": 20520 + }, + { + "epoch": 48.01, + "grad_norm": 0.0045431689359247684, + "learning_rate": 1.7779331664949215e-05, + "loss": 0.0001, + "step": 20530 + }, + { + "epoch": 48.01, + "grad_norm": 0.0012236082693561912, + "learning_rate": 1.7760930369498015e-05, + "loss": 0.005, + "step": 20540 + }, + { + "epoch": 48.01, + "grad_norm": 3.1969592571258545, + "learning_rate": 1.774252907404681e-05, + "loss": 0.0007, + "step": 20550 + }, + { + "epoch": 48.01, + "grad_norm": 0.0031280736438930035, + "learning_rate": 1.7724127778595615e-05, + "loss": 0.0191, + "step": 20560 + }, + { + "epoch": 48.01, + "grad_norm": 0.12724265456199646, + "learning_rate": 1.7705726483144415e-05, + "loss": 0.0004, + "step": 20570 + }, + { + "epoch": 48.01, + "grad_norm": 1.0310579538345337, + "learning_rate": 1.7687325187693215e-05, + "loss": 0.017, + "step": 20580 + }, + { + "epoch": 48.01, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 1.886398434638977, + "eval_runtime": 40.674, + "eval_samples_per_second": 21.832, + "eval_steps_per_second": 1.819, + "step": 20580 + }, + { + "epoch": 49.0, + "grad_norm": 0.3085496127605438, + "learning_rate": 1.7668923892242015e-05, + "loss": 0.0078, + "step": 20590 + }, + { + "epoch": 49.0, + "grad_norm": 0.0024839958641678095, + "learning_rate": 1.7650522596790815e-05, + "loss": 0.0003, + "step": 20600 + }, + { + "epoch": 49.0, + "grad_norm": 0.0012105575297027826, + "learning_rate": 1.7632121301339616e-05, + "loss": 0.0001, + "step": 20610 + }, + { + "epoch": 49.0, + "grad_norm": 1.1235806941986084, + "learning_rate": 1.7613720005888416e-05, + "loss": 0.0165, + "step": 20620 + }, + { + "epoch": 49.0, + "grad_norm": 0.0009100966854020953, + "learning_rate": 1.7595318710437216e-05, + "loss": 0.0625, + "step": 20630 + }, + { + "epoch": 49.0, + "grad_norm": 2.2181501388549805, + "learning_rate": 1.7576917414986016e-05, + "loss": 0.0104, + "step": 20640 + }, + { + "epoch": 49.0, + "grad_norm": 0.11276418715715408, + "learning_rate": 1.7558516119534816e-05, + "loss": 0.0056, + "step": 20650 + }, + { + "epoch": 49.0, + "grad_norm": 0.009900757111608982, + "learning_rate": 1.7540114824083616e-05, + "loss": 0.0001, + "step": 20660 + }, + { + "epoch": 49.0, + "grad_norm": 0.016009271144866943, + "learning_rate": 1.7521713528632416e-05, + "loss": 0.0001, + "step": 20670 + }, + { + "epoch": 49.0, + "grad_norm": 52.463294982910156, + "learning_rate": 1.7503312233181216e-05, + "loss": 0.0275, + "step": 20680 + }, + { + "epoch": 49.0, + "grad_norm": 42.01731491088867, + "learning_rate": 1.7484910937730016e-05, + "loss": 0.0424, + "step": 20690 + }, + { + "epoch": 49.0, + "grad_norm": 0.002209991682320833, + "learning_rate": 1.7466509642278817e-05, + "loss": 0.0118, + "step": 20700 + }, + { + "epoch": 49.0, + "grad_norm": 0.0023402757942676544, + "learning_rate": 1.744810834682762e-05, + "loss": 0.0001, + "step": 20710 + }, + { + "epoch": 49.0, + "grad_norm": 0.0017410296713933349, + "learning_rate": 1.7429707051376417e-05, + "loss": 0.0016, + "step": 20720 + }, + { + "epoch": 49.0, + "grad_norm": 2.1473774909973145, + "learning_rate": 1.7411305755925217e-05, + "loss": 0.0471, + "step": 20730 + }, + { + "epoch": 49.01, + "grad_norm": 0.0010794149711728096, + "learning_rate": 1.739290446047402e-05, + "loss": 0.0644, + "step": 20740 + }, + { + "epoch": 49.01, + "grad_norm": 0.0012719827936962247, + "learning_rate": 1.7374503165022817e-05, + "loss": 0.0001, + "step": 20750 + }, + { + "epoch": 49.01, + "grad_norm": 0.0008286166121251881, + "learning_rate": 1.735610186957162e-05, + "loss": 0.0004, + "step": 20760 + }, + { + "epoch": 49.01, + "grad_norm": 0.0054702237248420715, + "learning_rate": 1.7337700574120417e-05, + "loss": 0.0002, + "step": 20770 + }, + { + "epoch": 49.01, + "grad_norm": 0.009072530083358288, + "learning_rate": 1.7319299278669217e-05, + "loss": 0.0676, + "step": 20780 + }, + { + "epoch": 49.01, + "grad_norm": 0.0006656598416157067, + "learning_rate": 1.730089798321802e-05, + "loss": 0.0066, + "step": 20790 + }, + { + "epoch": 49.01, + "grad_norm": 0.08233454823493958, + "learning_rate": 1.7282496687766818e-05, + "loss": 0.0001, + "step": 20800 + }, + { + "epoch": 49.01, + "grad_norm": 0.0012012611841782928, + "learning_rate": 1.726409539231562e-05, + "loss": 0.0001, + "step": 20810 + }, + { + "epoch": 49.01, + "grad_norm": 2.6688947677612305, + "learning_rate": 1.724569409686442e-05, + "loss": 0.105, + "step": 20820 + }, + { + "epoch": 49.01, + "grad_norm": 0.002076583681628108, + "learning_rate": 1.7227292801413218e-05, + "loss": 0.0192, + "step": 20830 + }, + { + "epoch": 49.01, + "grad_norm": 0.0016304003074765205, + "learning_rate": 1.720889150596202e-05, + "loss": 0.0001, + "step": 20840 + }, + { + "epoch": 49.01, + "grad_norm": 0.05900062620639801, + "learning_rate": 1.7190490210510822e-05, + "loss": 0.0016, + "step": 20850 + }, + { + "epoch": 49.01, + "grad_norm": 0.00635264627635479, + "learning_rate": 1.717208891505962e-05, + "loss": 0.0001, + "step": 20860 + }, + { + "epoch": 49.01, + "grad_norm": 67.24282836914062, + "learning_rate": 1.7153687619608422e-05, + "loss": 0.0191, + "step": 20870 + }, + { + "epoch": 49.01, + "grad_norm": 0.04959714412689209, + "learning_rate": 1.7135286324157222e-05, + "loss": 0.0266, + "step": 20880 + }, + { + "epoch": 49.01, + "grad_norm": 11.01009464263916, + "learning_rate": 1.7116885028706022e-05, + "loss": 0.0656, + "step": 20890 + }, + { + "epoch": 49.01, + "grad_norm": 0.004300578963011503, + "learning_rate": 1.7098483733254822e-05, + "loss": 0.0293, + "step": 20900 + }, + { + "epoch": 49.01, + "grad_norm": 0.08246956020593643, + "learning_rate": 1.7080082437803622e-05, + "loss": 0.0195, + "step": 20910 + }, + { + "epoch": 49.01, + "grad_norm": 0.005194004625082016, + "learning_rate": 1.7061681142352423e-05, + "loss": 0.0145, + "step": 20920 + }, + { + "epoch": 49.01, + "grad_norm": 0.007515768054872751, + "learning_rate": 1.7043279846901223e-05, + "loss": 0.0472, + "step": 20930 + }, + { + "epoch": 49.01, + "grad_norm": 0.0014940837863832712, + "learning_rate": 1.7024878551450023e-05, + "loss": 0.0532, + "step": 20940 + }, + { + "epoch": 49.01, + "grad_norm": 0.017531683668494225, + "learning_rate": 1.7006477255998823e-05, + "loss": 0.0049, + "step": 20950 + }, + { + "epoch": 49.01, + "grad_norm": 0.004585204645991325, + "learning_rate": 1.6988075960547623e-05, + "loss": 0.0143, + "step": 20960 + }, + { + "epoch": 49.01, + "grad_norm": 0.002103234874084592, + "learning_rate": 1.6969674665096423e-05, + "loss": 0.0896, + "step": 20970 + }, + { + "epoch": 49.01, + "grad_norm": 0.0011310140835121274, + "learning_rate": 1.6951273369645223e-05, + "loss": 0.0074, + "step": 20980 + }, + { + "epoch": 49.01, + "grad_norm": 0.04580579325556755, + "learning_rate": 1.6932872074194023e-05, + "loss": 0.0733, + "step": 20990 + }, + { + "epoch": 49.01, + "grad_norm": 0.039697013795375824, + "learning_rate": 1.6914470778742823e-05, + "loss": 0.121, + "step": 21000 + }, + { + "epoch": 49.01, + "eval_accuracy": 0.722972972972973, + "eval_loss": 1.6861770153045654, + "eval_runtime": 43.5753, + "eval_samples_per_second": 20.379, + "eval_steps_per_second": 1.698, + "step": 21000 + }, + { + "epoch": 50.0, + "grad_norm": 0.003826475003734231, + "learning_rate": 1.6896069483291624e-05, + "loss": 0.0202, + "step": 21010 + }, + { + "epoch": 50.0, + "grad_norm": 0.016017960384488106, + "learning_rate": 1.6877668187840427e-05, + "loss": 0.0314, + "step": 21020 + }, + { + "epoch": 50.0, + "grad_norm": 34.23000717163086, + "learning_rate": 1.6859266892389224e-05, + "loss": 0.0168, + "step": 21030 + }, + { + "epoch": 50.0, + "grad_norm": 11.643963813781738, + "learning_rate": 1.6840865596938024e-05, + "loss": 0.0135, + "step": 21040 + }, + { + "epoch": 50.0, + "grad_norm": 0.32508620619773865, + "learning_rate": 1.6822464301486827e-05, + "loss": 0.0003, + "step": 21050 + }, + { + "epoch": 50.0, + "grad_norm": 2.0690901279449463, + "learning_rate": 1.6804063006035624e-05, + "loss": 0.0008, + "step": 21060 + }, + { + "epoch": 50.0, + "grad_norm": 0.6246060132980347, + "learning_rate": 1.6785661710584428e-05, + "loss": 0.003, + "step": 21070 + }, + { + "epoch": 50.0, + "grad_norm": 0.011646988801658154, + "learning_rate": 1.6767260415133228e-05, + "loss": 0.0127, + "step": 21080 + }, + { + "epoch": 50.0, + "grad_norm": 0.003978679422289133, + "learning_rate": 1.6748859119682024e-05, + "loss": 0.0003, + "step": 21090 + }, + { + "epoch": 50.0, + "grad_norm": 0.0010175154311582446, + "learning_rate": 1.6730457824230828e-05, + "loss": 0.0001, + "step": 21100 + }, + { + "epoch": 50.0, + "grad_norm": 0.002983462531119585, + "learning_rate": 1.6712056528779628e-05, + "loss": 0.0001, + "step": 21110 + }, + { + "epoch": 50.0, + "grad_norm": 0.0006442716694436967, + "learning_rate": 1.6693655233328425e-05, + "loss": 0.0001, + "step": 21120 + }, + { + "epoch": 50.0, + "grad_norm": 13.063587188720703, + "learning_rate": 1.667525393787723e-05, + "loss": 0.0338, + "step": 21130 + }, + { + "epoch": 50.0, + "grad_norm": 72.1922836303711, + "learning_rate": 1.665685264242603e-05, + "loss": 0.0071, + "step": 21140 + }, + { + "epoch": 50.0, + "grad_norm": 0.007421460468322039, + "learning_rate": 1.663845134697483e-05, + "loss": 0.0001, + "step": 21150 + }, + { + "epoch": 50.01, + "grad_norm": 0.020990287885069847, + "learning_rate": 1.662005005152363e-05, + "loss": 0.0267, + "step": 21160 + }, + { + "epoch": 50.01, + "grad_norm": 0.009370788931846619, + "learning_rate": 1.660164875607243e-05, + "loss": 0.054, + "step": 21170 + }, + { + "epoch": 50.01, + "grad_norm": 0.0025914448779076338, + "learning_rate": 1.658324746062123e-05, + "loss": 0.0002, + "step": 21180 + }, + { + "epoch": 50.01, + "grad_norm": 0.0516793355345726, + "learning_rate": 1.656484616517003e-05, + "loss": 0.0003, + "step": 21190 + }, + { + "epoch": 50.01, + "grad_norm": 0.0009586882079020143, + "learning_rate": 1.654644486971883e-05, + "loss": 0.0002, + "step": 21200 + }, + { + "epoch": 50.01, + "grad_norm": 0.06358382105827332, + "learning_rate": 1.652804357426763e-05, + "loss": 0.0291, + "step": 21210 + }, + { + "epoch": 50.01, + "grad_norm": 0.0018362919799983501, + "learning_rate": 1.650964227881643e-05, + "loss": 0.0149, + "step": 21220 + }, + { + "epoch": 50.01, + "grad_norm": 1.6524035930633545, + "learning_rate": 1.649124098336523e-05, + "loss": 0.0015, + "step": 21230 + }, + { + "epoch": 50.01, + "grad_norm": 0.03593685105443001, + "learning_rate": 1.647283968791403e-05, + "loss": 0.0425, + "step": 21240 + }, + { + "epoch": 50.01, + "grad_norm": 0.004203982185572386, + "learning_rate": 1.6454438392462833e-05, + "loss": 0.0003, + "step": 21250 + }, + { + "epoch": 50.01, + "grad_norm": 0.0018243154045194387, + "learning_rate": 1.643603709701163e-05, + "loss": 0.0001, + "step": 21260 + }, + { + "epoch": 50.01, + "grad_norm": 0.023052990436553955, + "learning_rate": 1.641763580156043e-05, + "loss": 0.0927, + "step": 21270 + }, + { + "epoch": 50.01, + "grad_norm": 0.0010030419798567891, + "learning_rate": 1.639923450610923e-05, + "loss": 0.0499, + "step": 21280 + }, + { + "epoch": 50.01, + "grad_norm": 0.0008574148523621261, + "learning_rate": 1.638083321065803e-05, + "loss": 0.0018, + "step": 21290 + }, + { + "epoch": 50.01, + "grad_norm": 0.005586415994912386, + "learning_rate": 1.636243191520683e-05, + "loss": 0.0209, + "step": 21300 + }, + { + "epoch": 50.01, + "grad_norm": 0.0025664528366178274, + "learning_rate": 1.634403061975563e-05, + "loss": 0.0015, + "step": 21310 + }, + { + "epoch": 50.01, + "grad_norm": 0.0018880977295339108, + "learning_rate": 1.632562932430443e-05, + "loss": 0.0324, + "step": 21320 + }, + { + "epoch": 50.01, + "grad_norm": 0.1711210310459137, + "learning_rate": 1.6307228028853234e-05, + "loss": 0.0003, + "step": 21330 + }, + { + "epoch": 50.01, + "grad_norm": 0.012983834370970726, + "learning_rate": 1.628882673340203e-05, + "loss": 0.0074, + "step": 21340 + }, + { + "epoch": 50.01, + "grad_norm": 0.003905842313542962, + "learning_rate": 1.627042543795083e-05, + "loss": 0.0001, + "step": 21350 + }, + { + "epoch": 50.01, + "grad_norm": 0.01312506664544344, + "learning_rate": 1.6252024142499634e-05, + "loss": 0.0517, + "step": 21360 + }, + { + "epoch": 50.01, + "grad_norm": 0.0010434292489662766, + "learning_rate": 1.623362284704843e-05, + "loss": 0.0655, + "step": 21370 + }, + { + "epoch": 50.01, + "grad_norm": 0.006396695505827665, + "learning_rate": 1.6215221551597235e-05, + "loss": 0.01, + "step": 21380 + }, + { + "epoch": 50.01, + "grad_norm": 0.19382664561271667, + "learning_rate": 1.6196820256146035e-05, + "loss": 0.0044, + "step": 21390 + }, + { + "epoch": 50.01, + "grad_norm": 0.010313979350030422, + "learning_rate": 1.617841896069483e-05, + "loss": 0.0407, + "step": 21400 + }, + { + "epoch": 50.01, + "grad_norm": 0.003178203012794256, + "learning_rate": 1.6160017665243635e-05, + "loss": 0.0123, + "step": 21410 + }, + { + "epoch": 50.01, + "grad_norm": 0.0013943740632385015, + "learning_rate": 1.6141616369792435e-05, + "loss": 0.0001, + "step": 21420 + }, + { + "epoch": 50.01, + "eval_accuracy": 0.7364864864864865, + "eval_loss": 1.8462260961532593, + "eval_runtime": 40.7016, + "eval_samples_per_second": 21.817, + "eval_steps_per_second": 1.818, + "step": 21420 + }, + { + "epoch": 51.0, + "grad_norm": 0.0011855022748932242, + "learning_rate": 1.6123215074341232e-05, + "loss": 0.0004, + "step": 21430 + }, + { + "epoch": 51.0, + "grad_norm": 0.006975261494517326, + "learning_rate": 1.6104813778890035e-05, + "loss": 0.0112, + "step": 21440 + }, + { + "epoch": 51.0, + "grad_norm": 0.0014384695095941424, + "learning_rate": 1.6086412483438835e-05, + "loss": 0.0006, + "step": 21450 + }, + { + "epoch": 51.0, + "grad_norm": 0.0062417034059762955, + "learning_rate": 1.6068011187987636e-05, + "loss": 0.0546, + "step": 21460 + }, + { + "epoch": 51.0, + "grad_norm": 0.0007521304069086909, + "learning_rate": 1.6049609892536436e-05, + "loss": 0.0002, + "step": 21470 + }, + { + "epoch": 51.0, + "grad_norm": 0.0006712984177283943, + "learning_rate": 1.6031208597085236e-05, + "loss": 0.0072, + "step": 21480 + }, + { + "epoch": 51.0, + "grad_norm": 0.00106193742249161, + "learning_rate": 1.6012807301634036e-05, + "loss": 0.0147, + "step": 21490 + }, + { + "epoch": 51.0, + "grad_norm": 0.0004047717957291752, + "learning_rate": 1.5994406006182836e-05, + "loss": 0.0009, + "step": 21500 + }, + { + "epoch": 51.0, + "grad_norm": 0.0011434787884354591, + "learning_rate": 1.5976004710731636e-05, + "loss": 0.0709, + "step": 21510 + }, + { + "epoch": 51.0, + "grad_norm": 16.264480590820312, + "learning_rate": 1.5957603415280436e-05, + "loss": 0.023, + "step": 21520 + }, + { + "epoch": 51.0, + "grad_norm": 0.0010457668686285615, + "learning_rate": 1.5939202119829236e-05, + "loss": 0.0001, + "step": 21530 + }, + { + "epoch": 51.0, + "grad_norm": 0.0018191535491496325, + "learning_rate": 1.5920800824378036e-05, + "loss": 0.0001, + "step": 21540 + }, + { + "epoch": 51.0, + "grad_norm": 0.0005493704811669886, + "learning_rate": 1.5902399528926837e-05, + "loss": 0.0001, + "step": 21550 + }, + { + "epoch": 51.0, + "grad_norm": 0.000556766171939671, + "learning_rate": 1.588399823347564e-05, + "loss": 0.0061, + "step": 21560 + }, + { + "epoch": 51.0, + "grad_norm": 2.683346748352051, + "learning_rate": 1.5865596938024437e-05, + "loss": 0.0006, + "step": 21570 + }, + { + "epoch": 51.01, + "grad_norm": 0.0024154935963451862, + "learning_rate": 1.5847195642573237e-05, + "loss": 0.1307, + "step": 21580 + }, + { + "epoch": 51.01, + "grad_norm": 0.045853517949581146, + "learning_rate": 1.582879434712204e-05, + "loss": 0.0001, + "step": 21590 + }, + { + "epoch": 51.01, + "grad_norm": 0.059176698327064514, + "learning_rate": 1.5810393051670837e-05, + "loss": 0.1623, + "step": 21600 + }, + { + "epoch": 51.01, + "grad_norm": 0.010752598755061626, + "learning_rate": 1.5791991756219637e-05, + "loss": 0.0094, + "step": 21610 + }, + { + "epoch": 51.01, + "grad_norm": 0.004883680492639542, + "learning_rate": 1.577359046076844e-05, + "loss": 0.0579, + "step": 21620 + }, + { + "epoch": 51.01, + "grad_norm": 0.004795772023499012, + "learning_rate": 1.5755189165317237e-05, + "loss": 0.0003, + "step": 21630 + }, + { + "epoch": 51.01, + "grad_norm": 0.0015640510246157646, + "learning_rate": 1.573678786986604e-05, + "loss": 0.0314, + "step": 21640 + }, + { + "epoch": 51.01, + "grad_norm": 0.005454888101667166, + "learning_rate": 1.571838657441484e-05, + "loss": 0.0003, + "step": 21650 + }, + { + "epoch": 51.01, + "grad_norm": 0.0020785073284059763, + "learning_rate": 1.5699985278963638e-05, + "loss": 0.0319, + "step": 21660 + }, + { + "epoch": 51.01, + "grad_norm": 0.0016603749245405197, + "learning_rate": 1.568158398351244e-05, + "loss": 0.0004, + "step": 21670 + }, + { + "epoch": 51.01, + "grad_norm": 0.0012977832229807973, + "learning_rate": 1.566318268806124e-05, + "loss": 0.0071, + "step": 21680 + }, + { + "epoch": 51.01, + "grad_norm": 0.0006946497596800327, + "learning_rate": 1.564478139261004e-05, + "loss": 0.0001, + "step": 21690 + }, + { + "epoch": 51.01, + "grad_norm": 0.013320227153599262, + "learning_rate": 1.562638009715884e-05, + "loss": 0.0002, + "step": 21700 + }, + { + "epoch": 51.01, + "grad_norm": 0.002938096411526203, + "learning_rate": 1.560797880170764e-05, + "loss": 0.0001, + "step": 21710 + }, + { + "epoch": 51.01, + "grad_norm": 0.004965800791978836, + "learning_rate": 1.5589577506256442e-05, + "loss": 0.0, + "step": 21720 + }, + { + "epoch": 51.01, + "grad_norm": 0.0008785520331002772, + "learning_rate": 1.5571176210805242e-05, + "loss": 0.0179, + "step": 21730 + }, + { + "epoch": 51.01, + "grad_norm": 0.002170178573578596, + "learning_rate": 1.555277491535404e-05, + "loss": 0.0565, + "step": 21740 + }, + { + "epoch": 51.01, + "grad_norm": 0.008555195294320583, + "learning_rate": 1.5534373619902842e-05, + "loss": 0.0, + "step": 21750 + }, + { + "epoch": 51.01, + "grad_norm": 0.0019417338771745563, + "learning_rate": 1.5515972324451642e-05, + "loss": 0.0001, + "step": 21760 + }, + { + "epoch": 51.01, + "grad_norm": 0.0005679653841070831, + "learning_rate": 1.5497571029000442e-05, + "loss": 0.0, + "step": 21770 + }, + { + "epoch": 51.01, + "grad_norm": 0.0006499973824247718, + "learning_rate": 1.5479169733549243e-05, + "loss": 0.0099, + "step": 21780 + }, + { + "epoch": 51.01, + "grad_norm": 0.9170955419540405, + "learning_rate": 1.5460768438098043e-05, + "loss": 0.0242, + "step": 21790 + }, + { + "epoch": 51.01, + "grad_norm": 0.0007745533948764205, + "learning_rate": 1.5442367142646843e-05, + "loss": 0.0111, + "step": 21800 + }, + { + "epoch": 51.01, + "grad_norm": 0.0008127467590384185, + "learning_rate": 1.5423965847195643e-05, + "loss": 0.0, + "step": 21810 + }, + { + "epoch": 51.01, + "grad_norm": 0.00048087682807818055, + "learning_rate": 1.5405564551744443e-05, + "loss": 0.0006, + "step": 21820 + }, + { + "epoch": 51.01, + "grad_norm": 0.0003872001834679395, + "learning_rate": 1.5387163256293243e-05, + "loss": 0.0, + "step": 21830 + }, + { + "epoch": 51.01, + "grad_norm": 0.0020749419927597046, + "learning_rate": 1.5368761960842043e-05, + "loss": 0.0319, + "step": 21840 + }, + { + "epoch": 51.01, + "eval_accuracy": 0.7286036036036037, + "eval_loss": 1.9071617126464844, + "eval_runtime": 40.8643, + "eval_samples_per_second": 21.73, + "eval_steps_per_second": 1.811, + "step": 21840 + }, + { + "epoch": 52.0, + "grad_norm": 0.0010090394644066691, + "learning_rate": 1.5350360665390843e-05, + "loss": 0.0001, + "step": 21850 + }, + { + "epoch": 52.0, + "grad_norm": 0.0006268495344556868, + "learning_rate": 1.5331959369939644e-05, + "loss": 0.0001, + "step": 21860 + }, + { + "epoch": 52.0, + "grad_norm": 0.0034793647937476635, + "learning_rate": 1.5313558074488447e-05, + "loss": 0.0001, + "step": 21870 + }, + { + "epoch": 52.0, + "grad_norm": 0.003257141914218664, + "learning_rate": 1.5295156779037244e-05, + "loss": 0.0026, + "step": 21880 + }, + { + "epoch": 52.0, + "grad_norm": 0.0005939814727753401, + "learning_rate": 1.5276755483586044e-05, + "loss": 0.0513, + "step": 21890 + }, + { + "epoch": 52.0, + "grad_norm": 0.0034212921746075153, + "learning_rate": 1.5258354188134846e-05, + "loss": 0.0003, + "step": 21900 + }, + { + "epoch": 52.0, + "grad_norm": 0.00782698392868042, + "learning_rate": 1.5239952892683646e-05, + "loss": 0.0001, + "step": 21910 + }, + { + "epoch": 52.0, + "grad_norm": 0.0007986134150996804, + "learning_rate": 1.5221551597232444e-05, + "loss": 0.0001, + "step": 21920 + }, + { + "epoch": 52.0, + "grad_norm": 0.0010540427174419165, + "learning_rate": 1.5203150301781246e-05, + "loss": 0.0002, + "step": 21930 + }, + { + "epoch": 52.0, + "grad_norm": 0.00037895364221185446, + "learning_rate": 1.5184749006330046e-05, + "loss": 0.0, + "step": 21940 + }, + { + "epoch": 52.0, + "grad_norm": 0.0050917621701955795, + "learning_rate": 1.5166347710878848e-05, + "loss": 0.0002, + "step": 21950 + }, + { + "epoch": 52.0, + "grad_norm": 0.017639679834246635, + "learning_rate": 1.5147946415427646e-05, + "loss": 0.0003, + "step": 21960 + }, + { + "epoch": 52.0, + "grad_norm": 0.00042161368764936924, + "learning_rate": 1.5129545119976446e-05, + "loss": 0.0001, + "step": 21970 + }, + { + "epoch": 52.0, + "grad_norm": 0.0010595995699986815, + "learning_rate": 1.5111143824525248e-05, + "loss": 0.0171, + "step": 21980 + }, + { + "epoch": 52.0, + "grad_norm": 0.001517163822427392, + "learning_rate": 1.5092742529074047e-05, + "loss": 0.0002, + "step": 21990 + }, + { + "epoch": 52.01, + "grad_norm": 0.00044204984442330897, + "learning_rate": 1.5074341233622849e-05, + "loss": 0.0649, + "step": 22000 + }, + { + "epoch": 52.01, + "grad_norm": 0.022290529683232307, + "learning_rate": 1.5055939938171649e-05, + "loss": 0.0001, + "step": 22010 + }, + { + "epoch": 52.01, + "grad_norm": 0.0006498958100564778, + "learning_rate": 1.5037538642720447e-05, + "loss": 0.0009, + "step": 22020 + }, + { + "epoch": 52.01, + "grad_norm": 0.0012360989348962903, + "learning_rate": 1.5019137347269249e-05, + "loss": 0.0129, + "step": 22030 + }, + { + "epoch": 52.01, + "grad_norm": 0.0009521116153337061, + "learning_rate": 1.5000736051818049e-05, + "loss": 0.0397, + "step": 22040 + }, + { + "epoch": 52.01, + "grad_norm": 0.0004721345321740955, + "learning_rate": 1.4982334756366847e-05, + "loss": 0.0001, + "step": 22050 + }, + { + "epoch": 52.01, + "grad_norm": 0.0019844111520797014, + "learning_rate": 1.496393346091565e-05, + "loss": 0.0, + "step": 22060 + }, + { + "epoch": 52.01, + "grad_norm": 0.06269257515668869, + "learning_rate": 1.494553216546445e-05, + "loss": 0.0108, + "step": 22070 + }, + { + "epoch": 52.01, + "grad_norm": 0.005493814591318369, + "learning_rate": 1.4927130870013251e-05, + "loss": 0.0689, + "step": 22080 + }, + { + "epoch": 52.01, + "grad_norm": 0.020394032821059227, + "learning_rate": 1.490872957456205e-05, + "loss": 0.0001, + "step": 22090 + }, + { + "epoch": 52.01, + "grad_norm": 0.0004452892462722957, + "learning_rate": 1.489032827911085e-05, + "loss": 0.0, + "step": 22100 + }, + { + "epoch": 52.01, + "grad_norm": 0.005485246889293194, + "learning_rate": 1.4871926983659651e-05, + "loss": 0.0, + "step": 22110 + }, + { + "epoch": 52.01, + "grad_norm": 0.000731699459720403, + "learning_rate": 1.485352568820845e-05, + "loss": 0.001, + "step": 22120 + }, + { + "epoch": 52.01, + "grad_norm": 0.0008120546699501574, + "learning_rate": 1.4835124392757252e-05, + "loss": 0.0, + "step": 22130 + }, + { + "epoch": 52.01, + "grad_norm": 0.0005848497967235744, + "learning_rate": 1.4816723097306052e-05, + "loss": 0.0003, + "step": 22140 + }, + { + "epoch": 52.01, + "grad_norm": 0.0952475443482399, + "learning_rate": 1.479832180185485e-05, + "loss": 0.0526, + "step": 22150 + }, + { + "epoch": 52.01, + "grad_norm": 0.004363304935395718, + "learning_rate": 1.4779920506403652e-05, + "loss": 0.0788, + "step": 22160 + }, + { + "epoch": 52.01, + "grad_norm": 0.0003654547908809036, + "learning_rate": 1.4761519210952452e-05, + "loss": 0.0, + "step": 22170 + }, + { + "epoch": 52.01, + "grad_norm": 0.001634811982512474, + "learning_rate": 1.4743117915501254e-05, + "loss": 0.0118, + "step": 22180 + }, + { + "epoch": 52.01, + "grad_norm": 0.00047340861055999994, + "learning_rate": 1.4724716620050052e-05, + "loss": 0.016, + "step": 22190 + }, + { + "epoch": 52.01, + "grad_norm": 0.0008979640551842749, + "learning_rate": 1.470631532459885e-05, + "loss": 0.0001, + "step": 22200 + }, + { + "epoch": 52.01, + "grad_norm": 0.0005534207448363304, + "learning_rate": 1.4687914029147654e-05, + "loss": 0.0004, + "step": 22210 + }, + { + "epoch": 52.01, + "grad_norm": 0.0010612070327624679, + "learning_rate": 1.4669512733696453e-05, + "loss": 0.001, + "step": 22220 + }, + { + "epoch": 52.01, + "grad_norm": 0.0008421811508014798, + "learning_rate": 1.4651111438245251e-05, + "loss": 0.0149, + "step": 22230 + }, + { + "epoch": 52.01, + "grad_norm": 0.0024608599487692118, + "learning_rate": 1.4632710142794053e-05, + "loss": 0.0302, + "step": 22240 + }, + { + "epoch": 52.01, + "grad_norm": 0.0005315292510204017, + "learning_rate": 1.4614308847342853e-05, + "loss": 0.0282, + "step": 22250 + }, + { + "epoch": 52.01, + "grad_norm": 0.0015135396970435977, + "learning_rate": 1.4595907551891655e-05, + "loss": 0.065, + "step": 22260 + }, + { + "epoch": 52.01, + "eval_accuracy": 0.7556306306306306, + "eval_loss": 1.6631345748901367, + "eval_runtime": 41.1878, + "eval_samples_per_second": 21.56, + "eval_steps_per_second": 1.797, + "step": 22260 + }, + { + "epoch": 53.0, + "grad_norm": 0.001163458451628685, + "learning_rate": 1.4577506256440453e-05, + "loss": 0.0001, + "step": 22270 + }, + { + "epoch": 53.0, + "grad_norm": 0.0030114694964140654, + "learning_rate": 1.4559104960989253e-05, + "loss": 0.0023, + "step": 22280 + }, + { + "epoch": 53.0, + "grad_norm": 0.007465164177119732, + "learning_rate": 1.4540703665538055e-05, + "loss": 0.0678, + "step": 22290 + }, + { + "epoch": 53.0, + "grad_norm": 0.001718674087896943, + "learning_rate": 1.4522302370086854e-05, + "loss": 0.0006, + "step": 22300 + }, + { + "epoch": 53.0, + "grad_norm": 0.08228703588247299, + "learning_rate": 1.4503901074635655e-05, + "loss": 0.0093, + "step": 22310 + }, + { + "epoch": 53.0, + "grad_norm": 0.00858758483082056, + "learning_rate": 1.4485499779184456e-05, + "loss": 0.0002, + "step": 22320 + }, + { + "epoch": 53.0, + "grad_norm": 36.46480178833008, + "learning_rate": 1.4467098483733254e-05, + "loss": 0.0389, + "step": 22330 + }, + { + "epoch": 53.0, + "grad_norm": 0.0019482868956401944, + "learning_rate": 1.4448697188282056e-05, + "loss": 0.0009, + "step": 22340 + }, + { + "epoch": 53.0, + "grad_norm": 0.020813103765249252, + "learning_rate": 1.4430295892830856e-05, + "loss": 0.0001, + "step": 22350 + }, + { + "epoch": 53.0, + "grad_norm": 0.09965714067220688, + "learning_rate": 1.4411894597379654e-05, + "loss": 0.051, + "step": 22360 + }, + { + "epoch": 53.0, + "grad_norm": 0.0007514178869314492, + "learning_rate": 1.4393493301928456e-05, + "loss": 0.0001, + "step": 22370 + }, + { + "epoch": 53.0, + "grad_norm": 0.0009033564128912985, + "learning_rate": 1.4375092006477256e-05, + "loss": 0.0484, + "step": 22380 + }, + { + "epoch": 53.0, + "grad_norm": 0.0006943594198673964, + "learning_rate": 1.4356690711026058e-05, + "loss": 0.0002, + "step": 22390 + }, + { + "epoch": 53.0, + "grad_norm": 0.003571439301595092, + "learning_rate": 1.4338289415574857e-05, + "loss": 0.0001, + "step": 22400 + }, + { + "epoch": 53.0, + "grad_norm": 0.06675244867801666, + "learning_rate": 1.4319888120123657e-05, + "loss": 0.0632, + "step": 22410 + }, + { + "epoch": 53.01, + "grad_norm": 13.023971557617188, + "learning_rate": 1.4301486824672458e-05, + "loss": 0.0606, + "step": 22420 + }, + { + "epoch": 53.01, + "grad_norm": 0.3222871720790863, + "learning_rate": 1.4283085529221257e-05, + "loss": 0.1178, + "step": 22430 + }, + { + "epoch": 53.01, + "grad_norm": 0.0009562097257003188, + "learning_rate": 1.4264684233770059e-05, + "loss": 0.0001, + "step": 22440 + }, + { + "epoch": 53.01, + "grad_norm": 0.06681627035140991, + "learning_rate": 1.4246282938318859e-05, + "loss": 0.0311, + "step": 22450 + }, + { + "epoch": 53.01, + "grad_norm": 0.017502538859844208, + "learning_rate": 1.4227881642867657e-05, + "loss": 0.006, + "step": 22460 + }, + { + "epoch": 53.01, + "grad_norm": 0.0009717533830553293, + "learning_rate": 1.4209480347416459e-05, + "loss": 0.0001, + "step": 22470 + }, + { + "epoch": 53.01, + "grad_norm": 0.0013100715586915612, + "learning_rate": 1.4191079051965259e-05, + "loss": 0.0001, + "step": 22480 + }, + { + "epoch": 53.01, + "grad_norm": 0.08804736286401749, + "learning_rate": 1.4172677756514061e-05, + "loss": 0.0001, + "step": 22490 + }, + { + "epoch": 53.01, + "grad_norm": 0.06496791541576385, + "learning_rate": 1.415427646106286e-05, + "loss": 0.0004, + "step": 22500 + }, + { + "epoch": 53.01, + "grad_norm": 0.008714644238352776, + "learning_rate": 1.413587516561166e-05, + "loss": 0.0585, + "step": 22510 + }, + { + "epoch": 53.01, + "grad_norm": 0.0048969946801662445, + "learning_rate": 1.4117473870160461e-05, + "loss": 0.0001, + "step": 22520 + }, + { + "epoch": 53.01, + "grad_norm": 0.006837640888988972, + "learning_rate": 1.409907257470926e-05, + "loss": 0.0003, + "step": 22530 + }, + { + "epoch": 53.01, + "grad_norm": 0.004497607238590717, + "learning_rate": 1.408067127925806e-05, + "loss": 0.0001, + "step": 22540 + }, + { + "epoch": 53.01, + "grad_norm": 0.0015444208402186632, + "learning_rate": 1.4062269983806862e-05, + "loss": 0.0004, + "step": 22550 + }, + { + "epoch": 53.01, + "grad_norm": 0.002304267603904009, + "learning_rate": 1.404386868835566e-05, + "loss": 0.0095, + "step": 22560 + }, + { + "epoch": 53.01, + "grad_norm": 0.0013628543820232153, + "learning_rate": 1.4025467392904462e-05, + "loss": 0.0001, + "step": 22570 + }, + { + "epoch": 53.01, + "grad_norm": 0.0009183231741189957, + "learning_rate": 1.4007066097453262e-05, + "loss": 0.0001, + "step": 22580 + }, + { + "epoch": 53.01, + "grad_norm": 0.0007543744286522269, + "learning_rate": 1.398866480200206e-05, + "loss": 0.0242, + "step": 22590 + }, + { + "epoch": 53.01, + "grad_norm": 0.056899409741163254, + "learning_rate": 1.3970263506550862e-05, + "loss": 0.0001, + "step": 22600 + }, + { + "epoch": 53.01, + "grad_norm": 0.002434780355542898, + "learning_rate": 1.3951862211099662e-05, + "loss": 0.0079, + "step": 22610 + }, + { + "epoch": 53.01, + "grad_norm": 0.0021614390425384045, + "learning_rate": 1.3933460915648464e-05, + "loss": 0.0891, + "step": 22620 + }, + { + "epoch": 53.01, + "grad_norm": 0.00045263092033565044, + "learning_rate": 1.3915059620197263e-05, + "loss": 0.0678, + "step": 22630 + }, + { + "epoch": 53.01, + "grad_norm": 0.0013739608693867922, + "learning_rate": 1.3896658324746063e-05, + "loss": 0.0743, + "step": 22640 + }, + { + "epoch": 53.01, + "grad_norm": 0.002203054493293166, + "learning_rate": 1.3878257029294864e-05, + "loss": 0.0001, + "step": 22650 + }, + { + "epoch": 53.01, + "grad_norm": 1.5518015623092651, + "learning_rate": 1.3859855733843663e-05, + "loss": 0.0007, + "step": 22660 + }, + { + "epoch": 53.01, + "grad_norm": 0.17362336814403534, + "learning_rate": 1.3841454438392461e-05, + "loss": 0.043, + "step": 22670 + }, + { + "epoch": 53.01, + "grad_norm": 0.0003964619245380163, + "learning_rate": 1.3823053142941265e-05, + "loss": 0.0424, + "step": 22680 + }, + { + "epoch": 53.01, + "eval_accuracy": 0.7398648648648649, + "eval_loss": 1.9176682233810425, + "eval_runtime": 40.6837, + "eval_samples_per_second": 21.827, + "eval_steps_per_second": 1.819, + "step": 22680 + }, + { + "epoch": 54.0, + "grad_norm": 0.006115254946053028, + "learning_rate": 1.3804651847490063e-05, + "loss": 0.0733, + "step": 22690 + }, + { + "epoch": 54.0, + "grad_norm": 0.0023755570873618126, + "learning_rate": 1.3786250552038865e-05, + "loss": 0.0001, + "step": 22700 + }, + { + "epoch": 54.0, + "grad_norm": 0.003317100927233696, + "learning_rate": 1.3767849256587663e-05, + "loss": 0.001, + "step": 22710 + }, + { + "epoch": 54.0, + "grad_norm": 0.0010947121772915125, + "learning_rate": 1.3749447961136464e-05, + "loss": 0.0031, + "step": 22720 + }, + { + "epoch": 54.0, + "grad_norm": 0.0013286080211400986, + "learning_rate": 1.3731046665685265e-05, + "loss": 0.0001, + "step": 22730 + }, + { + "epoch": 54.0, + "grad_norm": 11.449026107788086, + "learning_rate": 1.3712645370234064e-05, + "loss": 0.0022, + "step": 22740 + }, + { + "epoch": 54.0, + "grad_norm": 0.0035011095460504293, + "learning_rate": 1.3694244074782867e-05, + "loss": 0.0001, + "step": 22750 + }, + { + "epoch": 54.0, + "grad_norm": 0.00158277852460742, + "learning_rate": 1.3675842779331666e-05, + "loss": 0.0001, + "step": 22760 + }, + { + "epoch": 54.0, + "grad_norm": 0.010587048716843128, + "learning_rate": 1.3657441483880464e-05, + "loss": 0.0001, + "step": 22770 + }, + { + "epoch": 54.0, + "grad_norm": 0.00178767298348248, + "learning_rate": 1.3639040188429266e-05, + "loss": 0.0008, + "step": 22780 + }, + { + "epoch": 54.0, + "grad_norm": 0.002173611195757985, + "learning_rate": 1.3620638892978066e-05, + "loss": 0.0001, + "step": 22790 + }, + { + "epoch": 54.0, + "grad_norm": 0.000830679084174335, + "learning_rate": 1.3602237597526868e-05, + "loss": 0.0161, + "step": 22800 + }, + { + "epoch": 54.0, + "grad_norm": 0.00802791677415371, + "learning_rate": 1.3583836302075666e-05, + "loss": 0.0515, + "step": 22810 + }, + { + "epoch": 54.0, + "grad_norm": 0.006544044241309166, + "learning_rate": 1.3565435006624466e-05, + "loss": 0.0001, + "step": 22820 + }, + { + "epoch": 54.0, + "grad_norm": 0.0021956958808004856, + "learning_rate": 1.3547033711173268e-05, + "loss": 0.0025, + "step": 22830 + }, + { + "epoch": 54.01, + "grad_norm": 0.004467161372303963, + "learning_rate": 1.3528632415722067e-05, + "loss": 0.0001, + "step": 22840 + }, + { + "epoch": 54.01, + "grad_norm": 0.0011982051655650139, + "learning_rate": 1.3510231120270867e-05, + "loss": 0.0354, + "step": 22850 + }, + { + "epoch": 54.01, + "grad_norm": 0.0005390816950239241, + "learning_rate": 1.3491829824819669e-05, + "loss": 0.0001, + "step": 22860 + }, + { + "epoch": 54.01, + "grad_norm": 0.0041159712709486485, + "learning_rate": 1.3473428529368467e-05, + "loss": 0.0001, + "step": 22870 + }, + { + "epoch": 54.01, + "grad_norm": 0.004346021916717291, + "learning_rate": 1.3455027233917269e-05, + "loss": 0.0265, + "step": 22880 + }, + { + "epoch": 54.01, + "grad_norm": 0.0595536045730114, + "learning_rate": 1.3436625938466069e-05, + "loss": 0.0039, + "step": 22890 + }, + { + "epoch": 54.01, + "grad_norm": 0.005771622993052006, + "learning_rate": 1.3418224643014867e-05, + "loss": 0.0001, + "step": 22900 + }, + { + "epoch": 54.01, + "grad_norm": 0.2933156490325928, + "learning_rate": 1.339982334756367e-05, + "loss": 0.0003, + "step": 22910 + }, + { + "epoch": 54.01, + "grad_norm": 0.0017350377747789025, + "learning_rate": 1.338142205211247e-05, + "loss": 0.0005, + "step": 22920 + }, + { + "epoch": 54.01, + "grad_norm": 0.0007483300869353116, + "learning_rate": 1.3363020756661271e-05, + "loss": 0.0001, + "step": 22930 + }, + { + "epoch": 54.01, + "grad_norm": 0.0008262667688541114, + "learning_rate": 1.334461946121007e-05, + "loss": 0.0001, + "step": 22940 + }, + { + "epoch": 54.01, + "grad_norm": 0.002304819645360112, + "learning_rate": 1.332621816575887e-05, + "loss": 0.0001, + "step": 22950 + }, + { + "epoch": 54.01, + "grad_norm": 0.0025526960380375385, + "learning_rate": 1.3307816870307671e-05, + "loss": 0.0565, + "step": 22960 + }, + { + "epoch": 54.01, + "grad_norm": 0.00723969517275691, + "learning_rate": 1.328941557485647e-05, + "loss": 0.0648, + "step": 22970 + }, + { + "epoch": 54.01, + "grad_norm": 0.006033417768776417, + "learning_rate": 1.327101427940527e-05, + "loss": 0.0002, + "step": 22980 + }, + { + "epoch": 54.01, + "grad_norm": 0.0007365961209870875, + "learning_rate": 1.3252612983954072e-05, + "loss": 0.0001, + "step": 22990 + }, + { + "epoch": 54.01, + "grad_norm": 0.013198223896324635, + "learning_rate": 1.323421168850287e-05, + "loss": 0.0001, + "step": 23000 + }, + { + "epoch": 54.01, + "grad_norm": 0.0013402728363871574, + "learning_rate": 1.3215810393051672e-05, + "loss": 0.0004, + "step": 23010 + }, + { + "epoch": 54.01, + "grad_norm": 0.015665873885154724, + "learning_rate": 1.3197409097600472e-05, + "loss": 0.0001, + "step": 23020 + }, + { + "epoch": 54.01, + "grad_norm": 0.0020574575755745173, + "learning_rate": 1.317900780214927e-05, + "loss": 0.0025, + "step": 23030 + }, + { + "epoch": 54.01, + "grad_norm": 0.004725297912955284, + "learning_rate": 1.3160606506698072e-05, + "loss": 0.1044, + "step": 23040 + }, + { + "epoch": 54.01, + "grad_norm": 0.0010245188605040312, + "learning_rate": 1.3142205211246873e-05, + "loss": 0.0001, + "step": 23050 + }, + { + "epoch": 54.01, + "grad_norm": 35.485294342041016, + "learning_rate": 1.3123803915795674e-05, + "loss": 0.0717, + "step": 23060 + }, + { + "epoch": 54.01, + "grad_norm": 0.0023749656975269318, + "learning_rate": 1.3105402620344473e-05, + "loss": 0.0001, + "step": 23070 + }, + { + "epoch": 54.01, + "grad_norm": 0.028252527117729187, + "learning_rate": 1.3087001324893273e-05, + "loss": 0.0001, + "step": 23080 + }, + { + "epoch": 54.01, + "grad_norm": 0.0029571724589914083, + "learning_rate": 1.3068600029442075e-05, + "loss": 0.0007, + "step": 23090 + }, + { + "epoch": 54.01, + "grad_norm": 0.0011734621366485953, + "learning_rate": 1.3050198733990873e-05, + "loss": 0.0001, + "step": 23100 + }, + { + "epoch": 54.01, + "eval_accuracy": 0.7364864864864865, + "eval_loss": 1.8990436792373657, + "eval_runtime": 39.9552, + "eval_samples_per_second": 22.225, + "eval_steps_per_second": 1.852, + "step": 23100 + }, + { + "epoch": 55.0, + "grad_norm": 0.0017267257208004594, + "learning_rate": 1.3031797438539675e-05, + "loss": 0.0001, + "step": 23110 + }, + { + "epoch": 55.0, + "grad_norm": 0.004545911680907011, + "learning_rate": 1.3013396143088475e-05, + "loss": 0.0342, + "step": 23120 + }, + { + "epoch": 55.0, + "grad_norm": 0.02509649097919464, + "learning_rate": 1.2994994847637273e-05, + "loss": 0.0001, + "step": 23130 + }, + { + "epoch": 55.0, + "grad_norm": 5.717916011810303, + "learning_rate": 1.2976593552186075e-05, + "loss": 0.0023, + "step": 23140 + }, + { + "epoch": 55.0, + "grad_norm": 0.005418738350272179, + "learning_rate": 1.2958192256734875e-05, + "loss": 0.0001, + "step": 23150 + }, + { + "epoch": 55.0, + "grad_norm": 0.0012522684410214424, + "learning_rate": 1.2939790961283674e-05, + "loss": 0.0001, + "step": 23160 + }, + { + "epoch": 55.0, + "grad_norm": 0.003435043152421713, + "learning_rate": 1.2921389665832476e-05, + "loss": 0.015, + "step": 23170 + }, + { + "epoch": 55.0, + "grad_norm": 0.0026250167284160852, + "learning_rate": 1.2902988370381274e-05, + "loss": 0.0016, + "step": 23180 + }, + { + "epoch": 55.0, + "grad_norm": 0.00528159411624074, + "learning_rate": 1.2884587074930078e-05, + "loss": 0.0001, + "step": 23190 + }, + { + "epoch": 55.0, + "grad_norm": 0.002424650127068162, + "learning_rate": 1.2866185779478876e-05, + "loss": 0.0, + "step": 23200 + }, + { + "epoch": 55.0, + "grad_norm": 0.0007755476981401443, + "learning_rate": 1.2847784484027674e-05, + "loss": 0.0, + "step": 23210 + }, + { + "epoch": 55.0, + "grad_norm": 0.0006827607867307961, + "learning_rate": 1.2829383188576478e-05, + "loss": 0.0001, + "step": 23220 + }, + { + "epoch": 55.0, + "grad_norm": 0.0006369174807332456, + "learning_rate": 1.2810981893125276e-05, + "loss": 0.0001, + "step": 23230 + }, + { + "epoch": 55.0, + "grad_norm": 0.0015232323203235865, + "learning_rate": 1.2792580597674078e-05, + "loss": 0.0001, + "step": 23240 + }, + { + "epoch": 55.0, + "grad_norm": 0.0009344415739178658, + "learning_rate": 1.2774179302222877e-05, + "loss": 0.0, + "step": 23250 + }, + { + "epoch": 55.01, + "grad_norm": 0.0012839095434173942, + "learning_rate": 1.2755778006771677e-05, + "loss": 0.0001, + "step": 23260 + }, + { + "epoch": 55.01, + "grad_norm": 0.0006341927219182253, + "learning_rate": 1.2737376711320478e-05, + "loss": 0.0423, + "step": 23270 + }, + { + "epoch": 55.01, + "grad_norm": 0.008475244976580143, + "learning_rate": 1.2718975415869277e-05, + "loss": 0.0001, + "step": 23280 + }, + { + "epoch": 55.01, + "grad_norm": 0.0011694369604811072, + "learning_rate": 1.2700574120418077e-05, + "loss": 0.0558, + "step": 23290 + }, + { + "epoch": 55.01, + "grad_norm": 1.4542289972305298, + "learning_rate": 1.2682172824966879e-05, + "loss": 0.0043, + "step": 23300 + }, + { + "epoch": 55.01, + "grad_norm": 0.0018425858579576015, + "learning_rate": 1.2663771529515677e-05, + "loss": 0.0001, + "step": 23310 + }, + { + "epoch": 55.01, + "grad_norm": 0.0018493493553251028, + "learning_rate": 1.2645370234064479e-05, + "loss": 0.0001, + "step": 23320 + }, + { + "epoch": 55.01, + "grad_norm": 0.0013431626139208674, + "learning_rate": 1.2626968938613279e-05, + "loss": 0.0002, + "step": 23330 + }, + { + "epoch": 55.01, + "grad_norm": 0.0005847708671353757, + "learning_rate": 1.2608567643162078e-05, + "loss": 0.0001, + "step": 23340 + }, + { + "epoch": 55.01, + "grad_norm": 0.0038386487867683172, + "learning_rate": 1.259016634771088e-05, + "loss": 0.0, + "step": 23350 + }, + { + "epoch": 55.01, + "grad_norm": 0.0013865749351680279, + "learning_rate": 1.257176505225968e-05, + "loss": 0.0, + "step": 23360 + }, + { + "epoch": 55.01, + "grad_norm": 0.0010828847298398614, + "learning_rate": 1.2553363756808481e-05, + "loss": 0.0661, + "step": 23370 + }, + { + "epoch": 55.01, + "grad_norm": 0.0009535017306916416, + "learning_rate": 1.253496246135728e-05, + "loss": 0.0006, + "step": 23380 + }, + { + "epoch": 55.01, + "grad_norm": 0.0013523722300305963, + "learning_rate": 1.251656116590608e-05, + "loss": 0.0, + "step": 23390 + }, + { + "epoch": 55.01, + "grad_norm": 0.0022847868967801332, + "learning_rate": 1.249815987045488e-05, + "loss": 0.0, + "step": 23400 + }, + { + "epoch": 55.01, + "grad_norm": 0.0003955428546760231, + "learning_rate": 1.247975857500368e-05, + "loss": 0.0, + "step": 23410 + }, + { + "epoch": 55.01, + "grad_norm": 0.0007086016703397036, + "learning_rate": 1.2461357279552482e-05, + "loss": 0.0, + "step": 23420 + }, + { + "epoch": 55.01, + "grad_norm": 0.0007778684375807643, + "learning_rate": 1.2442955984101282e-05, + "loss": 0.0001, + "step": 23430 + }, + { + "epoch": 55.01, + "grad_norm": 0.0017607809277251363, + "learning_rate": 1.2424554688650082e-05, + "loss": 0.0001, + "step": 23440 + }, + { + "epoch": 55.01, + "grad_norm": 0.1019691601395607, + "learning_rate": 1.240615339319888e-05, + "loss": 0.0226, + "step": 23450 + }, + { + "epoch": 55.01, + "grad_norm": 0.0013777402928099036, + "learning_rate": 1.2387752097747682e-05, + "loss": 0.0, + "step": 23460 + }, + { + "epoch": 55.01, + "grad_norm": 0.0017346058739349246, + "learning_rate": 1.2369350802296482e-05, + "loss": 0.0102, + "step": 23470 + }, + { + "epoch": 55.01, + "grad_norm": 0.0004230730119161308, + "learning_rate": 1.2350949506845283e-05, + "loss": 0.0, + "step": 23480 + }, + { + "epoch": 55.01, + "grad_norm": 0.00860625971108675, + "learning_rate": 1.2332548211394083e-05, + "loss": 0.0013, + "step": 23490 + }, + { + "epoch": 55.01, + "grad_norm": 0.0013665214646607637, + "learning_rate": 1.2314146915942883e-05, + "loss": 0.0077, + "step": 23500 + }, + { + "epoch": 55.01, + "grad_norm": 0.0007249111076816916, + "learning_rate": 1.2295745620491683e-05, + "loss": 0.0002, + "step": 23510 + }, + { + "epoch": 55.01, + "grad_norm": 0.00032015485339798033, + "learning_rate": 1.2277344325040483e-05, + "loss": 0.0, + "step": 23520 + }, + { + "epoch": 55.01, + "eval_accuracy": 0.7454954954954955, + "eval_loss": 2.0621790885925293, + "eval_runtime": 39.7983, + "eval_samples_per_second": 22.312, + "eval_steps_per_second": 1.859, + "step": 23520 + }, + { + "epoch": 56.0, + "grad_norm": 0.0004462806973606348, + "learning_rate": 1.2258943029589285e-05, + "loss": 0.0016, + "step": 23530 + }, + { + "epoch": 56.0, + "grad_norm": 3.490710735321045, + "learning_rate": 1.2240541734138083e-05, + "loss": 0.0054, + "step": 23540 + }, + { + "epoch": 56.0, + "grad_norm": 0.0019791782833635807, + "learning_rate": 1.2222140438686883e-05, + "loss": 0.1066, + "step": 23550 + }, + { + "epoch": 56.0, + "grad_norm": 0.010116740129888058, + "learning_rate": 1.2203739143235685e-05, + "loss": 0.0, + "step": 23560 + }, + { + "epoch": 56.0, + "grad_norm": 0.0034511485137045383, + "learning_rate": 1.2185337847784485e-05, + "loss": 0.0488, + "step": 23570 + }, + { + "epoch": 56.0, + "grad_norm": 0.01684543490409851, + "learning_rate": 1.2166936552333284e-05, + "loss": 0.0001, + "step": 23580 + }, + { + "epoch": 56.0, + "grad_norm": 0.0020218139979988337, + "learning_rate": 1.2148535256882086e-05, + "loss": 0.0001, + "step": 23590 + }, + { + "epoch": 56.0, + "grad_norm": 0.0003876470436807722, + "learning_rate": 1.2130133961430886e-05, + "loss": 0.0001, + "step": 23600 + }, + { + "epoch": 56.0, + "grad_norm": 0.0014865519478917122, + "learning_rate": 1.2111732665979686e-05, + "loss": 0.0009, + "step": 23610 + }, + { + "epoch": 56.0, + "grad_norm": 0.0008013385813683271, + "learning_rate": 1.2093331370528486e-05, + "loss": 0.0, + "step": 23620 + }, + { + "epoch": 56.0, + "grad_norm": 0.0025243335403501987, + "learning_rate": 1.2074930075077286e-05, + "loss": 0.0714, + "step": 23630 + }, + { + "epoch": 56.0, + "grad_norm": 0.001431646873243153, + "learning_rate": 1.2056528779626086e-05, + "loss": 0.0071, + "step": 23640 + }, + { + "epoch": 56.0, + "grad_norm": 0.0007202305714599788, + "learning_rate": 1.2038127484174886e-05, + "loss": 0.0287, + "step": 23650 + }, + { + "epoch": 56.0, + "grad_norm": 0.0022084242664277554, + "learning_rate": 1.2019726188723688e-05, + "loss": 0.0, + "step": 23660 + }, + { + "epoch": 56.0, + "grad_norm": 0.0018906533950939775, + "learning_rate": 1.2001324893272486e-05, + "loss": 0.0, + "step": 23670 + }, + { + "epoch": 56.01, + "grad_norm": 0.012800070457160473, + "learning_rate": 1.1982923597821287e-05, + "loss": 0.0572, + "step": 23680 + }, + { + "epoch": 56.01, + "grad_norm": 5.582653045654297, + "learning_rate": 1.1964522302370088e-05, + "loss": 0.0847, + "step": 23690 + }, + { + "epoch": 56.01, + "grad_norm": 0.0026636242400854826, + "learning_rate": 1.1946121006918888e-05, + "loss": 0.0001, + "step": 23700 + }, + { + "epoch": 56.01, + "grad_norm": 0.0005984007730148733, + "learning_rate": 1.1927719711467687e-05, + "loss": 0.0399, + "step": 23710 + }, + { + "epoch": 56.01, + "grad_norm": 0.0014047266449779272, + "learning_rate": 1.1909318416016487e-05, + "loss": 0.0411, + "step": 23720 + }, + { + "epoch": 56.01, + "grad_norm": 0.00455590570345521, + "learning_rate": 1.1890917120565289e-05, + "loss": 0.0003, + "step": 23730 + }, + { + "epoch": 56.01, + "grad_norm": 0.004561448935419321, + "learning_rate": 1.1872515825114089e-05, + "loss": 0.0682, + "step": 23740 + }, + { + "epoch": 56.01, + "grad_norm": 0.004140935372561216, + "learning_rate": 1.1854114529662889e-05, + "loss": 0.0001, + "step": 23750 + }, + { + "epoch": 56.01, + "grad_norm": 0.0024083659518510103, + "learning_rate": 1.183571323421169e-05, + "loss": 0.0001, + "step": 23760 + }, + { + "epoch": 56.01, + "grad_norm": 0.001977574313059449, + "learning_rate": 1.181731193876049e-05, + "loss": 0.0001, + "step": 23770 + }, + { + "epoch": 56.01, + "grad_norm": 0.004812993109226227, + "learning_rate": 1.179891064330929e-05, + "loss": 0.0003, + "step": 23780 + }, + { + "epoch": 56.01, + "grad_norm": 0.0009600510820746422, + "learning_rate": 1.178050934785809e-05, + "loss": 0.0001, + "step": 23790 + }, + { + "epoch": 56.01, + "grad_norm": 7.466368198394775, + "learning_rate": 1.176210805240689e-05, + "loss": 0.049, + "step": 23800 + }, + { + "epoch": 56.01, + "grad_norm": 0.21662557125091553, + "learning_rate": 1.174370675695569e-05, + "loss": 0.0001, + "step": 23810 + }, + { + "epoch": 56.01, + "grad_norm": 0.00443542143329978, + "learning_rate": 1.172530546150449e-05, + "loss": 0.0096, + "step": 23820 + }, + { + "epoch": 56.01, + "grad_norm": 3.03646183013916, + "learning_rate": 1.1706904166053292e-05, + "loss": 0.0139, + "step": 23830 + }, + { + "epoch": 56.01, + "grad_norm": 0.002869045827537775, + "learning_rate": 1.1688502870602092e-05, + "loss": 0.0, + "step": 23840 + }, + { + "epoch": 56.01, + "grad_norm": 0.0007343690376728773, + "learning_rate": 1.167010157515089e-05, + "loss": 0.0001, + "step": 23850 + }, + { + "epoch": 56.01, + "grad_norm": 0.13527420163154602, + "learning_rate": 1.1651700279699692e-05, + "loss": 0.0017, + "step": 23860 + }, + { + "epoch": 56.01, + "grad_norm": 0.002437218092381954, + "learning_rate": 1.1633298984248492e-05, + "loss": 0.0251, + "step": 23870 + }, + { + "epoch": 56.01, + "grad_norm": 0.013060529716312885, + "learning_rate": 1.1614897688797292e-05, + "loss": 0.0001, + "step": 23880 + }, + { + "epoch": 56.01, + "grad_norm": 0.0010747129563242197, + "learning_rate": 1.1596496393346092e-05, + "loss": 0.0001, + "step": 23890 + }, + { + "epoch": 56.01, + "grad_norm": 0.018001865595579147, + "learning_rate": 1.1578095097894892e-05, + "loss": 0.0772, + "step": 23900 + }, + { + "epoch": 56.01, + "grad_norm": 3.493405342102051, + "learning_rate": 1.1559693802443693e-05, + "loss": 0.1437, + "step": 23910 + }, + { + "epoch": 56.01, + "grad_norm": 0.05796351283788681, + "learning_rate": 1.1541292506992493e-05, + "loss": 0.0068, + "step": 23920 + }, + { + "epoch": 56.01, + "grad_norm": 45.330631256103516, + "learning_rate": 1.1522891211541295e-05, + "loss": 0.0343, + "step": 23930 + }, + { + "epoch": 56.01, + "grad_norm": 0.00885185319930315, + "learning_rate": 1.1504489916090093e-05, + "loss": 0.0582, + "step": 23940 + }, + { + "epoch": 56.01, + "eval_accuracy": 0.7443693693693694, + "eval_loss": 1.4820666313171387, + "eval_runtime": 39.7674, + "eval_samples_per_second": 22.33, + "eval_steps_per_second": 1.861, + "step": 23940 + }, + { + "epoch": 57.0, + "grad_norm": 0.0019421263132244349, + "learning_rate": 1.1486088620638893e-05, + "loss": 0.0004, + "step": 23950 + }, + { + "epoch": 57.0, + "grad_norm": 0.00982197280973196, + "learning_rate": 1.1467687325187693e-05, + "loss": 0.0493, + "step": 23960 + }, + { + "epoch": 57.0, + "grad_norm": 0.048929549753665924, + "learning_rate": 1.1449286029736495e-05, + "loss": 0.0003, + "step": 23970 + }, + { + "epoch": 57.0, + "grad_norm": 0.0012740027159452438, + "learning_rate": 1.1430884734285293e-05, + "loss": 0.0038, + "step": 23980 + }, + { + "epoch": 57.0, + "grad_norm": 0.004289202857762575, + "learning_rate": 1.1412483438834094e-05, + "loss": 0.0031, + "step": 23990 + }, + { + "epoch": 57.0, + "grad_norm": 0.0018165657529607415, + "learning_rate": 1.1394082143382895e-05, + "loss": 0.0002, + "step": 24000 + }, + { + "epoch": 57.0, + "grad_norm": 0.021423837170004845, + "learning_rate": 1.1375680847931695e-05, + "loss": 0.0066, + "step": 24010 + }, + { + "epoch": 57.0, + "grad_norm": 0.01438127364963293, + "learning_rate": 1.1357279552480494e-05, + "loss": 0.0136, + "step": 24020 + }, + { + "epoch": 57.0, + "grad_norm": 0.0032948721200227737, + "learning_rate": 1.1338878257029296e-05, + "loss": 0.0002, + "step": 24030 + }, + { + "epoch": 57.0, + "grad_norm": 22.448226928710938, + "learning_rate": 1.1320476961578096e-05, + "loss": 0.0026, + "step": 24040 + }, + { + "epoch": 57.0, + "grad_norm": 0.0013019995531067252, + "learning_rate": 1.1302075666126896e-05, + "loss": 0.0001, + "step": 24050 + }, + { + "epoch": 57.0, + "grad_norm": 0.009402398020029068, + "learning_rate": 1.1283674370675696e-05, + "loss": 0.0004, + "step": 24060 + }, + { + "epoch": 57.0, + "grad_norm": 0.005693600047379732, + "learning_rate": 1.1265273075224496e-05, + "loss": 0.0001, + "step": 24070 + }, + { + "epoch": 57.0, + "grad_norm": 0.0012536332942545414, + "learning_rate": 1.1246871779773296e-05, + "loss": 0.0002, + "step": 24080 + }, + { + "epoch": 57.0, + "grad_norm": 0.0019443167839199305, + "learning_rate": 1.1228470484322096e-05, + "loss": 0.0101, + "step": 24090 + }, + { + "epoch": 57.01, + "grad_norm": 0.006146451458334923, + "learning_rate": 1.1210069188870898e-05, + "loss": 0.0001, + "step": 24100 + }, + { + "epoch": 57.01, + "grad_norm": 31.494211196899414, + "learning_rate": 1.1191667893419697e-05, + "loss": 0.0022, + "step": 24110 + }, + { + "epoch": 57.01, + "grad_norm": 0.0124431187286973, + "learning_rate": 1.1173266597968497e-05, + "loss": 0.0002, + "step": 24120 + }, + { + "epoch": 57.01, + "grad_norm": 0.0027170858811587095, + "learning_rate": 1.1154865302517299e-05, + "loss": 0.042, + "step": 24130 + }, + { + "epoch": 57.01, + "grad_norm": 63.92512512207031, + "learning_rate": 1.1136464007066099e-05, + "loss": 0.0493, + "step": 24140 + }, + { + "epoch": 57.01, + "grad_norm": 0.0034329602494835854, + "learning_rate": 1.1118062711614899e-05, + "loss": 0.006, + "step": 24150 + }, + { + "epoch": 57.01, + "grad_norm": 0.0025639557279646397, + "learning_rate": 1.1099661416163699e-05, + "loss": 0.0001, + "step": 24160 + }, + { + "epoch": 57.01, + "grad_norm": 0.0006689762230962515, + "learning_rate": 1.1081260120712499e-05, + "loss": 0.0002, + "step": 24170 + }, + { + "epoch": 57.01, + "grad_norm": 0.0014391548465937376, + "learning_rate": 1.1062858825261299e-05, + "loss": 0.0021, + "step": 24180 + }, + { + "epoch": 57.01, + "grad_norm": 0.0008516352972947061, + "learning_rate": 1.10444575298101e-05, + "loss": 0.0001, + "step": 24190 + }, + { + "epoch": 57.01, + "grad_norm": 0.04408324137330055, + "learning_rate": 1.10260562343589e-05, + "loss": 0.0422, + "step": 24200 + }, + { + "epoch": 57.01, + "grad_norm": 0.008414952084422112, + "learning_rate": 1.10076549389077e-05, + "loss": 0.0002, + "step": 24210 + }, + { + "epoch": 57.01, + "grad_norm": 0.002066493732854724, + "learning_rate": 1.09892536434565e-05, + "loss": 0.0007, + "step": 24220 + }, + { + "epoch": 57.01, + "grad_norm": 0.12472962588071823, + "learning_rate": 1.09708523480053e-05, + "loss": 0.0184, + "step": 24230 + }, + { + "epoch": 57.01, + "grad_norm": 0.0016410744283348322, + "learning_rate": 1.09524510525541e-05, + "loss": 0.001, + "step": 24240 + }, + { + "epoch": 57.01, + "grad_norm": 0.0014384149108082056, + "learning_rate": 1.09340497571029e-05, + "loss": 0.0001, + "step": 24250 + }, + { + "epoch": 57.01, + "grad_norm": 0.018821023404598236, + "learning_rate": 1.09156484616517e-05, + "loss": 0.0551, + "step": 24260 + }, + { + "epoch": 57.01, + "grad_norm": 0.0015349116874858737, + "learning_rate": 1.0897247166200502e-05, + "loss": 0.0116, + "step": 24270 + }, + { + "epoch": 57.01, + "grad_norm": 0.009763521142303944, + "learning_rate": 1.0878845870749302e-05, + "loss": 0.0647, + "step": 24280 + }, + { + "epoch": 57.01, + "grad_norm": 0.0007470657583326101, + "learning_rate": 1.08604445752981e-05, + "loss": 0.0055, + "step": 24290 + }, + { + "epoch": 57.01, + "grad_norm": 0.0014731376431882381, + "learning_rate": 1.0842043279846902e-05, + "loss": 0.0262, + "step": 24300 + }, + { + "epoch": 57.01, + "grad_norm": 0.0022547373082488775, + "learning_rate": 1.0823641984395702e-05, + "loss": 0.001, + "step": 24310 + }, + { + "epoch": 57.01, + "grad_norm": 0.0013335180701687932, + "learning_rate": 1.0805240688944502e-05, + "loss": 0.0001, + "step": 24320 + }, + { + "epoch": 57.01, + "grad_norm": 0.0014102818677201867, + "learning_rate": 1.0786839393493303e-05, + "loss": 0.0001, + "step": 24330 + }, + { + "epoch": 57.01, + "grad_norm": 0.002224271185696125, + "learning_rate": 1.0768438098042103e-05, + "loss": 0.0001, + "step": 24340 + }, + { + "epoch": 57.01, + "grad_norm": 0.0031190093141049147, + "learning_rate": 1.0750036802590903e-05, + "loss": 0.0001, + "step": 24350 + }, + { + "epoch": 57.01, + "grad_norm": 0.0007708192570134997, + "learning_rate": 1.0731635507139703e-05, + "loss": 0.0001, + "step": 24360 + }, + { + "epoch": 57.01, + "eval_accuracy": 0.7623873873873874, + "eval_loss": 1.6253712177276611, + "eval_runtime": 39.4401, + "eval_samples_per_second": 22.515, + "eval_steps_per_second": 1.876, + "step": 24360 + }, + { + "epoch": 58.0, + "grad_norm": 0.001005275989882648, + "learning_rate": 1.0713234211688505e-05, + "loss": 0.0001, + "step": 24370 + }, + { + "epoch": 58.0, + "grad_norm": 0.0007201577536761761, + "learning_rate": 1.0694832916237303e-05, + "loss": 0.0146, + "step": 24380 + }, + { + "epoch": 58.0, + "grad_norm": 0.0415475107729435, + "learning_rate": 1.0676431620786103e-05, + "loss": 0.0001, + "step": 24390 + }, + { + "epoch": 58.0, + "grad_norm": 0.000509662670083344, + "learning_rate": 1.0658030325334905e-05, + "loss": 0.0001, + "step": 24400 + }, + { + "epoch": 58.0, + "grad_norm": 0.0016784222098067403, + "learning_rate": 1.0639629029883705e-05, + "loss": 0.0061, + "step": 24410 + }, + { + "epoch": 58.0, + "grad_norm": 0.001808426110073924, + "learning_rate": 1.0621227734432504e-05, + "loss": 0.0001, + "step": 24420 + }, + { + "epoch": 58.0, + "grad_norm": 0.00614283187314868, + "learning_rate": 1.0602826438981304e-05, + "loss": 0.0001, + "step": 24430 + }, + { + "epoch": 58.0, + "grad_norm": 0.0009809379698708653, + "learning_rate": 1.0584425143530105e-05, + "loss": 0.0002, + "step": 24440 + }, + { + "epoch": 58.0, + "grad_norm": 0.0018786874134093523, + "learning_rate": 1.0566023848078906e-05, + "loss": 0.0001, + "step": 24450 + }, + { + "epoch": 58.0, + "grad_norm": 0.0009892601519823074, + "learning_rate": 1.0547622552627706e-05, + "loss": 0.0, + "step": 24460 + }, + { + "epoch": 58.0, + "grad_norm": 0.0009283177787438035, + "learning_rate": 1.0529221257176506e-05, + "loss": 0.0002, + "step": 24470 + }, + { + "epoch": 58.0, + "grad_norm": 0.0007727140327915549, + "learning_rate": 1.0510819961725306e-05, + "loss": 0.0, + "step": 24480 + }, + { + "epoch": 58.0, + "grad_norm": 0.0018267113482579589, + "learning_rate": 1.0492418666274106e-05, + "loss": 0.0025, + "step": 24490 + }, + { + "epoch": 58.0, + "grad_norm": 0.0010749399662017822, + "learning_rate": 1.0474017370822906e-05, + "loss": 0.0435, + "step": 24500 + }, + { + "epoch": 58.0, + "grad_norm": 0.0004598453233484179, + "learning_rate": 1.0455616075371706e-05, + "loss": 0.0001, + "step": 24510 + }, + { + "epoch": 58.01, + "grad_norm": 0.0019546435214579105, + "learning_rate": 1.0437214779920506e-05, + "loss": 0.0067, + "step": 24520 + }, + { + "epoch": 58.01, + "grad_norm": 0.0007498570485040545, + "learning_rate": 1.0418813484469307e-05, + "loss": 0.0001, + "step": 24530 + }, + { + "epoch": 58.01, + "grad_norm": 0.0015781412366777658, + "learning_rate": 1.0400412189018108e-05, + "loss": 0.0277, + "step": 24540 + }, + { + "epoch": 58.01, + "grad_norm": 0.5236406922340393, + "learning_rate": 1.0382010893566907e-05, + "loss": 0.0004, + "step": 24550 + }, + { + "epoch": 58.01, + "grad_norm": 0.0012669709976762533, + "learning_rate": 1.0363609598115707e-05, + "loss": 0.0001, + "step": 24560 + }, + { + "epoch": 58.01, + "grad_norm": 0.0012568996753543615, + "learning_rate": 1.0345208302664509e-05, + "loss": 0.0004, + "step": 24570 + }, + { + "epoch": 58.01, + "grad_norm": 0.043358806520700455, + "learning_rate": 1.0326807007213309e-05, + "loss": 0.0001, + "step": 24580 + }, + { + "epoch": 58.01, + "grad_norm": 0.0007607506704516709, + "learning_rate": 1.0308405711762109e-05, + "loss": 0.0003, + "step": 24590 + }, + { + "epoch": 58.01, + "grad_norm": 0.0019133040914312005, + "learning_rate": 1.0290004416310909e-05, + "loss": 0.0002, + "step": 24600 + }, + { + "epoch": 58.01, + "grad_norm": 0.0040087285451591015, + "learning_rate": 1.0271603120859709e-05, + "loss": 0.0001, + "step": 24610 + }, + { + "epoch": 58.01, + "grad_norm": 0.005804694723337889, + "learning_rate": 1.025320182540851e-05, + "loss": 0.0, + "step": 24620 + }, + { + "epoch": 58.01, + "grad_norm": 0.003029964864253998, + "learning_rate": 1.023480052995731e-05, + "loss": 0.0473, + "step": 24630 + }, + { + "epoch": 58.01, + "grad_norm": 0.0011636598501354456, + "learning_rate": 1.021639923450611e-05, + "loss": 0.0001, + "step": 24640 + }, + { + "epoch": 58.01, + "grad_norm": 0.002884934889152646, + "learning_rate": 1.019799793905491e-05, + "loss": 0.0001, + "step": 24650 + }, + { + "epoch": 58.01, + "grad_norm": 0.11349088698625565, + "learning_rate": 1.017959664360371e-05, + "loss": 0.0125, + "step": 24660 + }, + { + "epoch": 58.01, + "grad_norm": 0.002500980393961072, + "learning_rate": 1.0161195348152512e-05, + "loss": 0.0003, + "step": 24670 + }, + { + "epoch": 58.01, + "grad_norm": 0.0008682305924594402, + "learning_rate": 1.0142794052701312e-05, + "loss": 0.0, + "step": 24680 + }, + { + "epoch": 58.01, + "grad_norm": 0.014253217726945877, + "learning_rate": 1.012439275725011e-05, + "loss": 0.0001, + "step": 24690 + }, + { + "epoch": 58.01, + "grad_norm": 0.0009429533965885639, + "learning_rate": 1.010599146179891e-05, + "loss": 0.077, + "step": 24700 + }, + { + "epoch": 58.01, + "grad_norm": 0.0008992166840471327, + "learning_rate": 1.0087590166347712e-05, + "loss": 0.007, + "step": 24710 + }, + { + "epoch": 58.01, + "grad_norm": 0.0009217716287821531, + "learning_rate": 1.0069188870896512e-05, + "loss": 0.0001, + "step": 24720 + }, + { + "epoch": 58.01, + "grad_norm": 0.003018986666575074, + "learning_rate": 1.005078757544531e-05, + "loss": 0.0001, + "step": 24730 + }, + { + "epoch": 58.01, + "grad_norm": 0.0009471423109062016, + "learning_rate": 1.0032386279994112e-05, + "loss": 0.0001, + "step": 24740 + }, + { + "epoch": 58.01, + "grad_norm": 0.0012168257962912321, + "learning_rate": 1.0013984984542912e-05, + "loss": 0.0001, + "step": 24750 + }, + { + "epoch": 58.01, + "grad_norm": 0.0008903819834813476, + "learning_rate": 9.995583689091713e-06, + "loss": 0.0001, + "step": 24760 + }, + { + "epoch": 58.01, + "grad_norm": 5.247084617614746, + "learning_rate": 9.977182393640513e-06, + "loss": 0.0009, + "step": 24770 + }, + { + "epoch": 58.01, + "grad_norm": 0.00031116019818000495, + "learning_rate": 9.958781098189313e-06, + "loss": 0.0, + "step": 24780 + }, + { + "epoch": 58.01, + "eval_accuracy": 0.7545045045045045, + "eval_loss": 1.8023662567138672, + "eval_runtime": 39.4053, + "eval_samples_per_second": 22.535, + "eval_steps_per_second": 1.878, + "step": 24780 + }, + { + "epoch": 59.0, + "grad_norm": 0.0064481995068490505, + "learning_rate": 9.940379802738113e-06, + "loss": 0.0, + "step": 24790 + }, + { + "epoch": 59.0, + "grad_norm": 0.0012003867886960506, + "learning_rate": 9.921978507286913e-06, + "loss": 0.0001, + "step": 24800 + }, + { + "epoch": 59.0, + "grad_norm": 0.000993055640719831, + "learning_rate": 9.903577211835715e-06, + "loss": 0.0, + "step": 24810 + }, + { + "epoch": 59.0, + "grad_norm": 0.0007150658057071269, + "learning_rate": 9.885175916384513e-06, + "loss": 0.0, + "step": 24820 + }, + { + "epoch": 59.0, + "grad_norm": 0.004784159362316132, + "learning_rate": 9.866774620933313e-06, + "loss": 0.0, + "step": 24830 + }, + { + "epoch": 59.0, + "grad_norm": 0.0005838919896632433, + "learning_rate": 9.848373325482115e-06, + "loss": 0.0001, + "step": 24840 + }, + { + "epoch": 59.0, + "grad_norm": 0.0009973630076274276, + "learning_rate": 9.829972030030915e-06, + "loss": 0.0, + "step": 24850 + }, + { + "epoch": 59.0, + "grad_norm": 0.000762183393817395, + "learning_rate": 9.811570734579714e-06, + "loss": 0.0001, + "step": 24860 + }, + { + "epoch": 59.0, + "grad_norm": 0.0015864388551563025, + "learning_rate": 9.793169439128516e-06, + "loss": 0.0499, + "step": 24870 + }, + { + "epoch": 59.0, + "grad_norm": 0.0018210052512586117, + "learning_rate": 9.774768143677316e-06, + "loss": 0.0, + "step": 24880 + }, + { + "epoch": 59.0, + "grad_norm": 0.012355053797364235, + "learning_rate": 9.756366848226116e-06, + "loss": 0.0058, + "step": 24890 + }, + { + "epoch": 59.0, + "grad_norm": 0.0005313998553901911, + "learning_rate": 9.737965552774916e-06, + "loss": 0.0, + "step": 24900 + }, + { + "epoch": 59.0, + "grad_norm": 0.0005420492379926145, + "learning_rate": 9.719564257323716e-06, + "loss": 0.0, + "step": 24910 + }, + { + "epoch": 59.0, + "grad_norm": 0.0017370874993503094, + "learning_rate": 9.701162961872516e-06, + "loss": 0.1459, + "step": 24920 + }, + { + "epoch": 59.0, + "grad_norm": 14.99138069152832, + "learning_rate": 9.682761666421316e-06, + "loss": 0.0016, + "step": 24930 + }, + { + "epoch": 59.01, + "grad_norm": 0.0008930095937103033, + "learning_rate": 9.664360370970118e-06, + "loss": 0.0001, + "step": 24940 + }, + { + "epoch": 59.01, + "grad_norm": 0.0012798807583749294, + "learning_rate": 9.645959075518916e-06, + "loss": 0.0001, + "step": 24950 + }, + { + "epoch": 59.01, + "grad_norm": 38.88914108276367, + "learning_rate": 9.627557780067717e-06, + "loss": 0.0334, + "step": 24960 + }, + { + "epoch": 59.01, + "grad_norm": 0.0011596691329032183, + "learning_rate": 9.609156484616517e-06, + "loss": 0.0001, + "step": 24970 + }, + { + "epoch": 59.01, + "grad_norm": 0.1758793741464615, + "learning_rate": 9.590755189165319e-06, + "loss": 0.0187, + "step": 24980 + }, + { + "epoch": 59.01, + "grad_norm": 0.003486826317384839, + "learning_rate": 9.572353893714119e-06, + "loss": 0.0001, + "step": 24990 + }, + { + "epoch": 59.01, + "grad_norm": 1.0641165971755981, + "learning_rate": 9.553952598262917e-06, + "loss": 0.0181, + "step": 25000 + }, + { + "epoch": 59.01, + "grad_norm": 0.6313084959983826, + "learning_rate": 9.535551302811719e-06, + "loss": 0.003, + "step": 25010 + }, + { + "epoch": 59.01, + "grad_norm": 0.0015551378019154072, + "learning_rate": 9.517150007360519e-06, + "loss": 0.0001, + "step": 25020 + }, + { + "epoch": 59.01, + "grad_norm": 0.0037916789297014475, + "learning_rate": 9.498748711909319e-06, + "loss": 0.0002, + "step": 25030 + }, + { + "epoch": 59.01, + "grad_norm": 10.630950927734375, + "learning_rate": 9.48034741645812e-06, + "loss": 0.1094, + "step": 25040 + }, + { + "epoch": 59.01, + "grad_norm": 0.002494827611371875, + "learning_rate": 9.46194612100692e-06, + "loss": 0.0464, + "step": 25050 + }, + { + "epoch": 59.01, + "grad_norm": 0.0022337674163281918, + "learning_rate": 9.44354482555572e-06, + "loss": 0.0025, + "step": 25060 + }, + { + "epoch": 59.01, + "grad_norm": 0.02816508710384369, + "learning_rate": 9.42514353010452e-06, + "loss": 0.0605, + "step": 25070 + }, + { + "epoch": 59.01, + "grad_norm": 0.0023118199314922094, + "learning_rate": 9.406742234653321e-06, + "loss": 0.0056, + "step": 25080 + }, + { + "epoch": 59.01, + "grad_norm": 0.17282597720623016, + "learning_rate": 9.38834093920212e-06, + "loss": 0.0029, + "step": 25090 + }, + { + "epoch": 59.01, + "grad_norm": 0.007972006686031818, + "learning_rate": 9.36993964375092e-06, + "loss": 0.0001, + "step": 25100 + }, + { + "epoch": 59.01, + "grad_norm": 0.0013798163272440434, + "learning_rate": 9.351538348299722e-06, + "loss": 0.0196, + "step": 25110 + }, + { + "epoch": 59.01, + "grad_norm": 0.0026264681946486235, + "learning_rate": 9.333137052848522e-06, + "loss": 0.0004, + "step": 25120 + }, + { + "epoch": 59.01, + "grad_norm": 0.0016933433944359422, + "learning_rate": 9.31473575739732e-06, + "loss": 0.001, + "step": 25130 + }, + { + "epoch": 59.01, + "grad_norm": 0.0603969544172287, + "learning_rate": 9.296334461946122e-06, + "loss": 0.0002, + "step": 25140 + }, + { + "epoch": 59.01, + "grad_norm": 0.001932127634063363, + "learning_rate": 9.277933166494922e-06, + "loss": 0.0362, + "step": 25150 + }, + { + "epoch": 59.01, + "grad_norm": 0.001338793197646737, + "learning_rate": 9.259531871043722e-06, + "loss": 0.0002, + "step": 25160 + }, + { + "epoch": 59.01, + "grad_norm": 0.0024657603353261948, + "learning_rate": 9.24113057559252e-06, + "loss": 0.0001, + "step": 25170 + }, + { + "epoch": 59.01, + "grad_norm": 0.001342698698863387, + "learning_rate": 9.222729280141323e-06, + "loss": 0.0001, + "step": 25180 + }, + { + "epoch": 59.01, + "grad_norm": 0.0012493436224758625, + "learning_rate": 9.204327984690123e-06, + "loss": 0.0001, + "step": 25190 + }, + { + "epoch": 59.01, + "grad_norm": 0.0005642689066007733, + "learning_rate": 9.185926689238923e-06, + "loss": 0.0486, + "step": 25200 + }, + { + "epoch": 59.01, + "eval_accuracy": 0.7545045045045045, + "eval_loss": 1.6803523302078247, + "eval_runtime": 39.5194, + "eval_samples_per_second": 22.47, + "eval_steps_per_second": 1.872, + "step": 25200 + }, + { + "epoch": 60.0, + "grad_norm": 0.002399663208052516, + "learning_rate": 9.167525393787723e-06, + "loss": 0.0001, + "step": 25210 + }, + { + "epoch": 60.0, + "grad_norm": 0.12855449318885803, + "learning_rate": 9.149124098336523e-06, + "loss": 0.0002, + "step": 25220 + }, + { + "epoch": 60.0, + "grad_norm": 0.002423466183245182, + "learning_rate": 9.130722802885323e-06, + "loss": 0.0048, + "step": 25230 + }, + { + "epoch": 60.0, + "grad_norm": 0.003666324308142066, + "learning_rate": 9.112321507434123e-06, + "loss": 0.0002, + "step": 25240 + }, + { + "epoch": 60.0, + "grad_norm": 0.015467680059373379, + "learning_rate": 9.093920211982925e-06, + "loss": 0.0002, + "step": 25250 + }, + { + "epoch": 60.0, + "grad_norm": 0.002297742525115609, + "learning_rate": 9.075518916531723e-06, + "loss": 0.0427, + "step": 25260 + }, + { + "epoch": 60.0, + "grad_norm": 0.002047237940132618, + "learning_rate": 9.057117621080524e-06, + "loss": 0.0001, + "step": 25270 + }, + { + "epoch": 60.0, + "grad_norm": 0.004319984000176191, + "learning_rate": 9.038716325629325e-06, + "loss": 0.0001, + "step": 25280 + }, + { + "epoch": 60.0, + "grad_norm": 0.01682116463780403, + "learning_rate": 9.020315030178125e-06, + "loss": 0.0001, + "step": 25290 + }, + { + "epoch": 60.0, + "grad_norm": 0.07739716023206711, + "learning_rate": 9.001913734726926e-06, + "loss": 0.0003, + "step": 25300 + }, + { + "epoch": 60.0, + "grad_norm": 0.0007034876034595072, + "learning_rate": 8.983512439275726e-06, + "loss": 0.0001, + "step": 25310 + }, + { + "epoch": 60.0, + "grad_norm": 0.0012769020395353436, + "learning_rate": 8.965111143824526e-06, + "loss": 0.0001, + "step": 25320 + }, + { + "epoch": 60.0, + "grad_norm": 0.0047972784377634525, + "learning_rate": 8.946709848373326e-06, + "loss": 0.0001, + "step": 25330 + }, + { + "epoch": 60.0, + "grad_norm": 0.005869260523468256, + "learning_rate": 8.928308552922126e-06, + "loss": 0.0442, + "step": 25340 + }, + { + "epoch": 60.0, + "grad_norm": 0.0843689814209938, + "learning_rate": 8.909907257470926e-06, + "loss": 0.0001, + "step": 25350 + }, + { + "epoch": 60.01, + "grad_norm": 0.008844790048897266, + "learning_rate": 8.891505962019726e-06, + "loss": 0.0001, + "step": 25360 + }, + { + "epoch": 60.01, + "grad_norm": 0.0014894960913807154, + "learning_rate": 8.873104666568526e-06, + "loss": 0.0001, + "step": 25370 + }, + { + "epoch": 60.01, + "grad_norm": 0.002464174758642912, + "learning_rate": 8.854703371117328e-06, + "loss": 0.0001, + "step": 25380 + }, + { + "epoch": 60.01, + "grad_norm": 0.013474004343152046, + "learning_rate": 8.836302075666128e-06, + "loss": 0.0001, + "step": 25390 + }, + { + "epoch": 60.01, + "grad_norm": 0.0005289826076477766, + "learning_rate": 8.817900780214927e-06, + "loss": 0.0001, + "step": 25400 + }, + { + "epoch": 60.01, + "grad_norm": 0.001807571155950427, + "learning_rate": 8.799499484763729e-06, + "loss": 0.0039, + "step": 25410 + }, + { + "epoch": 60.01, + "grad_norm": 0.0014281687326729298, + "learning_rate": 8.781098189312529e-06, + "loss": 0.0001, + "step": 25420 + }, + { + "epoch": 60.01, + "grad_norm": 0.001596023328602314, + "learning_rate": 8.762696893861329e-06, + "loss": 0.0001, + "step": 25430 + }, + { + "epoch": 60.01, + "grad_norm": 0.0017493361374363303, + "learning_rate": 8.744295598410127e-06, + "loss": 0.0033, + "step": 25440 + }, + { + "epoch": 60.01, + "grad_norm": 0.0007126057171262801, + "learning_rate": 8.725894302958929e-06, + "loss": 0.0041, + "step": 25450 + }, + { + "epoch": 60.01, + "grad_norm": 0.00909177865833044, + "learning_rate": 8.707493007507729e-06, + "loss": 0.0001, + "step": 25460 + }, + { + "epoch": 60.01, + "grad_norm": 10.200445175170898, + "learning_rate": 8.68909171205653e-06, + "loss": 0.0827, + "step": 25470 + }, + { + "epoch": 60.01, + "grad_norm": 0.014321324415504932, + "learning_rate": 8.67069041660533e-06, + "loss": 0.0001, + "step": 25480 + }, + { + "epoch": 60.01, + "grad_norm": 0.141360342502594, + "learning_rate": 8.65228912115413e-06, + "loss": 0.0002, + "step": 25490 + }, + { + "epoch": 60.01, + "grad_norm": 0.0010122742969542742, + "learning_rate": 8.63388782570293e-06, + "loss": 0.0002, + "step": 25500 + }, + { + "epoch": 60.01, + "grad_norm": 0.0054549286141991615, + "learning_rate": 8.61548653025173e-06, + "loss": 0.0196, + "step": 25510 + }, + { + "epoch": 60.01, + "grad_norm": 0.002311424119397998, + "learning_rate": 8.597085234800532e-06, + "loss": 0.0001, + "step": 25520 + }, + { + "epoch": 60.01, + "grad_norm": 0.0010803097393363714, + "learning_rate": 8.57868393934933e-06, + "loss": 0.0112, + "step": 25530 + }, + { + "epoch": 60.01, + "grad_norm": 0.0027651439886540174, + "learning_rate": 8.56028264389813e-06, + "loss": 0.006, + "step": 25540 + }, + { + "epoch": 60.01, + "grad_norm": 0.0009034753893502057, + "learning_rate": 8.541881348446932e-06, + "loss": 0.0001, + "step": 25550 + }, + { + "epoch": 60.01, + "grad_norm": 0.002131812274456024, + "learning_rate": 8.523480052995732e-06, + "loss": 0.0, + "step": 25560 + }, + { + "epoch": 60.01, + "grad_norm": 0.0011010438902303576, + "learning_rate": 8.50507875754453e-06, + "loss": 0.0001, + "step": 25570 + }, + { + "epoch": 60.01, + "grad_norm": 0.0020116691011935472, + "learning_rate": 8.486677462093332e-06, + "loss": 0.036, + "step": 25580 + }, + { + "epoch": 60.01, + "grad_norm": 0.008289303630590439, + "learning_rate": 8.468276166642132e-06, + "loss": 0.0001, + "step": 25590 + }, + { + "epoch": 60.01, + "grad_norm": 0.004598530940711498, + "learning_rate": 8.449874871190932e-06, + "loss": 0.001, + "step": 25600 + }, + { + "epoch": 60.01, + "grad_norm": 0.0016972459852695465, + "learning_rate": 8.431473575739733e-06, + "loss": 0.0001, + "step": 25610 + }, + { + "epoch": 60.01, + "grad_norm": 0.002091147704049945, + "learning_rate": 8.413072280288533e-06, + "loss": 0.0001, + "step": 25620 + }, + { + "epoch": 60.01, + "eval_accuracy": 0.7522522522522522, + "eval_loss": 1.7991126775741577, + "eval_runtime": 39.379, + "eval_samples_per_second": 22.55, + "eval_steps_per_second": 1.879, + "step": 25620 + }, + { + "epoch": 61.0, + "grad_norm": 0.0016870342660695314, + "learning_rate": 8.394670984837333e-06, + "loss": 0.0, + "step": 25630 + }, + { + "epoch": 61.0, + "grad_norm": 0.001305489568039775, + "learning_rate": 8.376269689386133e-06, + "loss": 0.0034, + "step": 25640 + }, + { + "epoch": 61.0, + "grad_norm": 0.0020442858804017305, + "learning_rate": 8.357868393934935e-06, + "loss": 0.0001, + "step": 25650 + }, + { + "epoch": 61.0, + "grad_norm": 0.0010033355792984366, + "learning_rate": 8.339467098483733e-06, + "loss": 0.0105, + "step": 25660 + }, + { + "epoch": 61.0, + "grad_norm": 0.015114572830498219, + "learning_rate": 8.321065803032533e-06, + "loss": 0.0212, + "step": 25670 + }, + { + "epoch": 61.0, + "grad_norm": 0.001860952703282237, + "learning_rate": 8.302664507581333e-06, + "loss": 0.0001, + "step": 25680 + }, + { + "epoch": 61.0, + "grad_norm": 0.002813951577991247, + "learning_rate": 8.284263212130135e-06, + "loss": 0.0001, + "step": 25690 + }, + { + "epoch": 61.0, + "grad_norm": 0.00690379599109292, + "learning_rate": 8.265861916678935e-06, + "loss": 0.0001, + "step": 25700 + }, + { + "epoch": 61.0, + "grad_norm": 0.0035497399512678385, + "learning_rate": 8.247460621227734e-06, + "loss": 0.0043, + "step": 25710 + }, + { + "epoch": 61.0, + "grad_norm": 0.0012824188452214003, + "learning_rate": 8.229059325776536e-06, + "loss": 0.0001, + "step": 25720 + }, + { + "epoch": 61.0, + "grad_norm": 0.0018320622621104121, + "learning_rate": 8.210658030325336e-06, + "loss": 0.0004, + "step": 25730 + }, + { + "epoch": 61.0, + "grad_norm": 0.0008166917832568288, + "learning_rate": 8.192256734874136e-06, + "loss": 0.0, + "step": 25740 + }, + { + "epoch": 61.0, + "grad_norm": 0.000992298242636025, + "learning_rate": 8.173855439422936e-06, + "loss": 0.061, + "step": 25750 + }, + { + "epoch": 61.0, + "grad_norm": 0.001933283288963139, + "learning_rate": 8.155454143971736e-06, + "loss": 0.0, + "step": 25760 + }, + { + "epoch": 61.0, + "grad_norm": 0.002809977624565363, + "learning_rate": 8.137052848520536e-06, + "loss": 0.0001, + "step": 25770 + }, + { + "epoch": 61.01, + "grad_norm": 0.0009199827327392995, + "learning_rate": 8.118651553069336e-06, + "loss": 0.0, + "step": 25780 + }, + { + "epoch": 61.01, + "grad_norm": 0.0006985805230215192, + "learning_rate": 8.100250257618136e-06, + "loss": 0.0002, + "step": 25790 + }, + { + "epoch": 61.01, + "grad_norm": 0.0004489361308515072, + "learning_rate": 8.081848962166936e-06, + "loss": 0.0001, + "step": 25800 + }, + { + "epoch": 61.01, + "grad_norm": 0.0005292571731843054, + "learning_rate": 8.063447666715737e-06, + "loss": 0.0, + "step": 25810 + }, + { + "epoch": 61.01, + "grad_norm": 0.00038617965765297413, + "learning_rate": 8.045046371264538e-06, + "loss": 0.0001, + "step": 25820 + }, + { + "epoch": 61.01, + "grad_norm": 0.0032925093546509743, + "learning_rate": 8.026645075813338e-06, + "loss": 0.0, + "step": 25830 + }, + { + "epoch": 61.01, + "grad_norm": 0.0006309591117314994, + "learning_rate": 8.008243780362137e-06, + "loss": 0.0105, + "step": 25840 + }, + { + "epoch": 61.01, + "grad_norm": 0.000899383972864598, + "learning_rate": 7.989842484910939e-06, + "loss": 0.0001, + "step": 25850 + }, + { + "epoch": 61.01, + "grad_norm": 0.013826681300997734, + "learning_rate": 7.971441189459739e-06, + "loss": 0.0159, + "step": 25860 + }, + { + "epoch": 61.01, + "grad_norm": 0.0012361772824078798, + "learning_rate": 7.953039894008539e-06, + "loss": 0.0, + "step": 25870 + }, + { + "epoch": 61.01, + "grad_norm": 0.000654926523566246, + "learning_rate": 7.934638598557337e-06, + "loss": 0.0001, + "step": 25880 + }, + { + "epoch": 61.01, + "grad_norm": 0.00048330603749491274, + "learning_rate": 7.91623730310614e-06, + "loss": 0.0, + "step": 25890 + }, + { + "epoch": 61.01, + "grad_norm": 0.0006944110500626266, + "learning_rate": 7.89783600765494e-06, + "loss": 0.0005, + "step": 25900 + }, + { + "epoch": 61.01, + "grad_norm": 0.0006543623167090118, + "learning_rate": 7.87943471220374e-06, + "loss": 0.0004, + "step": 25910 + }, + { + "epoch": 61.01, + "grad_norm": 0.03564361482858658, + "learning_rate": 7.861033416752541e-06, + "loss": 0.0001, + "step": 25920 + }, + { + "epoch": 61.01, + "grad_norm": 0.0036910008639097214, + "learning_rate": 7.84263212130134e-06, + "loss": 0.0, + "step": 25930 + }, + { + "epoch": 61.01, + "grad_norm": 66.63784790039062, + "learning_rate": 7.82423082585014e-06, + "loss": 0.0335, + "step": 25940 + }, + { + "epoch": 61.01, + "grad_norm": 0.0006726859137415886, + "learning_rate": 7.80582953039894e-06, + "loss": 0.0001, + "step": 25950 + }, + { + "epoch": 61.01, + "grad_norm": 0.0008944363798946142, + "learning_rate": 7.787428234947742e-06, + "loss": 0.0023, + "step": 25960 + }, + { + "epoch": 61.01, + "grad_norm": 0.0020627244375646114, + "learning_rate": 7.76902693949654e-06, + "loss": 0.0, + "step": 25970 + }, + { + "epoch": 61.01, + "grad_norm": 0.004097119905054569, + "learning_rate": 7.75062564404534e-06, + "loss": 0.0001, + "step": 25980 + }, + { + "epoch": 61.01, + "grad_norm": 0.0006828421028330922, + "learning_rate": 7.732224348594142e-06, + "loss": 0.0, + "step": 25990 + }, + { + "epoch": 61.01, + "grad_norm": 0.9965651631355286, + "learning_rate": 7.713823053142942e-06, + "loss": 0.0002, + "step": 26000 + }, + { + "epoch": 61.01, + "grad_norm": 0.001254824921488762, + "learning_rate": 7.69542175769174e-06, + "loss": 0.0, + "step": 26010 + }, + { + "epoch": 61.01, + "grad_norm": 0.0006275831838138402, + "learning_rate": 7.677020462240542e-06, + "loss": 0.0001, + "step": 26020 + }, + { + "epoch": 61.01, + "grad_norm": 0.0017072111368179321, + "learning_rate": 7.658619166789342e-06, + "loss": 0.0001, + "step": 26030 + }, + { + "epoch": 61.01, + "grad_norm": 0.00032576528610661626, + "learning_rate": 7.640217871338143e-06, + "loss": 0.0, + "step": 26040 + }, + { + "epoch": 61.01, + "eval_accuracy": 0.7511261261261262, + "eval_loss": 1.8280649185180664, + "eval_runtime": 38.5276, + "eval_samples_per_second": 23.048, + "eval_steps_per_second": 1.921, + "step": 26040 + }, + { + "epoch": 62.0, + "grad_norm": 0.015026925131678581, + "learning_rate": 7.6218165758869436e-06, + "loss": 0.0083, + "step": 26050 + }, + { + "epoch": 62.0, + "grad_norm": 0.00498881796374917, + "learning_rate": 7.603415280435743e-06, + "loss": 0.0001, + "step": 26060 + }, + { + "epoch": 62.0, + "grad_norm": 95.47059631347656, + "learning_rate": 7.585013984984543e-06, + "loss": 0.045, + "step": 26070 + }, + { + "epoch": 62.0, + "grad_norm": 0.004085544031113386, + "learning_rate": 7.566612689533344e-06, + "loss": 0.0001, + "step": 26080 + }, + { + "epoch": 62.0, + "grad_norm": 0.0010452407877892256, + "learning_rate": 7.548211394082144e-06, + "loss": 0.0001, + "step": 26090 + }, + { + "epoch": 62.0, + "grad_norm": 0.0003794306539930403, + "learning_rate": 7.529810098630943e-06, + "loss": 0.0, + "step": 26100 + }, + { + "epoch": 62.0, + "grad_norm": 0.00040680027450434864, + "learning_rate": 7.511408803179744e-06, + "loss": 0.0004, + "step": 26110 + }, + { + "epoch": 62.0, + "grad_norm": 0.0007529466529376805, + "learning_rate": 7.493007507728544e-06, + "loss": 0.0, + "step": 26120 + }, + { + "epoch": 62.0, + "grad_norm": 0.016848569735884666, + "learning_rate": 7.474606212277345e-06, + "loss": 0.0001, + "step": 26130 + }, + { + "epoch": 62.0, + "grad_norm": 0.0005395954358391464, + "learning_rate": 7.4562049168261454e-06, + "loss": 0.0, + "step": 26140 + }, + { + "epoch": 62.0, + "grad_norm": 0.009298846125602722, + "learning_rate": 7.437803621374945e-06, + "loss": 0.0, + "step": 26150 + }, + { + "epoch": 62.0, + "grad_norm": 0.004532150458544493, + "learning_rate": 7.419402325923745e-06, + "loss": 0.0001, + "step": 26160 + }, + { + "epoch": 62.0, + "grad_norm": 0.0007697180844843388, + "learning_rate": 7.401001030472546e-06, + "loss": 0.0373, + "step": 26170 + }, + { + "epoch": 62.0, + "grad_norm": 0.0009432808728888631, + "learning_rate": 7.382599735021346e-06, + "loss": 0.0611, + "step": 26180 + }, + { + "epoch": 62.0, + "grad_norm": 0.0006800147821195424, + "learning_rate": 7.364198439570145e-06, + "loss": 0.0765, + "step": 26190 + }, + { + "epoch": 62.01, + "grad_norm": 0.0006660166545771062, + "learning_rate": 7.345797144118946e-06, + "loss": 0.0001, + "step": 26200 + }, + { + "epoch": 62.01, + "grad_norm": 0.0007884202641434968, + "learning_rate": 7.327395848667746e-06, + "loss": 0.0, + "step": 26210 + }, + { + "epoch": 62.01, + "grad_norm": 0.0006302872789092362, + "learning_rate": 7.308994553216547e-06, + "loss": 0.0, + "step": 26220 + }, + { + "epoch": 62.01, + "grad_norm": 0.0007916418253444135, + "learning_rate": 7.290593257765347e-06, + "loss": 0.0696, + "step": 26230 + }, + { + "epoch": 62.01, + "grad_norm": 0.0010438553290441632, + "learning_rate": 7.272191962314147e-06, + "loss": 0.0001, + "step": 26240 + }, + { + "epoch": 62.01, + "grad_norm": 0.004884437192231417, + "learning_rate": 7.2537906668629476e-06, + "loss": 0.0207, + "step": 26250 + }, + { + "epoch": 62.01, + "grad_norm": 0.001540785189718008, + "learning_rate": 7.235389371411748e-06, + "loss": 0.0003, + "step": 26260 + }, + { + "epoch": 62.01, + "grad_norm": 0.0025039571337401867, + "learning_rate": 7.216988075960549e-06, + "loss": 0.0001, + "step": 26270 + }, + { + "epoch": 62.01, + "grad_norm": 0.001049380050972104, + "learning_rate": 7.198586780509348e-06, + "loss": 0.0, + "step": 26280 + }, + { + "epoch": 62.01, + "grad_norm": 0.0032229118514806032, + "learning_rate": 7.180185485058148e-06, + "loss": 0.0003, + "step": 26290 + }, + { + "epoch": 62.01, + "grad_norm": 0.0008956646779552102, + "learning_rate": 7.161784189606949e-06, + "loss": 0.0001, + "step": 26300 + }, + { + "epoch": 62.01, + "grad_norm": 0.014946524985134602, + "learning_rate": 7.143382894155749e-06, + "loss": 0.0001, + "step": 26310 + }, + { + "epoch": 62.01, + "grad_norm": 0.0014685116475448012, + "learning_rate": 7.124981598704548e-06, + "loss": 0.0001, + "step": 26320 + }, + { + "epoch": 62.01, + "grad_norm": 0.001738280989229679, + "learning_rate": 7.106580303253349e-06, + "loss": 0.0211, + "step": 26330 + }, + { + "epoch": 62.01, + "grad_norm": 0.0006036240374669433, + "learning_rate": 7.0881790078021495e-06, + "loss": 0.0002, + "step": 26340 + }, + { + "epoch": 62.01, + "grad_norm": 0.0010698206024244428, + "learning_rate": 7.06977771235095e-06, + "loss": 0.0002, + "step": 26350 + }, + { + "epoch": 62.01, + "grad_norm": 0.00232374994084239, + "learning_rate": 7.0513764168997505e-06, + "loss": 0.0001, + "step": 26360 + }, + { + "epoch": 62.01, + "grad_norm": 0.0006454029935412109, + "learning_rate": 7.03297512144855e-06, + "loss": 0.0, + "step": 26370 + }, + { + "epoch": 62.01, + "grad_norm": 0.002342033665627241, + "learning_rate": 7.014573825997351e-06, + "loss": 0.0002, + "step": 26380 + }, + { + "epoch": 62.01, + "grad_norm": 0.0028809804935008287, + "learning_rate": 6.996172530546151e-06, + "loss": 0.0, + "step": 26390 + }, + { + "epoch": 62.01, + "grad_norm": 0.008059854619204998, + "learning_rate": 6.977771235094952e-06, + "loss": 0.0002, + "step": 26400 + }, + { + "epoch": 62.01, + "grad_norm": 0.0008460727403871715, + "learning_rate": 6.95936993964375e-06, + "loss": 0.0002, + "step": 26410 + }, + { + "epoch": 62.01, + "grad_norm": 0.02910284884274006, + "learning_rate": 6.940968644192551e-06, + "loss": 0.0001, + "step": 26420 + }, + { + "epoch": 62.01, + "grad_norm": 0.005363269243389368, + "learning_rate": 6.922567348741351e-06, + "loss": 0.0041, + "step": 26430 + }, + { + "epoch": 62.01, + "grad_norm": 0.0016924857627600431, + "learning_rate": 6.904166053290152e-06, + "loss": 0.0001, + "step": 26440 + }, + { + "epoch": 62.01, + "grad_norm": 0.00135804305318743, + "learning_rate": 6.885764757838952e-06, + "loss": 0.0001, + "step": 26450 + }, + { + "epoch": 62.01, + "grad_norm": 0.000540624838322401, + "learning_rate": 6.867363462387752e-06, + "loss": 0.0022, + "step": 26460 + }, + { + "epoch": 62.01, + "eval_accuracy": 0.75, + "eval_loss": 1.8172096014022827, + "eval_runtime": 38.4534, + "eval_samples_per_second": 23.093, + "eval_steps_per_second": 1.924, + "step": 26460 + }, + { + "epoch": 63.0, + "grad_norm": 0.0026345832739025354, + "learning_rate": 6.848962166936553e-06, + "loss": 0.0002, + "step": 26470 + }, + { + "epoch": 63.0, + "grad_norm": 0.0017918674275279045, + "learning_rate": 6.830560871485353e-06, + "loss": 0.0, + "step": 26480 + }, + { + "epoch": 63.0, + "grad_norm": 0.0011441055685281754, + "learning_rate": 6.812159576034154e-06, + "loss": 0.0822, + "step": 26490 + }, + { + "epoch": 63.0, + "grad_norm": 0.0020869425497949123, + "learning_rate": 6.793758280582953e-06, + "loss": 0.0067, + "step": 26500 + }, + { + "epoch": 63.0, + "grad_norm": 0.003741749795153737, + "learning_rate": 6.775356985131753e-06, + "loss": 0.0001, + "step": 26510 + }, + { + "epoch": 63.0, + "grad_norm": 0.001574240275658667, + "learning_rate": 6.756955689680554e-06, + "loss": 0.0, + "step": 26520 + }, + { + "epoch": 63.0, + "grad_norm": 0.08041319251060486, + "learning_rate": 6.738554394229354e-06, + "loss": 0.0001, + "step": 26530 + }, + { + "epoch": 63.0, + "grad_norm": 0.001029444974847138, + "learning_rate": 6.720153098778155e-06, + "loss": 0.0001, + "step": 26540 + }, + { + "epoch": 63.0, + "grad_norm": 0.0004752335953526199, + "learning_rate": 6.701751803326954e-06, + "loss": 0.0001, + "step": 26550 + }, + { + "epoch": 63.0, + "grad_norm": 0.0011958446120843291, + "learning_rate": 6.6833505078757545e-06, + "loss": 0.0001, + "step": 26560 + }, + { + "epoch": 63.0, + "grad_norm": 0.10393550246953964, + "learning_rate": 6.6649492124245555e-06, + "loss": 0.0001, + "step": 26570 + }, + { + "epoch": 63.0, + "grad_norm": 0.002258807886391878, + "learning_rate": 6.646547916973356e-06, + "loss": 0.0001, + "step": 26580 + }, + { + "epoch": 63.0, + "grad_norm": 0.007346214726567268, + "learning_rate": 6.628146621522155e-06, + "loss": 0.0001, + "step": 26590 + }, + { + "epoch": 63.0, + "grad_norm": 0.0009463768219575286, + "learning_rate": 6.609745326070956e-06, + "loss": 0.0002, + "step": 26600 + }, + { + "epoch": 63.0, + "grad_norm": 0.0016016251174733043, + "learning_rate": 6.591344030619756e-06, + "loss": 0.0001, + "step": 26610 + }, + { + "epoch": 63.01, + "grad_norm": 0.0009076215210370719, + "learning_rate": 6.572942735168557e-06, + "loss": 0.0001, + "step": 26620 + }, + { + "epoch": 63.01, + "grad_norm": 0.013018609955906868, + "learning_rate": 6.554541439717355e-06, + "loss": 0.003, + "step": 26630 + }, + { + "epoch": 63.01, + "grad_norm": 0.003185126930475235, + "learning_rate": 6.536140144266156e-06, + "loss": 0.0, + "step": 26640 + }, + { + "epoch": 63.01, + "grad_norm": 0.0011634600814431906, + "learning_rate": 6.517738848814956e-06, + "loss": 0.0001, + "step": 26650 + }, + { + "epoch": 63.01, + "grad_norm": 0.0022628027945756912, + "learning_rate": 6.499337553363757e-06, + "loss": 0.0001, + "step": 26660 + }, + { + "epoch": 63.01, + "grad_norm": 0.0006948548834770918, + "learning_rate": 6.480936257912558e-06, + "loss": 0.0001, + "step": 26670 + }, + { + "epoch": 63.01, + "grad_norm": 0.006657461170107126, + "learning_rate": 6.462534962461357e-06, + "loss": 0.0, + "step": 26680 + }, + { + "epoch": 63.01, + "grad_norm": 0.0006550468970090151, + "learning_rate": 6.444133667010158e-06, + "loss": 0.0, + "step": 26690 + }, + { + "epoch": 63.01, + "grad_norm": 0.009990106336772442, + "learning_rate": 6.425732371558958e-06, + "loss": 0.0001, + "step": 26700 + }, + { + "epoch": 63.01, + "grad_norm": 0.0010792514076456428, + "learning_rate": 6.407331076107759e-06, + "loss": 0.0032, + "step": 26710 + }, + { + "epoch": 63.01, + "grad_norm": 0.007197075989097357, + "learning_rate": 6.388929780656558e-06, + "loss": 0.0001, + "step": 26720 + }, + { + "epoch": 63.01, + "grad_norm": 0.0025517232716083527, + "learning_rate": 6.370528485205358e-06, + "loss": 0.0001, + "step": 26730 + }, + { + "epoch": 63.01, + "grad_norm": 0.0018759402446448803, + "learning_rate": 6.352127189754159e-06, + "loss": 0.0001, + "step": 26740 + }, + { + "epoch": 63.01, + "grad_norm": 0.0016400377498939633, + "learning_rate": 6.333725894302959e-06, + "loss": 0.0005, + "step": 26750 + }, + { + "epoch": 63.01, + "grad_norm": 0.001803527120500803, + "learning_rate": 6.31532459885176e-06, + "loss": 0.0003, + "step": 26760 + }, + { + "epoch": 63.01, + "grad_norm": 0.0019011934055015445, + "learning_rate": 6.2969233034005595e-06, + "loss": 0.0, + "step": 26770 + }, + { + "epoch": 63.01, + "grad_norm": 0.0007527913548983634, + "learning_rate": 6.27852200794936e-06, + "loss": 0.0036, + "step": 26780 + }, + { + "epoch": 63.01, + "grad_norm": 0.0012235046597197652, + "learning_rate": 6.2601207124981606e-06, + "loss": 0.0252, + "step": 26790 + }, + { + "epoch": 63.01, + "grad_norm": 0.0019505118252709508, + "learning_rate": 6.24171941704696e-06, + "loss": 0.0128, + "step": 26800 + }, + { + "epoch": 63.01, + "grad_norm": 0.0008968331967480481, + "learning_rate": 6.223318121595761e-06, + "loss": 0.0078, + "step": 26810 + }, + { + "epoch": 63.01, + "grad_norm": 0.0012221608776599169, + "learning_rate": 6.204916826144561e-06, + "loss": 0.0001, + "step": 26820 + }, + { + "epoch": 63.01, + "grad_norm": 0.0025473625864833593, + "learning_rate": 6.186515530693361e-06, + "loss": 0.0001, + "step": 26830 + }, + { + "epoch": 63.01, + "grad_norm": 0.004625910427421331, + "learning_rate": 6.168114235242161e-06, + "loss": 0.0003, + "step": 26840 + }, + { + "epoch": 63.01, + "grad_norm": 0.0007219272665679455, + "learning_rate": 6.149712939790961e-06, + "loss": 0.0001, + "step": 26850 + }, + { + "epoch": 63.01, + "grad_norm": 0.0008147148182615638, + "learning_rate": 6.131311644339762e-06, + "loss": 0.0, + "step": 26860 + }, + { + "epoch": 63.01, + "grad_norm": 0.001891309511847794, + "learning_rate": 6.112910348888562e-06, + "loss": 0.0001, + "step": 26870 + }, + { + "epoch": 63.01, + "grad_norm": 0.001844471669755876, + "learning_rate": 6.0945090534373625e-06, + "loss": 0.0001, + "step": 26880 + }, + { + "epoch": 63.01, + "eval_accuracy": 0.7488738738738738, + "eval_loss": 1.9532489776611328, + "eval_runtime": 38.719, + "eval_samples_per_second": 22.934, + "eval_steps_per_second": 1.911, + "step": 26880 + }, + { + "epoch": 64.0, + "grad_norm": 0.0005300881457515061, + "learning_rate": 6.0761077579861626e-06, + "loss": 0.0, + "step": 26890 + }, + { + "epoch": 64.0, + "grad_norm": 0.0009176667081192136, + "learning_rate": 6.057706462534963e-06, + "loss": 0.0001, + "step": 26900 + }, + { + "epoch": 64.0, + "grad_norm": 0.00210366933606565, + "learning_rate": 6.039305167083763e-06, + "loss": 0.0, + "step": 26910 + }, + { + "epoch": 64.0, + "grad_norm": 0.0005858144722878933, + "learning_rate": 6.020903871632563e-06, + "loss": 0.0, + "step": 26920 + }, + { + "epoch": 64.0, + "grad_norm": 0.001394058228470385, + "learning_rate": 6.002502576181363e-06, + "loss": 0.0013, + "step": 26930 + }, + { + "epoch": 64.0, + "grad_norm": 0.0003652074665296823, + "learning_rate": 5.984101280730164e-06, + "loss": 0.0, + "step": 26940 + }, + { + "epoch": 64.0, + "grad_norm": 0.00038057751953601837, + "learning_rate": 5.965699985278964e-06, + "loss": 0.0002, + "step": 26950 + }, + { + "epoch": 64.0, + "grad_norm": 0.0007543888641521335, + "learning_rate": 5.947298689827764e-06, + "loss": 0.0, + "step": 26960 + }, + { + "epoch": 64.0, + "grad_norm": 0.0008908085874281824, + "learning_rate": 5.928897394376564e-06, + "loss": 0.0, + "step": 26970 + }, + { + "epoch": 64.0, + "grad_norm": 0.0010865289950743318, + "learning_rate": 5.9104960989253645e-06, + "loss": 0.0088, + "step": 26980 + }, + { + "epoch": 64.0, + "grad_norm": 0.0018082386814057827, + "learning_rate": 5.8920948034741654e-06, + "loss": 0.0, + "step": 26990 + }, + { + "epoch": 64.0, + "grad_norm": 0.001776510733179748, + "learning_rate": 5.873693508022965e-06, + "loss": 0.0, + "step": 27000 + }, + { + "epoch": 64.0, + "grad_norm": 0.005542340688407421, + "learning_rate": 5.855292212571766e-06, + "loss": 0.0, + "step": 27010 + }, + { + "epoch": 64.0, + "grad_norm": 0.0004181989061180502, + "learning_rate": 5.836890917120565e-06, + "loss": 0.0, + "step": 27020 + }, + { + "epoch": 64.0, + "grad_norm": 0.00034455108107067645, + "learning_rate": 5.818489621669366e-06, + "loss": 0.0271, + "step": 27030 + }, + { + "epoch": 64.01, + "grad_norm": 0.012929372489452362, + "learning_rate": 5.800088326218166e-06, + "loss": 0.0, + "step": 27040 + }, + { + "epoch": 64.01, + "grad_norm": 0.0004051885043736547, + "learning_rate": 5.781687030766966e-06, + "loss": 0.0, + "step": 27050 + }, + { + "epoch": 64.01, + "grad_norm": 0.000558310654014349, + "learning_rate": 5.763285735315767e-06, + "loss": 0.0, + "step": 27060 + }, + { + "epoch": 64.01, + "grad_norm": 0.005561283323913813, + "learning_rate": 5.744884439864566e-06, + "loss": 0.0, + "step": 27070 + }, + { + "epoch": 64.01, + "grad_norm": 0.0005570474895648658, + "learning_rate": 5.726483144413367e-06, + "loss": 0.0001, + "step": 27080 + }, + { + "epoch": 64.01, + "grad_norm": 0.0005421972018666565, + "learning_rate": 5.7080818489621674e-06, + "loss": 0.0001, + "step": 27090 + }, + { + "epoch": 64.01, + "grad_norm": 0.0008089053444564342, + "learning_rate": 5.6896805535109675e-06, + "loss": 0.0884, + "step": 27100 + }, + { + "epoch": 64.01, + "grad_norm": 0.0032031191512942314, + "learning_rate": 5.671279258059768e-06, + "loss": 0.0479, + "step": 27110 + }, + { + "epoch": 64.01, + "grad_norm": 0.0016441026236861944, + "learning_rate": 5.652877962608568e-06, + "loss": 0.0094, + "step": 27120 + }, + { + "epoch": 64.01, + "grad_norm": 0.03962777182459831, + "learning_rate": 5.634476667157368e-06, + "loss": 0.0, + "step": 27130 + }, + { + "epoch": 64.01, + "grad_norm": 0.0017225542105734348, + "learning_rate": 5.616075371706169e-06, + "loss": 0.0, + "step": 27140 + }, + { + "epoch": 64.01, + "grad_norm": 0.002667994936928153, + "learning_rate": 5.597674076254968e-06, + "loss": 0.0002, + "step": 27150 + }, + { + "epoch": 64.01, + "grad_norm": 0.00042104258318431675, + "learning_rate": 5.579272780803769e-06, + "loss": 0.0, + "step": 27160 + }, + { + "epoch": 64.01, + "grad_norm": 0.00831848755478859, + "learning_rate": 5.560871485352569e-06, + "loss": 0.0, + "step": 27170 + }, + { + "epoch": 64.01, + "grad_norm": 0.01557249017059803, + "learning_rate": 5.542470189901369e-06, + "loss": 0.0001, + "step": 27180 + }, + { + "epoch": 64.01, + "grad_norm": 0.000592841359321028, + "learning_rate": 5.5240688944501694e-06, + "loss": 0.0126, + "step": 27190 + }, + { + "epoch": 64.01, + "grad_norm": 0.0004729072388727218, + "learning_rate": 5.5056675989989695e-06, + "loss": 0.0014, + "step": 27200 + }, + { + "epoch": 64.01, + "grad_norm": 0.0007256526732817292, + "learning_rate": 5.4872663035477705e-06, + "loss": 0.0211, + "step": 27210 + }, + { + "epoch": 64.01, + "grad_norm": 0.0019068483961746097, + "learning_rate": 5.46886500809657e-06, + "loss": 0.0, + "step": 27220 + }, + { + "epoch": 64.01, + "grad_norm": 0.000938325421884656, + "learning_rate": 5.450463712645371e-06, + "loss": 0.0, + "step": 27230 + }, + { + "epoch": 64.01, + "grad_norm": 0.0009550242102704942, + "learning_rate": 5.432062417194171e-06, + "loss": 0.0, + "step": 27240 + }, + { + "epoch": 64.01, + "grad_norm": 0.00045210865209810436, + "learning_rate": 5.413661121742971e-06, + "loss": 0.0001, + "step": 27250 + }, + { + "epoch": 64.01, + "grad_norm": 0.0003768078749999404, + "learning_rate": 5.395259826291772e-06, + "loss": 0.0001, + "step": 27260 + }, + { + "epoch": 64.01, + "grad_norm": 0.0008792284643277526, + "learning_rate": 5.376858530840571e-06, + "loss": 0.0, + "step": 27270 + }, + { + "epoch": 64.01, + "grad_norm": 0.00046390038914978504, + "learning_rate": 5.358457235389372e-06, + "loss": 0.0001, + "step": 27280 + }, + { + "epoch": 64.01, + "grad_norm": 0.0005584588507190347, + "learning_rate": 5.3400559399381714e-06, + "loss": 0.0083, + "step": 27290 + }, + { + "epoch": 64.01, + "grad_norm": 0.0006229018326848745, + "learning_rate": 5.321654644486972e-06, + "loss": 0.0, + "step": 27300 + }, + { + "epoch": 64.01, + "eval_accuracy": 0.7477477477477478, + "eval_loss": 1.9208874702453613, + "eval_runtime": 38.3099, + "eval_samples_per_second": 23.179, + "eval_steps_per_second": 1.932, + "step": 27300 + }, + { + "epoch": 65.0, + "grad_norm": 0.0037541654892265797, + "learning_rate": 5.3032533490357725e-06, + "loss": 0.0, + "step": 27310 + }, + { + "epoch": 65.0, + "grad_norm": 0.1503923088312149, + "learning_rate": 5.284852053584573e-06, + "loss": 0.0001, + "step": 27320 + }, + { + "epoch": 65.0, + "grad_norm": 0.0005371617735363543, + "learning_rate": 5.266450758133373e-06, + "loss": 0.0, + "step": 27330 + }, + { + "epoch": 65.0, + "grad_norm": 0.0006791293271817267, + "learning_rate": 5.248049462682173e-06, + "loss": 0.0051, + "step": 27340 + }, + { + "epoch": 65.0, + "grad_norm": 0.00040025872294791043, + "learning_rate": 5.229648167230973e-06, + "loss": 0.0304, + "step": 27350 + }, + { + "epoch": 65.0, + "grad_norm": 0.0007911776774562895, + "learning_rate": 5.211246871779774e-06, + "loss": 0.0, + "step": 27360 + }, + { + "epoch": 65.0, + "grad_norm": 0.0005377155030146241, + "learning_rate": 5.192845576328574e-06, + "loss": 0.0, + "step": 27370 + }, + { + "epoch": 65.0, + "grad_norm": 0.001017910661175847, + "learning_rate": 5.174444280877374e-06, + "loss": 0.0, + "step": 27380 + }, + { + "epoch": 65.0, + "grad_norm": 0.0012564313365146518, + "learning_rate": 5.156042985426174e-06, + "loss": 0.0001, + "step": 27390 + }, + { + "epoch": 65.0, + "grad_norm": 0.0011663263430818915, + "learning_rate": 5.137641689974974e-06, + "loss": 0.0001, + "step": 27400 + }, + { + "epoch": 65.0, + "grad_norm": 0.001976665109395981, + "learning_rate": 5.1192403945237745e-06, + "loss": 0.0, + "step": 27410 + }, + { + "epoch": 65.0, + "grad_norm": 0.0006329611060209572, + "learning_rate": 5.100839099072575e-06, + "loss": 0.0, + "step": 27420 + }, + { + "epoch": 65.0, + "grad_norm": 0.0004533329338300973, + "learning_rate": 5.082437803621376e-06, + "loss": 0.0004, + "step": 27430 + }, + { + "epoch": 65.0, + "grad_norm": 0.0014712655683979392, + "learning_rate": 5.064036508170175e-06, + "loss": 0.0, + "step": 27440 + }, + { + "epoch": 65.0, + "grad_norm": 0.0005582648445852101, + "learning_rate": 5.045635212718976e-06, + "loss": 0.0, + "step": 27450 + }, + { + "epoch": 65.01, + "grad_norm": 0.0004935372853651643, + "learning_rate": 5.027233917267776e-06, + "loss": 0.0005, + "step": 27460 + }, + { + "epoch": 65.01, + "grad_norm": 0.0016245003789663315, + "learning_rate": 5.008832621816576e-06, + "loss": 0.0, + "step": 27470 + }, + { + "epoch": 65.01, + "grad_norm": 0.0008186784689314663, + "learning_rate": 4.990431326365377e-06, + "loss": 0.0001, + "step": 27480 + }, + { + "epoch": 65.01, + "grad_norm": 0.0021012003999203444, + "learning_rate": 4.972030030914176e-06, + "loss": 0.0002, + "step": 27490 + }, + { + "epoch": 65.01, + "grad_norm": 0.0007192987250164151, + "learning_rate": 4.953628735462977e-06, + "loss": 0.0001, + "step": 27500 + }, + { + "epoch": 65.01, + "grad_norm": 0.0005266540683805943, + "learning_rate": 4.9352274400117765e-06, + "loss": 0.0, + "step": 27510 + }, + { + "epoch": 65.01, + "grad_norm": 0.0004016093735117465, + "learning_rate": 4.9168261445605775e-06, + "loss": 0.0194, + "step": 27520 + }, + { + "epoch": 65.01, + "grad_norm": 0.0014533177018165588, + "learning_rate": 4.898424849109378e-06, + "loss": 0.0001, + "step": 27530 + }, + { + "epoch": 65.01, + "grad_norm": 0.0005293239373713732, + "learning_rate": 4.880023553658178e-06, + "loss": 0.0, + "step": 27540 + }, + { + "epoch": 65.01, + "grad_norm": 0.0020739659667015076, + "learning_rate": 4.861622258206978e-06, + "loss": 0.0, + "step": 27550 + }, + { + "epoch": 65.01, + "grad_norm": 0.1151590347290039, + "learning_rate": 4.843220962755778e-06, + "loss": 0.0006, + "step": 27560 + }, + { + "epoch": 65.01, + "grad_norm": 1.1214478015899658, + "learning_rate": 4.824819667304578e-06, + "loss": 0.0064, + "step": 27570 + }, + { + "epoch": 65.01, + "grad_norm": 0.0003342399431858212, + "learning_rate": 4.806418371853379e-06, + "loss": 0.0001, + "step": 27580 + }, + { + "epoch": 65.01, + "grad_norm": 0.0003858323907479644, + "learning_rate": 4.788017076402179e-06, + "loss": 0.0, + "step": 27590 + }, + { + "epoch": 65.01, + "grad_norm": 0.0004176953516434878, + "learning_rate": 4.769615780950979e-06, + "loss": 0.0, + "step": 27600 + }, + { + "epoch": 65.01, + "grad_norm": 0.0005611016531474888, + "learning_rate": 4.751214485499779e-06, + "loss": 0.0, + "step": 27610 + }, + { + "epoch": 65.01, + "grad_norm": 0.0020231925882399082, + "learning_rate": 4.7328131900485795e-06, + "loss": 0.0, + "step": 27620 + }, + { + "epoch": 65.01, + "grad_norm": 0.0022195447236299515, + "learning_rate": 4.7144118945973804e-06, + "loss": 0.0001, + "step": 27630 + }, + { + "epoch": 65.01, + "grad_norm": 0.0003322585253044963, + "learning_rate": 4.69601059914618e-06, + "loss": 0.0, + "step": 27640 + }, + { + "epoch": 65.01, + "grad_norm": 0.0010606865398585796, + "learning_rate": 4.677609303694981e-06, + "loss": 0.0, + "step": 27650 + }, + { + "epoch": 65.01, + "grad_norm": 0.006328342016786337, + "learning_rate": 4.65920800824378e-06, + "loss": 0.0, + "step": 27660 + }, + { + "epoch": 65.01, + "grad_norm": 0.0013095044996589422, + "learning_rate": 4.640806712792581e-06, + "loss": 0.0, + "step": 27670 + }, + { + "epoch": 65.01, + "grad_norm": 0.0007367559592239559, + "learning_rate": 4.622405417341381e-06, + "loss": 0.0, + "step": 27680 + }, + { + "epoch": 65.01, + "grad_norm": 0.0003999292675871402, + "learning_rate": 4.604004121890181e-06, + "loss": 0.0138, + "step": 27690 + }, + { + "epoch": 65.01, + "grad_norm": 0.0006201888318173587, + "learning_rate": 4.585602826438982e-06, + "loss": 0.0106, + "step": 27700 + }, + { + "epoch": 65.01, + "grad_norm": 0.0005041907425038517, + "learning_rate": 4.567201530987781e-06, + "loss": 0.0142, + "step": 27710 + }, + { + "epoch": 65.01, + "grad_norm": 0.002044772496446967, + "learning_rate": 4.548800235536582e-06, + "loss": 0.0, + "step": 27720 + }, + { + "epoch": 65.01, + "eval_accuracy": 0.7578828828828829, + "eval_loss": 1.9100127220153809, + "eval_runtime": 38.7134, + "eval_samples_per_second": 22.938, + "eval_steps_per_second": 1.911, + "step": 27720 + }, + { + "epoch": 66.0, + "grad_norm": 0.001818476477637887, + "learning_rate": 4.5303989400853824e-06, + "loss": 0.003, + "step": 27730 + }, + { + "epoch": 66.0, + "grad_norm": 0.0004958516801707447, + "learning_rate": 4.5119976446341826e-06, + "loss": 0.0076, + "step": 27740 + }, + { + "epoch": 66.0, + "grad_norm": 0.0003787777677644044, + "learning_rate": 4.493596349182983e-06, + "loss": 0.0, + "step": 27750 + }, + { + "epoch": 66.0, + "grad_norm": 0.00042003163252957165, + "learning_rate": 4.475195053731783e-06, + "loss": 0.0, + "step": 27760 + }, + { + "epoch": 66.0, + "grad_norm": 0.000411301531130448, + "learning_rate": 4.456793758280583e-06, + "loss": 0.0, + "step": 27770 + }, + { + "epoch": 66.0, + "grad_norm": 0.0021449129562824965, + "learning_rate": 4.438392462829383e-06, + "loss": 0.0435, + "step": 27780 + }, + { + "epoch": 66.0, + "grad_norm": 0.0018116917926818132, + "learning_rate": 4.419991167378184e-06, + "loss": 0.0008, + "step": 27790 + }, + { + "epoch": 66.0, + "grad_norm": 0.0003143279755022377, + "learning_rate": 4.401589871926984e-06, + "loss": 0.0, + "step": 27800 + }, + { + "epoch": 66.0, + "grad_norm": 0.0009564289357513189, + "learning_rate": 4.383188576475784e-06, + "loss": 0.0, + "step": 27810 + }, + { + "epoch": 66.0, + "grad_norm": 0.000356840348104015, + "learning_rate": 4.364787281024584e-06, + "loss": 0.0, + "step": 27820 + }, + { + "epoch": 66.0, + "grad_norm": 0.0005603270838037133, + "learning_rate": 4.3463859855733844e-06, + "loss": 0.0, + "step": 27830 + }, + { + "epoch": 66.0, + "grad_norm": 0.0020791892893612385, + "learning_rate": 4.3279846901221846e-06, + "loss": 0.0, + "step": 27840 + }, + { + "epoch": 66.0, + "grad_norm": 0.0005288394168019295, + "learning_rate": 4.3095833946709855e-06, + "loss": 0.0, + "step": 27850 + }, + { + "epoch": 66.0, + "grad_norm": 0.0009745580609887838, + "learning_rate": 4.291182099219785e-06, + "loss": 0.0, + "step": 27860 + }, + { + "epoch": 66.0, + "grad_norm": 0.0019139602081850171, + "learning_rate": 4.272780803768586e-06, + "loss": 0.0063, + "step": 27870 + }, + { + "epoch": 66.01, + "grad_norm": 0.0005171472439542413, + "learning_rate": 4.254379508317385e-06, + "loss": 0.0, + "step": 27880 + }, + { + "epoch": 66.01, + "grad_norm": 0.0031747317407280207, + "learning_rate": 4.235978212866186e-06, + "loss": 0.0, + "step": 27890 + }, + { + "epoch": 66.01, + "grad_norm": 0.0003597049508243799, + "learning_rate": 4.217576917414986e-06, + "loss": 0.0, + "step": 27900 + }, + { + "epoch": 66.01, + "grad_norm": 0.0008909418829716742, + "learning_rate": 4.199175621963786e-06, + "loss": 0.0, + "step": 27910 + }, + { + "epoch": 66.01, + "grad_norm": 0.0005209531518630683, + "learning_rate": 4.180774326512587e-06, + "loss": 0.0153, + "step": 27920 + }, + { + "epoch": 66.01, + "grad_norm": 0.00047349196393042803, + "learning_rate": 4.1623730310613864e-06, + "loss": 0.0433, + "step": 27930 + }, + { + "epoch": 66.01, + "grad_norm": 0.00046791709610261023, + "learning_rate": 4.143971735610187e-06, + "loss": 0.0001, + "step": 27940 + }, + { + "epoch": 66.01, + "grad_norm": 0.0037538569886237383, + "learning_rate": 4.1255704401589875e-06, + "loss": 0.0, + "step": 27950 + }, + { + "epoch": 66.01, + "grad_norm": 0.003720578271895647, + "learning_rate": 4.107169144707788e-06, + "loss": 0.0, + "step": 27960 + }, + { + "epoch": 66.01, + "grad_norm": 0.006445688661187887, + "learning_rate": 4.088767849256588e-06, + "loss": 0.0004, + "step": 27970 + }, + { + "epoch": 66.01, + "grad_norm": 0.000625148881226778, + "learning_rate": 4.070366553805388e-06, + "loss": 0.0002, + "step": 27980 + }, + { + "epoch": 66.01, + "grad_norm": 0.0010217278031632304, + "learning_rate": 4.051965258354189e-06, + "loss": 0.0, + "step": 27990 + }, + { + "epoch": 66.01, + "grad_norm": 0.0008879475644789636, + "learning_rate": 4.033563962902988e-06, + "loss": 0.0, + "step": 28000 + }, + { + "epoch": 66.01, + "grad_norm": 0.3618137836456299, + "learning_rate": 4.015162667451789e-06, + "loss": 0.0001, + "step": 28010 + }, + { + "epoch": 66.01, + "grad_norm": 0.00038238533306866884, + "learning_rate": 3.996761372000589e-06, + "loss": 0.0001, + "step": 28020 + }, + { + "epoch": 66.01, + "grad_norm": 0.0006025524926371872, + "learning_rate": 3.978360076549389e-06, + "loss": 0.0, + "step": 28030 + }, + { + "epoch": 66.01, + "grad_norm": 0.0008740944904275239, + "learning_rate": 3.959958781098189e-06, + "loss": 0.0, + "step": 28040 + }, + { + "epoch": 66.01, + "grad_norm": 0.0003794162184931338, + "learning_rate": 3.9415574856469895e-06, + "loss": 0.0, + "step": 28050 + }, + { + "epoch": 66.01, + "grad_norm": 0.0004938667407259345, + "learning_rate": 3.92315619019579e-06, + "loss": 0.0, + "step": 28060 + }, + { + "epoch": 66.01, + "grad_norm": 0.0031972068827599287, + "learning_rate": 3.904754894744591e-06, + "loss": 0.0, + "step": 28070 + }, + { + "epoch": 66.01, + "grad_norm": 0.0005764389061369002, + "learning_rate": 3.88635359929339e-06, + "loss": 0.0, + "step": 28080 + }, + { + "epoch": 66.01, + "grad_norm": 0.0011365560349076986, + "learning_rate": 3.867952303842191e-06, + "loss": 0.0, + "step": 28090 + }, + { + "epoch": 66.01, + "grad_norm": 0.0017140272539108992, + "learning_rate": 3.849551008390991e-06, + "loss": 0.0, + "step": 28100 + }, + { + "epoch": 66.01, + "grad_norm": 0.0005445133429020643, + "learning_rate": 3.831149712939791e-06, + "loss": 0.0022, + "step": 28110 + }, + { + "epoch": 66.01, + "grad_norm": 0.0008768728584982455, + "learning_rate": 3.8127484174885916e-06, + "loss": 0.0, + "step": 28120 + }, + { + "epoch": 66.01, + "grad_norm": 0.0003417479747440666, + "learning_rate": 3.7943471220373913e-06, + "loss": 0.0, + "step": 28130 + }, + { + "epoch": 66.01, + "grad_norm": 0.0004897731123492122, + "learning_rate": 3.775945826586192e-06, + "loss": 0.0, + "step": 28140 + }, + { + "epoch": 66.01, + "eval_accuracy": 0.7533783783783784, + "eval_loss": 1.957198143005371, + "eval_runtime": 39.1151, + "eval_samples_per_second": 22.702, + "eval_steps_per_second": 1.892, + "step": 28140 + }, + { + "epoch": 67.0, + "grad_norm": 0.0009105826611630619, + "learning_rate": 3.757544531134992e-06, + "loss": 0.0, + "step": 28150 + }, + { + "epoch": 67.0, + "grad_norm": 0.0004885323578491807, + "learning_rate": 3.7391432356837925e-06, + "loss": 0.0, + "step": 28160 + }, + { + "epoch": 67.0, + "grad_norm": 0.0008118312689475715, + "learning_rate": 3.720741940232592e-06, + "loss": 0.0, + "step": 28170 + }, + { + "epoch": 67.0, + "grad_norm": 0.0014635213883593678, + "learning_rate": 3.7023406447813927e-06, + "loss": 0.0001, + "step": 28180 + }, + { + "epoch": 67.0, + "grad_norm": 0.0004924469976685941, + "learning_rate": 3.683939349330193e-06, + "loss": 0.0, + "step": 28190 + }, + { + "epoch": 67.0, + "grad_norm": 0.011600484140217304, + "learning_rate": 3.6655380538789934e-06, + "loss": 0.0001, + "step": 28200 + }, + { + "epoch": 67.0, + "grad_norm": 0.000475892738904804, + "learning_rate": 3.647136758427794e-06, + "loss": 0.0046, + "step": 28210 + }, + { + "epoch": 67.0, + "grad_norm": 0.00038393758586607873, + "learning_rate": 3.6287354629765936e-06, + "loss": 0.0, + "step": 28220 + }, + { + "epoch": 67.0, + "grad_norm": 0.00033145310590043664, + "learning_rate": 3.610334167525394e-06, + "loss": 0.0001, + "step": 28230 + }, + { + "epoch": 67.0, + "grad_norm": 0.0016265606973320246, + "learning_rate": 3.5919328720741943e-06, + "loss": 0.0, + "step": 28240 + }, + { + "epoch": 67.0, + "grad_norm": 0.013253612443804741, + "learning_rate": 3.573531576622995e-06, + "loss": 0.0, + "step": 28250 + }, + { + "epoch": 67.0, + "grad_norm": 0.0012261946685612202, + "learning_rate": 3.5551302811717945e-06, + "loss": 0.0, + "step": 28260 + }, + { + "epoch": 67.0, + "grad_norm": 0.00045756183681078255, + "learning_rate": 3.536728985720595e-06, + "loss": 0.0113, + "step": 28270 + }, + { + "epoch": 67.0, + "grad_norm": 0.0007601910037919879, + "learning_rate": 3.5183276902693947e-06, + "loss": 0.0, + "step": 28280 + }, + { + "epoch": 67.0, + "grad_norm": 0.0007499887724407017, + "learning_rate": 3.4999263948181953e-06, + "loss": 0.0, + "step": 28290 + }, + { + "epoch": 67.01, + "grad_norm": 0.0011956521775573492, + "learning_rate": 3.481525099366996e-06, + "loss": 0.0, + "step": 28300 + }, + { + "epoch": 67.01, + "grad_norm": 0.0003698724030982703, + "learning_rate": 3.463123803915796e-06, + "loss": 0.0, + "step": 28310 + }, + { + "epoch": 67.01, + "grad_norm": 0.0003628400154411793, + "learning_rate": 3.4447225084645964e-06, + "loss": 0.0385, + "step": 28320 + }, + { + "epoch": 67.01, + "grad_norm": 0.0010094452882185578, + "learning_rate": 3.426321213013396e-06, + "loss": 0.0001, + "step": 28330 + }, + { + "epoch": 67.01, + "grad_norm": 0.0012927157804369926, + "learning_rate": 3.4079199175621967e-06, + "loss": 0.0, + "step": 28340 + }, + { + "epoch": 67.01, + "grad_norm": 0.000493381405249238, + "learning_rate": 3.389518622110997e-06, + "loss": 0.0, + "step": 28350 + }, + { + "epoch": 67.01, + "grad_norm": 0.0003783382708206773, + "learning_rate": 3.3711173266597973e-06, + "loss": 0.0, + "step": 28360 + }, + { + "epoch": 67.01, + "grad_norm": 0.0012582663912326097, + "learning_rate": 3.352716031208597e-06, + "loss": 0.0, + "step": 28370 + }, + { + "epoch": 67.01, + "grad_norm": 0.00036473103682510555, + "learning_rate": 3.3343147357573976e-06, + "loss": 0.0003, + "step": 28380 + }, + { + "epoch": 67.01, + "grad_norm": 0.0003472709213383496, + "learning_rate": 3.3159134403061973e-06, + "loss": 0.0, + "step": 28390 + }, + { + "epoch": 67.01, + "grad_norm": 0.004546219948679209, + "learning_rate": 3.297512144854998e-06, + "loss": 0.0, + "step": 28400 + }, + { + "epoch": 67.01, + "grad_norm": 0.0006614047451876104, + "learning_rate": 3.2791108494037983e-06, + "loss": 0.0134, + "step": 28410 + }, + { + "epoch": 67.01, + "grad_norm": 0.00038801078335382044, + "learning_rate": 3.2607095539525985e-06, + "loss": 0.0067, + "step": 28420 + }, + { + "epoch": 67.01, + "grad_norm": 0.00122673693113029, + "learning_rate": 3.242308258501399e-06, + "loss": 0.0, + "step": 28430 + }, + { + "epoch": 67.01, + "grad_norm": 0.0005847996799275279, + "learning_rate": 3.2239069630501987e-06, + "loss": 0.0001, + "step": 28440 + }, + { + "epoch": 67.01, + "grad_norm": 0.0003658024943433702, + "learning_rate": 3.2055056675989992e-06, + "loss": 0.0112, + "step": 28450 + }, + { + "epoch": 67.01, + "grad_norm": 0.011086560785770416, + "learning_rate": 3.1871043721477993e-06, + "loss": 0.0, + "step": 28460 + }, + { + "epoch": 67.01, + "grad_norm": 0.000345290289260447, + "learning_rate": 3.1687030766966e-06, + "loss": 0.0192, + "step": 28470 + }, + { + "epoch": 67.01, + "grad_norm": 0.00038937729550525546, + "learning_rate": 3.1503017812453996e-06, + "loss": 0.0, + "step": 28480 + }, + { + "epoch": 67.01, + "grad_norm": 0.00036691149580292404, + "learning_rate": 3.1319004857942e-06, + "loss": 0.0702, + "step": 28490 + }, + { + "epoch": 67.01, + "grad_norm": 0.00034635685733519495, + "learning_rate": 3.1134991903430002e-06, + "loss": 0.0, + "step": 28500 + }, + { + "epoch": 67.01, + "grad_norm": 0.0003403785522095859, + "learning_rate": 3.0950978948918003e-06, + "loss": 0.0, + "step": 28510 + }, + { + "epoch": 67.01, + "grad_norm": 0.0004076850600540638, + "learning_rate": 3.076696599440601e-06, + "loss": 0.0, + "step": 28520 + }, + { + "epoch": 67.01, + "grad_norm": 0.0008295393199659884, + "learning_rate": 3.058295303989401e-06, + "loss": 0.0, + "step": 28530 + }, + { + "epoch": 67.01, + "grad_norm": 0.00039547396590933204, + "learning_rate": 3.039894008538201e-06, + "loss": 0.0, + "step": 28540 + }, + { + "epoch": 67.01, + "grad_norm": 0.004409555811434984, + "learning_rate": 3.0214927130870012e-06, + "loss": 0.0, + "step": 28550 + }, + { + "epoch": 67.01, + "grad_norm": 0.0004421043850015849, + "learning_rate": 3.0030914176358018e-06, + "loss": 0.0007, + "step": 28560 + }, + { + "epoch": 67.01, + "eval_accuracy": 0.75, + "eval_loss": 2.0379910469055176, + "eval_runtime": 39.3148, + "eval_samples_per_second": 22.587, + "eval_steps_per_second": 1.882, + "step": 28560 + }, + { + "epoch": 68.0, + "grad_norm": 0.0004478379269130528, + "learning_rate": 2.984690122184602e-06, + "loss": 0.0, + "step": 28570 + }, + { + "epoch": 68.0, + "grad_norm": 0.040339428931474686, + "learning_rate": 2.9662888267334024e-06, + "loss": 0.0001, + "step": 28580 + }, + { + "epoch": 68.0, + "grad_norm": 0.0004363558837212622, + "learning_rate": 2.9478875312822025e-06, + "loss": 0.0517, + "step": 28590 + }, + { + "epoch": 68.0, + "grad_norm": 0.0005274597206152976, + "learning_rate": 2.9294862358310026e-06, + "loss": 0.0001, + "step": 28600 + }, + { + "epoch": 68.0, + "grad_norm": 1.1247056722640991, + "learning_rate": 2.9110849403798028e-06, + "loss": 0.006, + "step": 28610 + }, + { + "epoch": 68.0, + "grad_norm": 0.008089935407042503, + "learning_rate": 2.892683644928603e-06, + "loss": 0.0, + "step": 28620 + }, + { + "epoch": 68.0, + "grad_norm": 0.0004192329361103475, + "learning_rate": 2.8742823494774034e-06, + "loss": 0.0, + "step": 28630 + }, + { + "epoch": 68.0, + "grad_norm": 0.00038057431811466813, + "learning_rate": 2.8558810540262035e-06, + "loss": 0.0001, + "step": 28640 + }, + { + "epoch": 68.0, + "grad_norm": 0.0011997149558737874, + "learning_rate": 2.8374797585750036e-06, + "loss": 0.0, + "step": 28650 + }, + { + "epoch": 68.0, + "grad_norm": 0.00041154009522870183, + "learning_rate": 2.8190784631238038e-06, + "loss": 0.0, + "step": 28660 + }, + { + "epoch": 68.0, + "grad_norm": 9.79233169555664, + "learning_rate": 2.8006771676726043e-06, + "loss": 0.0811, + "step": 28670 + }, + { + "epoch": 68.0, + "grad_norm": 0.001218300429172814, + "learning_rate": 2.782275872221405e-06, + "loss": 0.0, + "step": 28680 + }, + { + "epoch": 68.0, + "grad_norm": 0.00035766957444138825, + "learning_rate": 2.763874576770205e-06, + "loss": 0.0, + "step": 28690 + }, + { + "epoch": 68.0, + "grad_norm": 0.0008741321507841349, + "learning_rate": 2.745473281319005e-06, + "loss": 0.0, + "step": 28700 + }, + { + "epoch": 68.0, + "grad_norm": 0.004477696027606726, + "learning_rate": 2.727071985867805e-06, + "loss": 0.0, + "step": 28710 + }, + { + "epoch": 68.01, + "grad_norm": 0.00045869784662500024, + "learning_rate": 2.7086706904166053e-06, + "loss": 0.0, + "step": 28720 + }, + { + "epoch": 68.01, + "grad_norm": 0.0005403147079050541, + "learning_rate": 2.690269394965406e-06, + "loss": 0.0, + "step": 28730 + }, + { + "epoch": 68.01, + "grad_norm": 0.0004175172944087535, + "learning_rate": 2.671868099514206e-06, + "loss": 0.0, + "step": 28740 + }, + { + "epoch": 68.01, + "grad_norm": 0.0007928918348625302, + "learning_rate": 2.653466804063006e-06, + "loss": 0.0001, + "step": 28750 + }, + { + "epoch": 68.01, + "grad_norm": 0.001285827485844493, + "learning_rate": 2.635065508611806e-06, + "loss": 0.0, + "step": 28760 + }, + { + "epoch": 68.01, + "grad_norm": 0.026378748938441277, + "learning_rate": 2.6166642131606067e-06, + "loss": 0.0001, + "step": 28770 + }, + { + "epoch": 68.01, + "grad_norm": 0.0004781365569215268, + "learning_rate": 2.598262917709407e-06, + "loss": 0.0, + "step": 28780 + }, + { + "epoch": 68.01, + "grad_norm": 0.0003623313969001174, + "learning_rate": 2.5798616222582074e-06, + "loss": 0.0, + "step": 28790 + }, + { + "epoch": 68.01, + "grad_norm": 0.00031867920188233256, + "learning_rate": 2.5614603268070075e-06, + "loss": 0.0, + "step": 28800 + }, + { + "epoch": 68.01, + "grad_norm": 0.03376823663711548, + "learning_rate": 2.5430590313558076e-06, + "loss": 0.0, + "step": 28810 + }, + { + "epoch": 68.01, + "grad_norm": 0.0004712707013823092, + "learning_rate": 2.5246577359046077e-06, + "loss": 0.0, + "step": 28820 + }, + { + "epoch": 68.01, + "grad_norm": 0.0008019096567295492, + "learning_rate": 2.506256440453408e-06, + "loss": 0.0, + "step": 28830 + }, + { + "epoch": 68.01, + "grad_norm": 0.0006348516908474267, + "learning_rate": 2.4878551450022084e-06, + "loss": 0.0, + "step": 28840 + }, + { + "epoch": 68.01, + "grad_norm": 0.008974735625088215, + "learning_rate": 2.4694538495510085e-06, + "loss": 0.0025, + "step": 28850 + }, + { + "epoch": 68.01, + "grad_norm": 0.006573653779923916, + "learning_rate": 2.4510525540998086e-06, + "loss": 0.0283, + "step": 28860 + }, + { + "epoch": 68.01, + "grad_norm": 0.0004212147614452988, + "learning_rate": 2.4326512586486087e-06, + "loss": 0.0, + "step": 28870 + }, + { + "epoch": 68.01, + "grad_norm": 0.0006103275809437037, + "learning_rate": 2.4142499631974093e-06, + "loss": 0.0, + "step": 28880 + }, + { + "epoch": 68.01, + "grad_norm": 0.0003645585966296494, + "learning_rate": 2.3958486677462094e-06, + "loss": 0.0, + "step": 28890 + }, + { + "epoch": 68.01, + "grad_norm": 0.0007065955433063209, + "learning_rate": 2.37744737229501e-06, + "loss": 0.0075, + "step": 28900 + }, + { + "epoch": 68.01, + "grad_norm": 0.000578741601202637, + "learning_rate": 2.35904607684381e-06, + "loss": 0.0, + "step": 28910 + }, + { + "epoch": 68.01, + "grad_norm": 7.712615966796875, + "learning_rate": 2.34064478139261e-06, + "loss": 0.0346, + "step": 28920 + }, + { + "epoch": 68.01, + "grad_norm": 0.00035892019513994455, + "learning_rate": 2.3222434859414103e-06, + "loss": 0.0, + "step": 28930 + }, + { + "epoch": 68.01, + "grad_norm": 0.0003427111078053713, + "learning_rate": 2.3038421904902104e-06, + "loss": 0.0, + "step": 28940 + }, + { + "epoch": 68.01, + "grad_norm": 0.0008762883953750134, + "learning_rate": 2.285440895039011e-06, + "loss": 0.0, + "step": 28950 + }, + { + "epoch": 68.01, + "grad_norm": 0.0004895761958323419, + "learning_rate": 2.267039599587811e-06, + "loss": 0.0, + "step": 28960 + }, + { + "epoch": 68.01, + "grad_norm": 0.00041842888458631933, + "learning_rate": 2.248638304136611e-06, + "loss": 0.0019, + "step": 28970 + }, + { + "epoch": 68.01, + "grad_norm": 0.0004923155647702515, + "learning_rate": 2.2302370086854117e-06, + "loss": 0.0627, + "step": 28980 + }, + { + "epoch": 68.01, + "eval_accuracy": 0.7578828828828829, + "eval_loss": 1.8911042213439941, + "eval_runtime": 39.4147, + "eval_samples_per_second": 22.53, + "eval_steps_per_second": 1.877, + "step": 28980 + }, + { + "epoch": 69.0, + "grad_norm": 0.001296228845603764, + "learning_rate": 2.211835713234212e-06, + "loss": 0.0, + "step": 28990 + }, + { + "epoch": 69.0, + "grad_norm": 0.005461554042994976, + "learning_rate": 2.193434417783012e-06, + "loss": 0.0, + "step": 29000 + }, + { + "epoch": 69.0, + "grad_norm": 0.0005484812427312136, + "learning_rate": 2.1750331223318125e-06, + "loss": 0.0, + "step": 29010 + }, + { + "epoch": 69.0, + "grad_norm": 0.0005643205367960036, + "learning_rate": 2.1566318268806126e-06, + "loss": 0.0, + "step": 29020 + }, + { + "epoch": 69.0, + "grad_norm": 0.000412571185734123, + "learning_rate": 2.1382305314294127e-06, + "loss": 0.0067, + "step": 29030 + }, + { + "epoch": 69.0, + "grad_norm": 0.0004682897706516087, + "learning_rate": 2.119829235978213e-06, + "loss": 0.0, + "step": 29040 + }, + { + "epoch": 69.0, + "grad_norm": 0.0003385832242202014, + "learning_rate": 2.101427940527013e-06, + "loss": 0.0006, + "step": 29050 + }, + { + "epoch": 69.0, + "grad_norm": 0.0004343747568782419, + "learning_rate": 2.0830266450758135e-06, + "loss": 0.0, + "step": 29060 + }, + { + "epoch": 69.0, + "grad_norm": 0.0006563673377968371, + "learning_rate": 2.0646253496246136e-06, + "loss": 0.0604, + "step": 29070 + }, + { + "epoch": 69.0, + "grad_norm": 0.0003242646635044366, + "learning_rate": 2.046224054173414e-06, + "loss": 0.0, + "step": 29080 + }, + { + "epoch": 69.0, + "grad_norm": 0.00507451081648469, + "learning_rate": 2.0278227587222142e-06, + "loss": 0.0, + "step": 29090 + }, + { + "epoch": 69.0, + "grad_norm": 0.0006768747116439044, + "learning_rate": 2.0094214632710143e-06, + "loss": 0.0, + "step": 29100 + }, + { + "epoch": 69.0, + "grad_norm": 0.0005010124295949936, + "learning_rate": 1.991020167819815e-06, + "loss": 0.0, + "step": 29110 + }, + { + "epoch": 69.0, + "grad_norm": 0.018840555101633072, + "learning_rate": 1.972618872368615e-06, + "loss": 0.0001, + "step": 29120 + }, + { + "epoch": 69.0, + "grad_norm": 0.0009619954507797956, + "learning_rate": 1.954217576917415e-06, + "loss": 0.0, + "step": 29130 + }, + { + "epoch": 69.01, + "grad_norm": 0.0002928886387962848, + "learning_rate": 1.9358162814662152e-06, + "loss": 0.0, + "step": 29140 + }, + { + "epoch": 69.01, + "grad_norm": 0.0005956662353128195, + "learning_rate": 1.9174149860150153e-06, + "loss": 0.0, + "step": 29150 + }, + { + "epoch": 69.01, + "grad_norm": 0.0006041673477739096, + "learning_rate": 1.8990136905638157e-06, + "loss": 0.0, + "step": 29160 + }, + { + "epoch": 69.01, + "grad_norm": 0.0002941975835710764, + "learning_rate": 1.8806123951126158e-06, + "loss": 0.0, + "step": 29170 + }, + { + "epoch": 69.01, + "grad_norm": 0.0005585135659202933, + "learning_rate": 1.8622110996614161e-06, + "loss": 0.0796, + "step": 29180 + }, + { + "epoch": 69.01, + "grad_norm": 0.0006412527291104198, + "learning_rate": 1.8438098042102167e-06, + "loss": 0.047, + "step": 29190 + }, + { + "epoch": 69.01, + "grad_norm": 0.00156042177695781, + "learning_rate": 1.8254085087590168e-06, + "loss": 0.0, + "step": 29200 + }, + { + "epoch": 69.01, + "grad_norm": 0.000436204340076074, + "learning_rate": 1.807007213307817e-06, + "loss": 0.0, + "step": 29210 + }, + { + "epoch": 69.01, + "grad_norm": 0.0007626641308888793, + "learning_rate": 1.7886059178566172e-06, + "loss": 0.0134, + "step": 29220 + }, + { + "epoch": 69.01, + "grad_norm": 0.0009147358941845596, + "learning_rate": 1.7702046224054175e-06, + "loss": 0.0001, + "step": 29230 + }, + { + "epoch": 69.01, + "grad_norm": 0.000774869869928807, + "learning_rate": 1.7518033269542177e-06, + "loss": 0.0, + "step": 29240 + }, + { + "epoch": 69.01, + "grad_norm": 0.0009357577655464411, + "learning_rate": 1.7334020315030178e-06, + "loss": 0.0, + "step": 29250 + }, + { + "epoch": 69.01, + "grad_norm": 0.00034666634746827185, + "learning_rate": 1.715000736051818e-06, + "loss": 0.0, + "step": 29260 + }, + { + "epoch": 69.01, + "grad_norm": 0.0022142743691802025, + "learning_rate": 1.6965994406006182e-06, + "loss": 0.0, + "step": 29270 + }, + { + "epoch": 69.01, + "grad_norm": 0.00055282301036641, + "learning_rate": 1.6781981451494185e-06, + "loss": 0.0001, + "step": 29280 + }, + { + "epoch": 69.01, + "grad_norm": 0.0010218261741101742, + "learning_rate": 1.659796849698219e-06, + "loss": 0.0, + "step": 29290 + }, + { + "epoch": 69.01, + "grad_norm": 0.0004027817049063742, + "learning_rate": 1.6413955542470192e-06, + "loss": 0.0, + "step": 29300 + }, + { + "epoch": 69.01, + "grad_norm": 0.0006661299848929048, + "learning_rate": 1.6229942587958193e-06, + "loss": 0.0, + "step": 29310 + }, + { + "epoch": 69.01, + "grad_norm": 0.0005064262077212334, + "learning_rate": 1.6045929633446196e-06, + "loss": 0.0, + "step": 29320 + }, + { + "epoch": 69.01, + "grad_norm": 0.5367152094841003, + "learning_rate": 1.5861916678934198e-06, + "loss": 0.0001, + "step": 29330 + }, + { + "epoch": 69.01, + "grad_norm": 0.005905908532440662, + "learning_rate": 1.56779037244222e-06, + "loss": 0.0001, + "step": 29340 + }, + { + "epoch": 69.01, + "grad_norm": 0.00037446129135787487, + "learning_rate": 1.5493890769910202e-06, + "loss": 0.0, + "step": 29350 + }, + { + "epoch": 69.01, + "grad_norm": 1.1278949975967407, + "learning_rate": 1.5309877815398203e-06, + "loss": 0.0069, + "step": 29360 + }, + { + "epoch": 69.01, + "grad_norm": 0.0008586321491748095, + "learning_rate": 1.5125864860886209e-06, + "loss": 0.0, + "step": 29370 + }, + { + "epoch": 69.01, + "grad_norm": 0.00120458600576967, + "learning_rate": 1.494185190637421e-06, + "loss": 0.0, + "step": 29380 + }, + { + "epoch": 69.01, + "grad_norm": 0.002725456142798066, + "learning_rate": 1.475783895186221e-06, + "loss": 0.0, + "step": 29390 + }, + { + "epoch": 69.01, + "grad_norm": 0.00047182320849969983, + "learning_rate": 1.4573825997350214e-06, + "loss": 0.0002, + "step": 29400 + }, + { + "epoch": 69.01, + "eval_accuracy": 0.75, + "eval_loss": 1.9255236387252808, + "eval_runtime": 38.4915, + "eval_samples_per_second": 23.07, + "eval_steps_per_second": 1.923, + "step": 29400 + }, + { + "epoch": 70.0, + "grad_norm": 0.0007249056943692267, + "learning_rate": 1.4389813042838215e-06, + "loss": 0.0003, + "step": 29410 + }, + { + "epoch": 70.0, + "grad_norm": 0.0006666265544481575, + "learning_rate": 1.420580008832622e-06, + "loss": 0.0, + "step": 29420 + }, + { + "epoch": 70.0, + "grad_norm": 0.0003448748611845076, + "learning_rate": 1.4021787133814222e-06, + "loss": 0.0033, + "step": 29430 + }, + { + "epoch": 70.0, + "grad_norm": 0.0004704460152424872, + "learning_rate": 1.3837774179302223e-06, + "loss": 0.0, + "step": 29440 + }, + { + "epoch": 70.0, + "grad_norm": 0.0005372213781811297, + "learning_rate": 1.3653761224790226e-06, + "loss": 0.0, + "step": 29450 + }, + { + "epoch": 70.0, + "grad_norm": 0.004357726778835058, + "learning_rate": 1.3469748270278227e-06, + "loss": 0.0001, + "step": 29460 + }, + { + "epoch": 70.0, + "grad_norm": 0.0006257134955376387, + "learning_rate": 1.328573531576623e-06, + "loss": 0.0, + "step": 29470 + }, + { + "epoch": 70.0, + "grad_norm": 0.00044351391261443496, + "learning_rate": 1.3101722361254234e-06, + "loss": 0.0, + "step": 29480 + }, + { + "epoch": 70.0, + "grad_norm": 0.0006109604146331549, + "learning_rate": 1.2917709406742235e-06, + "loss": 0.0, + "step": 29490 + }, + { + "epoch": 70.0, + "grad_norm": 0.0006328423623926938, + "learning_rate": 1.2733696452230238e-06, + "loss": 0.0, + "step": 29500 + }, + { + "epoch": 70.0, + "grad_norm": 0.0005007470608688891, + "learning_rate": 1.254968349771824e-06, + "loss": 0.0, + "step": 29510 + }, + { + "epoch": 70.0, + "grad_norm": 0.0008502065320499241, + "learning_rate": 1.236567054320624e-06, + "loss": 0.0, + "step": 29520 + }, + { + "epoch": 70.0, + "grad_norm": 0.0011528691975399852, + "learning_rate": 1.2181657588694246e-06, + "loss": 0.0, + "step": 29530 + }, + { + "epoch": 70.0, + "grad_norm": 0.0006126620573922992, + "learning_rate": 1.1997644634182247e-06, + "loss": 0.0099, + "step": 29540 + }, + { + "epoch": 70.0, + "grad_norm": 0.001300094067119062, + "learning_rate": 1.1813631679670248e-06, + "loss": 0.0531, + "step": 29550 + }, + { + "epoch": 70.01, + "grad_norm": 0.027509033679962158, + "learning_rate": 1.1629618725158252e-06, + "loss": 0.0, + "step": 29560 + }, + { + "epoch": 70.01, + "grad_norm": 0.00041231224895454943, + "learning_rate": 1.1445605770646253e-06, + "loss": 0.0, + "step": 29570 + }, + { + "epoch": 70.01, + "grad_norm": 0.0006713059265166521, + "learning_rate": 1.1261592816134256e-06, + "loss": 0.0, + "step": 29580 + }, + { + "epoch": 70.01, + "grad_norm": 0.0016158471116796136, + "learning_rate": 1.107757986162226e-06, + "loss": 0.0, + "step": 29590 + }, + { + "epoch": 70.01, + "grad_norm": 0.00035877004847861826, + "learning_rate": 1.089356690711026e-06, + "loss": 0.0, + "step": 29600 + }, + { + "epoch": 70.01, + "grad_norm": 0.0011403877288103104, + "learning_rate": 1.0709553952598264e-06, + "loss": 0.0, + "step": 29610 + }, + { + "epoch": 70.01, + "grad_norm": 0.000379040400730446, + "learning_rate": 1.0525540998086265e-06, + "loss": 0.0, + "step": 29620 + }, + { + "epoch": 70.01, + "grad_norm": 0.0004945829859934747, + "learning_rate": 1.0341528043574268e-06, + "loss": 0.0, + "step": 29630 + }, + { + "epoch": 70.01, + "grad_norm": 0.0005318563780747354, + "learning_rate": 1.0157515089062271e-06, + "loss": 0.0001, + "step": 29640 + }, + { + "epoch": 70.01, + "grad_norm": 0.0008199013536795974, + "learning_rate": 9.973502134550273e-07, + "loss": 0.0, + "step": 29650 + }, + { + "epoch": 70.01, + "grad_norm": 0.0007847716915421188, + "learning_rate": 9.789489180038276e-07, + "loss": 0.0, + "step": 29660 + }, + { + "epoch": 70.01, + "grad_norm": 0.000548962561879307, + "learning_rate": 9.605476225526277e-07, + "loss": 0.0, + "step": 29670 + }, + { + "epoch": 70.01, + "grad_norm": 0.006194377318024635, + "learning_rate": 9.42146327101428e-07, + "loss": 0.0, + "step": 29680 + }, + { + "epoch": 70.01, + "grad_norm": 0.00041105650598183274, + "learning_rate": 9.237450316502283e-07, + "loss": 0.0, + "step": 29690 + }, + { + "epoch": 70.01, + "grad_norm": 0.0003589960979297757, + "learning_rate": 9.053437361990285e-07, + "loss": 0.0, + "step": 29700 + }, + { + "epoch": 70.01, + "grad_norm": 0.0005238248268142343, + "learning_rate": 8.869424407478287e-07, + "loss": 0.0021, + "step": 29710 + }, + { + "epoch": 70.01, + "grad_norm": 0.0009069875814020634, + "learning_rate": 8.685411452966289e-07, + "loss": 0.0, + "step": 29720 + }, + { + "epoch": 70.01, + "grad_norm": 0.00038318498991429806, + "learning_rate": 8.501398498454292e-07, + "loss": 0.0, + "step": 29730 + }, + { + "epoch": 70.01, + "grad_norm": 0.00046108453534543514, + "learning_rate": 8.317385543942295e-07, + "loss": 0.0, + "step": 29740 + }, + { + "epoch": 70.01, + "grad_norm": 0.0004549498262349516, + "learning_rate": 8.133372589430297e-07, + "loss": 0.0, + "step": 29750 + }, + { + "epoch": 70.01, + "grad_norm": 0.0012873295927420259, + "learning_rate": 7.949359634918298e-07, + "loss": 0.0056, + "step": 29760 + }, + { + "epoch": 70.01, + "grad_norm": 0.0007264292216859758, + "learning_rate": 7.765346680406301e-07, + "loss": 0.0, + "step": 29770 + }, + { + "epoch": 70.01, + "grad_norm": 0.0005384713294915855, + "learning_rate": 7.581333725894304e-07, + "loss": 0.0, + "step": 29780 + }, + { + "epoch": 70.01, + "grad_norm": 0.0006688968860544264, + "learning_rate": 7.397320771382306e-07, + "loss": 0.0, + "step": 29790 + }, + { + "epoch": 70.01, + "grad_norm": 0.000776001950725913, + "learning_rate": 7.213307816870308e-07, + "loss": 0.0, + "step": 29800 + }, + { + "epoch": 70.01, + "grad_norm": 0.0006710118614137173, + "learning_rate": 7.02929486235831e-07, + "loss": 0.0, + "step": 29810 + }, + { + "epoch": 70.01, + "grad_norm": 0.003348682075738907, + "learning_rate": 6.845281907846312e-07, + "loss": 0.0, + "step": 29820 + }, + { + "epoch": 70.01, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 1.919495701789856, + "eval_runtime": 39.3857, + "eval_samples_per_second": 22.546, + "eval_steps_per_second": 1.879, + "step": 29820 + }, + { + "epoch": 71.0, + "grad_norm": 0.0005197848076932132, + "learning_rate": 6.661268953334316e-07, + "loss": 0.0, + "step": 29830 + }, + { + "epoch": 71.0, + "grad_norm": 0.0004043731023557484, + "learning_rate": 6.477255998822317e-07, + "loss": 0.0, + "step": 29840 + }, + { + "epoch": 71.0, + "grad_norm": 0.000854467274621129, + "learning_rate": 6.29324304431032e-07, + "loss": 0.0, + "step": 29850 + }, + { + "epoch": 71.0, + "grad_norm": 0.0010478779440745711, + "learning_rate": 6.109230089798322e-07, + "loss": 0.0, + "step": 29860 + }, + { + "epoch": 71.0, + "grad_norm": 0.0005419045337475836, + "learning_rate": 5.925217135286324e-07, + "loss": 0.0, + "step": 29870 + }, + { + "epoch": 71.0, + "grad_norm": 0.00035095299244858325, + "learning_rate": 5.741204180774327e-07, + "loss": 0.0001, + "step": 29880 + }, + { + "epoch": 71.0, + "grad_norm": 0.0006842725561000407, + "learning_rate": 5.557191226262329e-07, + "loss": 0.0, + "step": 29890 + }, + { + "epoch": 71.0, + "grad_norm": 0.0006226678378880024, + "learning_rate": 5.373178271750332e-07, + "loss": 0.0, + "step": 29900 + }, + { + "epoch": 71.0, + "grad_norm": 0.000504440104123205, + "learning_rate": 5.189165317238333e-07, + "loss": 0.0002, + "step": 29910 + }, + { + "epoch": 71.0, + "grad_norm": 0.00035010086139664054, + "learning_rate": 5.005152362726336e-07, + "loss": 0.016, + "step": 29920 + }, + { + "epoch": 71.0, + "grad_norm": 0.0004880847118329257, + "learning_rate": 4.821139408214339e-07, + "loss": 0.0001, + "step": 29930 + }, + { + "epoch": 71.0, + "grad_norm": 0.00041001607314683497, + "learning_rate": 4.6371264537023405e-07, + "loss": 0.0, + "step": 29940 + }, + { + "epoch": 71.0, + "grad_norm": 0.45223134756088257, + "learning_rate": 4.4531134991903427e-07, + "loss": 0.0002, + "step": 29950 + }, + { + "epoch": 71.0, + "grad_norm": 0.0010492827277630568, + "learning_rate": 4.2691005446783455e-07, + "loss": 0.0, + "step": 29960 + }, + { + "epoch": 71.0, + "grad_norm": 0.00040615900070406497, + "learning_rate": 4.0850875901663477e-07, + "loss": 0.0, + "step": 29970 + }, + { + "epoch": 71.01, + "grad_norm": 1.2249135971069336, + "learning_rate": 3.90107463565435e-07, + "loss": 0.007, + "step": 29980 + }, + { + "epoch": 71.01, + "grad_norm": 0.0005851155729033053, + "learning_rate": 3.7170616811423526e-07, + "loss": 0.0, + "step": 29990 + }, + { + "epoch": 71.01, + "grad_norm": 0.002505541779100895, + "learning_rate": 3.533048726630355e-07, + "loss": 0.0002, + "step": 30000 + }, + { + "epoch": 71.01, + "grad_norm": 0.0007141873356886208, + "learning_rate": 3.3490357721183576e-07, + "loss": 0.0, + "step": 30010 + }, + { + "epoch": 71.01, + "grad_norm": 0.0004104756226297468, + "learning_rate": 3.1650228176063593e-07, + "loss": 0.0, + "step": 30020 + }, + { + "epoch": 71.01, + "grad_norm": 0.0005138775450177491, + "learning_rate": 2.981009863094362e-07, + "loss": 0.0, + "step": 30030 + }, + { + "epoch": 71.01, + "grad_norm": 0.0005845970590598881, + "learning_rate": 2.796996908582364e-07, + "loss": 0.0, + "step": 30040 + }, + { + "epoch": 71.01, + "grad_norm": 0.0018744993722066283, + "learning_rate": 2.612983954070367e-07, + "loss": 0.0001, + "step": 30050 + }, + { + "epoch": 71.01, + "grad_norm": 0.0002829184231813997, + "learning_rate": 2.4289709995583687e-07, + "loss": 0.0, + "step": 30060 + }, + { + "epoch": 71.01, + "grad_norm": 0.004356312565505505, + "learning_rate": 2.2449580450463714e-07, + "loss": 0.0, + "step": 30070 + }, + { + "epoch": 71.01, + "grad_norm": 0.00044179134420119226, + "learning_rate": 2.0609450905343736e-07, + "loss": 0.0042, + "step": 30080 + }, + { + "epoch": 71.01, + "grad_norm": 0.0003790935152210295, + "learning_rate": 1.876932136022376e-07, + "loss": 0.0, + "step": 30090 + }, + { + "epoch": 71.01, + "grad_norm": 0.00037218283978290856, + "learning_rate": 1.6929191815103783e-07, + "loss": 0.0015, + "step": 30100 + }, + { + "epoch": 71.01, + "grad_norm": 0.0008030639728531241, + "learning_rate": 1.5089062269983808e-07, + "loss": 0.0, + "step": 30110 + }, + { + "epoch": 71.01, + "grad_norm": 0.0006200214847922325, + "learning_rate": 1.324893272486383e-07, + "loss": 0.0, + "step": 30120 + }, + { + "epoch": 71.01, + "grad_norm": 0.00036839168751612306, + "learning_rate": 1.1408803179743855e-07, + "loss": 0.0, + "step": 30130 + }, + { + "epoch": 71.01, + "grad_norm": 0.0003330856270622462, + "learning_rate": 9.568673634623877e-08, + "loss": 0.0, + "step": 30140 + }, + { + "epoch": 71.01, + "grad_norm": 0.0003486187488306314, + "learning_rate": 7.7285440895039e-08, + "loss": 0.0258, + "step": 30150 + }, + { + "epoch": 71.01, + "grad_norm": 0.000903045351151377, + "learning_rate": 5.888414544383925e-08, + "loss": 0.0, + "step": 30160 + }, + { + "epoch": 71.01, + "grad_norm": 0.0006916265119798481, + "learning_rate": 4.048284999263948e-08, + "loss": 0.0, + "step": 30170 + }, + { + "epoch": 71.01, + "grad_norm": 0.0004026548413094133, + "learning_rate": 2.2081554541439718e-08, + "loss": 0.0, + "step": 30180 + }, + { + "epoch": 71.01, + "grad_norm": 0.00039476496749557555, + "learning_rate": 3.6802590902399533e-09, + "loss": 0.0, + "step": 30190 + }, + { + "epoch": 71.01, + "eval_accuracy": 0.75, + "eval_loss": 1.9316831827163696, + "eval_runtime": 39.2933, + "eval_samples_per_second": 22.599, + "eval_steps_per_second": 1.883, + "step": 30192 + }, + { + "epoch": 71.01, + "step": 30192, + "total_flos": 4.507464203371508e+20, + "train_loss": 0.11212031152609238, + "train_runtime": 37654.5939, + "train_samples_per_second": 9.622, + "train_steps_per_second": 0.802 + }, + { + "epoch": 71.01, + "eval_accuracy": 0.7623873873873874, + "eval_loss": 1.6253712177276611, + "eval_runtime": 49.894, + "eval_samples_per_second": 17.798, + "eval_steps_per_second": 1.483, + "step": 30192 + }, + { + "epoch": 71.01, + "eval_accuracy": 0.7623873873873874, + "eval_loss": 1.6253712177276611, + "eval_runtime": 38.5437, + "eval_samples_per_second": 23.039, + "eval_steps_per_second": 1.92, + "step": 30192 } ], "logging_steps": 10, - "max_steps": 11856, + "max_steps": 30192, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, - "total_flos": 1.770184277095349e+20, + "total_flos": 4.507464203371508e+20, "train_batch_size": 12, "trial_name": null, "trial_params": null