{ "best_metric": 0.7623873873873874, "best_model_checkpoint": "videomae-base-finetuned-scratch_1/checkpoint-24360", "epoch": 71.01232114467409, "eval_steps": 500, "global_step": 30192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.798513412475586, "learning_rate": 1.6556291390728477e-07, "loss": 0.6854, "step": 10 }, { "epoch": 0.0, "grad_norm": 6.500337600708008, "learning_rate": 3.3112582781456954e-07, "loss": 0.7216, "step": 20 }, { "epoch": 0.0, "grad_norm": 4.219776630401611, "learning_rate": 4.966887417218544e-07, "loss": 0.6915, "step": 30 }, { "epoch": 0.0, "grad_norm": 4.801699161529541, "learning_rate": 6.622516556291391e-07, "loss": 0.7103, "step": 40 }, { "epoch": 0.0, "grad_norm": 4.168068885803223, "learning_rate": 8.278145695364239e-07, "loss": 0.7183, "step": 50 }, { "epoch": 0.0, "grad_norm": 4.032826900482178, "learning_rate": 9.933774834437087e-07, "loss": 0.6903, "step": 60 }, { "epoch": 0.0, "grad_norm": 4.603478908538818, "learning_rate": 1.1589403973509934e-06, "loss": 0.7016, "step": 70 }, { "epoch": 0.0, "grad_norm": 6.0669779777526855, "learning_rate": 1.3245033112582782e-06, "loss": 0.7131, "step": 80 }, { "epoch": 0.0, "grad_norm": 3.940962314605713, "learning_rate": 1.4900662251655629e-06, "loss": 0.6874, "step": 90 }, { "epoch": 0.0, "grad_norm": 3.132936716079712, "learning_rate": 1.6556291390728478e-06, "loss": 0.7025, "step": 100 }, { "epoch": 0.0, "grad_norm": 4.358637809753418, "learning_rate": 1.8211920529801325e-06, "loss": 0.694, "step": 110 }, { "epoch": 0.0, "grad_norm": 4.67786169052124, "learning_rate": 1.9867549668874175e-06, "loss": 0.69, "step": 120 }, { "epoch": 0.0, "grad_norm": 5.850955486297607, "learning_rate": 2.152317880794702e-06, "loss": 0.6855, "step": 130 }, { "epoch": 0.0, "grad_norm": 7.933328628540039, "learning_rate": 2.317880794701987e-06, "loss": 0.7019, "step": 140 }, { "epoch": 0.0, "grad_norm": 6.646462440490723, "learning_rate": 2.4834437086092716e-06, "loss": 0.7026, "step": 150 }, { "epoch": 0.01, "grad_norm": 5.709299087524414, "learning_rate": 2.6490066225165563e-06, "loss": 0.677, "step": 160 }, { "epoch": 0.01, "grad_norm": 4.531544208526611, "learning_rate": 2.8145695364238415e-06, "loss": 0.7028, "step": 170 }, { "epoch": 0.01, "grad_norm": 9.23356819152832, "learning_rate": 2.9801324503311258e-06, "loss": 0.667, "step": 180 }, { "epoch": 0.01, "grad_norm": 7.787632942199707, "learning_rate": 3.145695364238411e-06, "loss": 0.6753, "step": 190 }, { "epoch": 0.01, "grad_norm": 4.996011257171631, "learning_rate": 3.3112582781456956e-06, "loss": 0.7245, "step": 200 }, { "epoch": 0.01, "grad_norm": 6.756231784820557, "learning_rate": 3.47682119205298e-06, "loss": 0.663, "step": 210 }, { "epoch": 0.01, "grad_norm": 4.762441635131836, "learning_rate": 3.642384105960265e-06, "loss": 0.6785, "step": 220 }, { "epoch": 0.01, "grad_norm": 5.188967704772949, "learning_rate": 3.8079470198675498e-06, "loss": 0.6748, "step": 230 }, { "epoch": 0.01, "grad_norm": 7.416825771331787, "learning_rate": 3.973509933774835e-06, "loss": 0.6679, "step": 240 }, { "epoch": 0.01, "grad_norm": 7.885051727294922, "learning_rate": 4.13907284768212e-06, "loss": 0.6712, "step": 250 }, { "epoch": 0.01, "grad_norm": 3.602786064147949, "learning_rate": 4.304635761589404e-06, "loss": 0.6838, "step": 260 }, { "epoch": 0.01, "grad_norm": 9.783942222595215, "learning_rate": 4.470198675496689e-06, "loss": 0.6824, "step": 270 }, { "epoch": 0.01, "grad_norm": 5.1008172035217285, "learning_rate": 4.635761589403974e-06, "loss": 0.661, "step": 280 }, { "epoch": 0.01, "grad_norm": 7.266839504241943, "learning_rate": 4.801324503311259e-06, "loss": 0.6225, "step": 290 }, { "epoch": 0.01, "grad_norm": 6.263260364532471, "learning_rate": 4.966887417218543e-06, "loss": 0.6457, "step": 300 }, { "epoch": 0.01, "grad_norm": 9.13078784942627, "learning_rate": 5.1324503311258275e-06, "loss": 0.639, "step": 310 }, { "epoch": 0.01, "grad_norm": 4.028023719787598, "learning_rate": 5.298013245033113e-06, "loss": 0.6535, "step": 320 }, { "epoch": 0.01, "grad_norm": 5.480044364929199, "learning_rate": 5.463576158940398e-06, "loss": 0.6519, "step": 330 }, { "epoch": 0.01, "grad_norm": 5.179543972015381, "learning_rate": 5.629139072847683e-06, "loss": 0.6611, "step": 340 }, { "epoch": 0.01, "grad_norm": 3.5863168239593506, "learning_rate": 5.794701986754967e-06, "loss": 0.6115, "step": 350 }, { "epoch": 0.01, "grad_norm": 7.434300899505615, "learning_rate": 5.9602649006622515e-06, "loss": 0.6357, "step": 360 }, { "epoch": 0.01, "grad_norm": 4.279542922973633, "learning_rate": 6.125827814569537e-06, "loss": 0.6173, "step": 370 }, { "epoch": 0.01, "grad_norm": 7.049438953399658, "learning_rate": 6.291390728476822e-06, "loss": 0.6348, "step": 380 }, { "epoch": 0.01, "grad_norm": 7.15338659286499, "learning_rate": 6.456953642384106e-06, "loss": 0.5899, "step": 390 }, { "epoch": 0.01, "grad_norm": 10.467881202697754, "learning_rate": 6.622516556291391e-06, "loss": 0.6648, "step": 400 }, { "epoch": 0.01, "grad_norm": 12.209418296813965, "learning_rate": 6.7880794701986755e-06, "loss": 0.6523, "step": 410 }, { "epoch": 0.01, "grad_norm": 18.373001098632812, "learning_rate": 6.95364238410596e-06, "loss": 0.5872, "step": 420 }, { "epoch": 0.01, "eval_accuracy": 0.5518018018018018, "eval_loss": 0.6883862614631653, "eval_runtime": 46.4984, "eval_samples_per_second": 19.097, "eval_steps_per_second": 1.591, "step": 420 }, { "epoch": 1.0, "grad_norm": 8.16763973236084, "learning_rate": 7.119205298013246e-06, "loss": 0.617, "step": 430 }, { "epoch": 1.0, "grad_norm": 7.374216556549072, "learning_rate": 7.28476821192053e-06, "loss": 0.6017, "step": 440 }, { "epoch": 1.0, "grad_norm": 8.875380516052246, "learning_rate": 7.450331125827815e-06, "loss": 0.5943, "step": 450 }, { "epoch": 1.0, "grad_norm": 6.8913116455078125, "learning_rate": 7.6158940397350995e-06, "loss": 0.6053, "step": 460 }, { "epoch": 1.0, "grad_norm": 19.417211532592773, "learning_rate": 7.781456953642384e-06, "loss": 0.5664, "step": 470 }, { "epoch": 1.0, "grad_norm": 9.484718322753906, "learning_rate": 7.94701986754967e-06, "loss": 0.5937, "step": 480 }, { "epoch": 1.0, "grad_norm": 11.553343772888184, "learning_rate": 8.112582781456954e-06, "loss": 0.5961, "step": 490 }, { "epoch": 1.0, "grad_norm": 8.792490005493164, "learning_rate": 8.27814569536424e-06, "loss": 0.5381, "step": 500 }, { "epoch": 1.0, "grad_norm": 11.141890525817871, "learning_rate": 8.443708609271524e-06, "loss": 0.662, "step": 510 }, { "epoch": 1.0, "grad_norm": 10.944332122802734, "learning_rate": 8.609271523178809e-06, "loss": 0.5583, "step": 520 }, { "epoch": 1.0, "grad_norm": 6.739534378051758, "learning_rate": 8.774834437086093e-06, "loss": 0.6051, "step": 530 }, { "epoch": 1.0, "grad_norm": 14.809786796569824, "learning_rate": 8.940397350993377e-06, "loss": 0.5843, "step": 540 }, { "epoch": 1.0, "grad_norm": 9.712545394897461, "learning_rate": 9.105960264900662e-06, "loss": 0.6071, "step": 550 }, { "epoch": 1.0, "grad_norm": 10.756208419799805, "learning_rate": 9.271523178807948e-06, "loss": 0.518, "step": 560 }, { "epoch": 1.0, "grad_norm": 12.263407707214355, "learning_rate": 9.437086092715232e-06, "loss": 0.5224, "step": 570 }, { "epoch": 1.01, "grad_norm": 12.013748168945312, "learning_rate": 9.602649006622518e-06, "loss": 0.563, "step": 580 }, { "epoch": 1.01, "grad_norm": 10.52728271484375, "learning_rate": 9.768211920529802e-06, "loss": 0.584, "step": 590 }, { "epoch": 1.01, "grad_norm": 18.45766830444336, "learning_rate": 9.933774834437086e-06, "loss": 0.5911, "step": 600 }, { "epoch": 1.01, "grad_norm": 11.80704116821289, "learning_rate": 1.0099337748344372e-05, "loss": 0.5943, "step": 610 }, { "epoch": 1.01, "grad_norm": 5.923819541931152, "learning_rate": 1.0264900662251655e-05, "loss": 0.5752, "step": 620 }, { "epoch": 1.01, "grad_norm": 12.2268648147583, "learning_rate": 1.0430463576158941e-05, "loss": 0.5505, "step": 630 }, { "epoch": 1.01, "grad_norm": 8.633209228515625, "learning_rate": 1.0596026490066225e-05, "loss": 0.4826, "step": 640 }, { "epoch": 1.01, "grad_norm": 25.050275802612305, "learning_rate": 1.076158940397351e-05, "loss": 0.6089, "step": 650 }, { "epoch": 1.01, "grad_norm": 14.282061576843262, "learning_rate": 1.0927152317880796e-05, "loss": 0.5236, "step": 660 }, { "epoch": 1.01, "grad_norm": 4.38491678237915, "learning_rate": 1.109271523178808e-05, "loss": 0.5622, "step": 670 }, { "epoch": 1.01, "grad_norm": 5.651684761047363, "learning_rate": 1.1258278145695366e-05, "loss": 0.6044, "step": 680 }, { "epoch": 1.01, "grad_norm": 6.315426826477051, "learning_rate": 1.142384105960265e-05, "loss": 0.6392, "step": 690 }, { "epoch": 1.01, "grad_norm": 5.555671215057373, "learning_rate": 1.1589403973509934e-05, "loss": 0.542, "step": 700 }, { "epoch": 1.01, "grad_norm": 17.609895706176758, "learning_rate": 1.1754966887417219e-05, "loss": 0.5469, "step": 710 }, { "epoch": 1.01, "grad_norm": 5.622751235961914, "learning_rate": 1.1920529801324503e-05, "loss": 0.5576, "step": 720 }, { "epoch": 1.01, "grad_norm": 16.457481384277344, "learning_rate": 1.2086092715231789e-05, "loss": 0.5037, "step": 730 }, { "epoch": 1.01, "grad_norm": 12.223016738891602, "learning_rate": 1.2251655629139073e-05, "loss": 0.5333, "step": 740 }, { "epoch": 1.01, "grad_norm": 12.266618728637695, "learning_rate": 1.2417218543046358e-05, "loss": 0.6454, "step": 750 }, { "epoch": 1.01, "grad_norm": 14.025192260742188, "learning_rate": 1.2582781456953644e-05, "loss": 0.6073, "step": 760 }, { "epoch": 1.01, "grad_norm": 7.08126163482666, "learning_rate": 1.274834437086093e-05, "loss": 0.5818, "step": 770 }, { "epoch": 1.01, "grad_norm": 5.972238063812256, "learning_rate": 1.2913907284768212e-05, "loss": 0.491, "step": 780 }, { "epoch": 1.01, "grad_norm": 9.566168785095215, "learning_rate": 1.3079470198675498e-05, "loss": 0.6412, "step": 790 }, { "epoch": 1.01, "grad_norm": 10.850152969360352, "learning_rate": 1.3245033112582782e-05, "loss": 0.5616, "step": 800 }, { "epoch": 1.01, "grad_norm": 5.276033878326416, "learning_rate": 1.3410596026490067e-05, "loss": 0.6066, "step": 810 }, { "epoch": 1.01, "grad_norm": 6.786125659942627, "learning_rate": 1.3576158940397351e-05, "loss": 0.5489, "step": 820 }, { "epoch": 1.01, "grad_norm": 7.395523548126221, "learning_rate": 1.3741721854304637e-05, "loss": 0.4285, "step": 830 }, { "epoch": 1.01, "grad_norm": 24.986873626708984, "learning_rate": 1.390728476821192e-05, "loss": 0.5358, "step": 840 }, { "epoch": 1.01, "eval_accuracy": 0.6193693693693694, "eval_loss": 0.6766347289085388, "eval_runtime": 42.9716, "eval_samples_per_second": 20.665, "eval_steps_per_second": 1.722, "step": 840 }, { "epoch": 2.0, "grad_norm": 13.267349243164062, "learning_rate": 1.4072847682119206e-05, "loss": 0.4912, "step": 850 }, { "epoch": 2.0, "grad_norm": 9.027665138244629, "learning_rate": 1.4238410596026492e-05, "loss": 0.4733, "step": 860 }, { "epoch": 2.0, "grad_norm": 8.086003303527832, "learning_rate": 1.4403973509933774e-05, "loss": 0.4839, "step": 870 }, { "epoch": 2.0, "grad_norm": 9.574597358703613, "learning_rate": 1.456953642384106e-05, "loss": 0.5129, "step": 880 }, { "epoch": 2.0, "grad_norm": 9.984691619873047, "learning_rate": 1.4735099337748346e-05, "loss": 0.5445, "step": 890 }, { "epoch": 2.0, "grad_norm": 12.852778434753418, "learning_rate": 1.490066225165563e-05, "loss": 0.4693, "step": 900 }, { "epoch": 2.0, "grad_norm": 13.385702133178711, "learning_rate": 1.5066225165562913e-05, "loss": 0.5598, "step": 910 }, { "epoch": 2.0, "grad_norm": 14.717290878295898, "learning_rate": 1.5231788079470199e-05, "loss": 0.4971, "step": 920 }, { "epoch": 2.0, "grad_norm": 9.329611778259277, "learning_rate": 1.5397350993377485e-05, "loss": 0.5318, "step": 930 }, { "epoch": 2.0, "grad_norm": 9.604193687438965, "learning_rate": 1.5562913907284768e-05, "loss": 0.5795, "step": 940 }, { "epoch": 2.0, "grad_norm": 5.933267593383789, "learning_rate": 1.5728476821192054e-05, "loss": 0.5661, "step": 950 }, { "epoch": 2.0, "grad_norm": 11.780954360961914, "learning_rate": 1.589403973509934e-05, "loss": 0.4763, "step": 960 }, { "epoch": 2.0, "grad_norm": 5.923733234405518, "learning_rate": 1.6059602649006622e-05, "loss": 0.5337, "step": 970 }, { "epoch": 2.0, "grad_norm": 12.70390796661377, "learning_rate": 1.6225165562913908e-05, "loss": 0.5132, "step": 980 }, { "epoch": 2.0, "grad_norm": 11.865839004516602, "learning_rate": 1.6390728476821194e-05, "loss": 0.5244, "step": 990 }, { "epoch": 2.01, "grad_norm": 10.646509170532227, "learning_rate": 1.655629139072848e-05, "loss": 0.4553, "step": 1000 }, { "epoch": 2.01, "grad_norm": 15.619583129882812, "learning_rate": 1.6721854304635763e-05, "loss": 0.423, "step": 1010 }, { "epoch": 2.01, "grad_norm": 8.261073112487793, "learning_rate": 1.688741721854305e-05, "loss": 0.4211, "step": 1020 }, { "epoch": 2.01, "grad_norm": 9.633855819702148, "learning_rate": 1.705298013245033e-05, "loss": 0.5283, "step": 1030 }, { "epoch": 2.01, "grad_norm": 14.031935691833496, "learning_rate": 1.7218543046357617e-05, "loss": 0.5528, "step": 1040 }, { "epoch": 2.01, "grad_norm": 11.338902473449707, "learning_rate": 1.73841059602649e-05, "loss": 0.4548, "step": 1050 }, { "epoch": 2.01, "grad_norm": 23.000455856323242, "learning_rate": 1.7549668874172186e-05, "loss": 0.5563, "step": 1060 }, { "epoch": 2.01, "grad_norm": 9.367509841918945, "learning_rate": 1.771523178807947e-05, "loss": 0.4287, "step": 1070 }, { "epoch": 2.01, "grad_norm": 7.15519905090332, "learning_rate": 1.7880794701986755e-05, "loss": 0.5689, "step": 1080 }, { "epoch": 2.01, "grad_norm": 9.858805656433105, "learning_rate": 1.804635761589404e-05, "loss": 0.4932, "step": 1090 }, { "epoch": 2.01, "grad_norm": 6.52086877822876, "learning_rate": 1.8211920529801323e-05, "loss": 0.5288, "step": 1100 }, { "epoch": 2.01, "grad_norm": 13.988100051879883, "learning_rate": 1.837748344370861e-05, "loss": 0.4448, "step": 1110 }, { "epoch": 2.01, "grad_norm": 13.741047859191895, "learning_rate": 1.8543046357615895e-05, "loss": 0.3917, "step": 1120 }, { "epoch": 2.01, "grad_norm": 7.654866695404053, "learning_rate": 1.870860927152318e-05, "loss": 0.4636, "step": 1130 }, { "epoch": 2.01, "grad_norm": 9.594705581665039, "learning_rate": 1.8874172185430464e-05, "loss": 0.5084, "step": 1140 }, { "epoch": 2.01, "grad_norm": 8.550768852233887, "learning_rate": 1.903973509933775e-05, "loss": 0.4168, "step": 1150 }, { "epoch": 2.01, "grad_norm": 10.367500305175781, "learning_rate": 1.9205298013245036e-05, "loss": 0.5148, "step": 1160 }, { "epoch": 2.01, "grad_norm": 10.711953163146973, "learning_rate": 1.9370860927152318e-05, "loss": 0.536, "step": 1170 }, { "epoch": 2.01, "grad_norm": 8.47671890258789, "learning_rate": 1.9536423841059604e-05, "loss": 0.496, "step": 1180 }, { "epoch": 2.01, "grad_norm": 14.76279067993164, "learning_rate": 1.970198675496689e-05, "loss": 0.5253, "step": 1190 }, { "epoch": 2.01, "grad_norm": 11.065993309020996, "learning_rate": 1.9867549668874173e-05, "loss": 0.4508, "step": 1200 }, { "epoch": 2.01, "grad_norm": 10.93369197845459, "learning_rate": 2.003311258278146e-05, "loss": 0.5272, "step": 1210 }, { "epoch": 2.01, "grad_norm": 9.056780815124512, "learning_rate": 2.0198675496688745e-05, "loss": 0.5077, "step": 1220 }, { "epoch": 2.01, "grad_norm": 8.762649536132812, "learning_rate": 2.0364238410596027e-05, "loss": 0.385, "step": 1230 }, { "epoch": 2.01, "grad_norm": 8.538049697875977, "learning_rate": 2.052980132450331e-05, "loss": 0.6194, "step": 1240 }, { "epoch": 2.01, "grad_norm": 8.191787719726562, "learning_rate": 2.0695364238410596e-05, "loss": 0.455, "step": 1250 }, { "epoch": 2.01, "grad_norm": 25.214134216308594, "learning_rate": 2.0860927152317882e-05, "loss": 0.5339, "step": 1260 }, { "epoch": 2.01, "eval_accuracy": 0.6430180180180181, "eval_loss": 0.6294997334480286, "eval_runtime": 42.0635, "eval_samples_per_second": 21.111, "eval_steps_per_second": 1.759, "step": 1260 }, { "epoch": 3.0, "grad_norm": 9.5454683303833, "learning_rate": 2.1026490066225165e-05, "loss": 0.4965, "step": 1270 }, { "epoch": 3.0, "grad_norm": 4.766613483428955, "learning_rate": 2.119205298013245e-05, "loss": 0.4325, "step": 1280 }, { "epoch": 3.0, "grad_norm": 9.407756805419922, "learning_rate": 2.1357615894039737e-05, "loss": 0.4026, "step": 1290 }, { "epoch": 3.0, "grad_norm": 5.415344715118408, "learning_rate": 2.152317880794702e-05, "loss": 0.4614, "step": 1300 }, { "epoch": 3.0, "grad_norm": 10.35544204711914, "learning_rate": 2.1688741721854305e-05, "loss": 0.6127, "step": 1310 }, { "epoch": 3.0, "grad_norm": 6.637629508972168, "learning_rate": 2.185430463576159e-05, "loss": 0.445, "step": 1320 }, { "epoch": 3.0, "grad_norm": 16.520090103149414, "learning_rate": 2.2019867549668874e-05, "loss": 0.3947, "step": 1330 }, { "epoch": 3.0, "grad_norm": 17.7099609375, "learning_rate": 2.218543046357616e-05, "loss": 0.4481, "step": 1340 }, { "epoch": 3.0, "grad_norm": 10.103479385375977, "learning_rate": 2.2350993377483446e-05, "loss": 0.4481, "step": 1350 }, { "epoch": 3.0, "grad_norm": 8.171016693115234, "learning_rate": 2.2516556291390732e-05, "loss": 0.4555, "step": 1360 }, { "epoch": 3.0, "grad_norm": 8.391425132751465, "learning_rate": 2.2682119205298014e-05, "loss": 0.3572, "step": 1370 }, { "epoch": 3.0, "grad_norm": 8.662993431091309, "learning_rate": 2.28476821192053e-05, "loss": 0.433, "step": 1380 }, { "epoch": 3.0, "grad_norm": 16.419416427612305, "learning_rate": 2.3013245033112586e-05, "loss": 0.3873, "step": 1390 }, { "epoch": 3.0, "grad_norm": 17.038654327392578, "learning_rate": 2.317880794701987e-05, "loss": 0.4813, "step": 1400 }, { "epoch": 3.0, "grad_norm": 16.546653747558594, "learning_rate": 2.3344370860927155e-05, "loss": 0.3604, "step": 1410 }, { "epoch": 3.01, "grad_norm": 7.0249738693237305, "learning_rate": 2.3509933774834437e-05, "loss": 0.394, "step": 1420 }, { "epoch": 3.01, "grad_norm": 6.531501293182373, "learning_rate": 2.3675496688741723e-05, "loss": 0.5181, "step": 1430 }, { "epoch": 3.01, "grad_norm": 10.783075332641602, "learning_rate": 2.3841059602649006e-05, "loss": 0.5868, "step": 1440 }, { "epoch": 3.01, "grad_norm": 5.320065975189209, "learning_rate": 2.4006622516556292e-05, "loss": 0.3903, "step": 1450 }, { "epoch": 3.01, "grad_norm": 7.875553131103516, "learning_rate": 2.4172185430463578e-05, "loss": 0.4643, "step": 1460 }, { "epoch": 3.01, "grad_norm": 10.691715240478516, "learning_rate": 2.433774834437086e-05, "loss": 0.543, "step": 1470 }, { "epoch": 3.01, "grad_norm": 13.551742553710938, "learning_rate": 2.4503311258278147e-05, "loss": 0.4733, "step": 1480 }, { "epoch": 3.01, "grad_norm": 13.485529899597168, "learning_rate": 2.4668874172185433e-05, "loss": 0.4886, "step": 1490 }, { "epoch": 3.01, "grad_norm": 9.053728103637695, "learning_rate": 2.4834437086092715e-05, "loss": 0.4446, "step": 1500 }, { "epoch": 3.01, "grad_norm": 7.607123851776123, "learning_rate": 2.5e-05, "loss": 0.386, "step": 1510 }, { "epoch": 3.01, "grad_norm": 11.253607749938965, "learning_rate": 2.5165562913907287e-05, "loss": 0.4385, "step": 1520 }, { "epoch": 3.01, "grad_norm": 9.173261642456055, "learning_rate": 2.5331125827814573e-05, "loss": 0.5017, "step": 1530 }, { "epoch": 3.01, "grad_norm": 13.1712646484375, "learning_rate": 2.549668874172186e-05, "loss": 0.3919, "step": 1540 }, { "epoch": 3.01, "grad_norm": 6.91953706741333, "learning_rate": 2.566225165562914e-05, "loss": 0.384, "step": 1550 }, { "epoch": 3.01, "grad_norm": 10.297747611999512, "learning_rate": 2.5827814569536424e-05, "loss": 0.525, "step": 1560 }, { "epoch": 3.01, "grad_norm": 8.570240020751953, "learning_rate": 2.599337748344371e-05, "loss": 0.5393, "step": 1570 }, { "epoch": 3.01, "grad_norm": 6.507657527923584, "learning_rate": 2.6158940397350996e-05, "loss": 0.4595, "step": 1580 }, { "epoch": 3.01, "grad_norm": 6.807774066925049, "learning_rate": 2.632450331125828e-05, "loss": 0.5503, "step": 1590 }, { "epoch": 3.01, "grad_norm": 4.914486885070801, "learning_rate": 2.6490066225165565e-05, "loss": 0.527, "step": 1600 }, { "epoch": 3.01, "grad_norm": 8.518081665039062, "learning_rate": 2.6655629139072848e-05, "loss": 0.4838, "step": 1610 }, { "epoch": 3.01, "grad_norm": 7.288885116577148, "learning_rate": 2.6821192052980134e-05, "loss": 0.4403, "step": 1620 }, { "epoch": 3.01, "grad_norm": 4.160616874694824, "learning_rate": 2.6986754966887416e-05, "loss": 0.3552, "step": 1630 }, { "epoch": 3.01, "grad_norm": 13.016753196716309, "learning_rate": 2.7152317880794702e-05, "loss": 0.4487, "step": 1640 }, { "epoch": 3.01, "grad_norm": 20.624540328979492, "learning_rate": 2.7317880794701988e-05, "loss": 0.4864, "step": 1650 }, { "epoch": 3.01, "grad_norm": 11.90951919555664, "learning_rate": 2.7483443708609274e-05, "loss": 0.6142, "step": 1660 }, { "epoch": 3.01, "grad_norm": 10.347407341003418, "learning_rate": 2.764900662251656e-05, "loss": 0.5207, "step": 1670 }, { "epoch": 3.01, "grad_norm": 10.494477272033691, "learning_rate": 2.781456953642384e-05, "loss": 0.4483, "step": 1680 }, { "epoch": 3.01, "eval_accuracy": 0.5957207207207207, "eval_loss": 0.732318103313446, "eval_runtime": 42.3337, "eval_samples_per_second": 20.976, "eval_steps_per_second": 1.748, "step": 1680 }, { "epoch": 4.0, "grad_norm": 12.671928405761719, "learning_rate": 2.7980132450331125e-05, "loss": 0.444, "step": 1690 }, { "epoch": 4.0, "grad_norm": 6.501286506652832, "learning_rate": 2.814569536423841e-05, "loss": 0.4082, "step": 1700 }, { "epoch": 4.0, "grad_norm": 5.892317771911621, "learning_rate": 2.8311258278145697e-05, "loss": 0.3455, "step": 1710 }, { "epoch": 4.0, "grad_norm": 34.05948257446289, "learning_rate": 2.8476821192052983e-05, "loss": 0.39, "step": 1720 }, { "epoch": 4.0, "grad_norm": 6.998101234436035, "learning_rate": 2.864238410596027e-05, "loss": 0.3866, "step": 1730 }, { "epoch": 4.0, "grad_norm": 11.206581115722656, "learning_rate": 2.880794701986755e-05, "loss": 0.4302, "step": 1740 }, { "epoch": 4.0, "grad_norm": 19.16914176940918, "learning_rate": 2.8973509933774834e-05, "loss": 0.5607, "step": 1750 }, { "epoch": 4.0, "grad_norm": 14.35991096496582, "learning_rate": 2.913907284768212e-05, "loss": 0.5429, "step": 1760 }, { "epoch": 4.0, "grad_norm": 8.55944538116455, "learning_rate": 2.9304635761589406e-05, "loss": 0.3615, "step": 1770 }, { "epoch": 4.0, "grad_norm": 9.817693710327148, "learning_rate": 2.9470198675496692e-05, "loss": 0.383, "step": 1780 }, { "epoch": 4.0, "grad_norm": 3.8304190635681152, "learning_rate": 2.9635761589403975e-05, "loss": 0.5135, "step": 1790 }, { "epoch": 4.0, "grad_norm": 3.691169023513794, "learning_rate": 2.980132450331126e-05, "loss": 0.4897, "step": 1800 }, { "epoch": 4.0, "grad_norm": 9.651874542236328, "learning_rate": 2.9966887417218544e-05, "loss": 0.4799, "step": 1810 }, { "epoch": 4.0, "grad_norm": 6.509405136108398, "learning_rate": 3.0132450331125826e-05, "loss": 0.459, "step": 1820 }, { "epoch": 4.0, "grad_norm": 7.874476909637451, "learning_rate": 3.0298013245033112e-05, "loss": 0.4221, "step": 1830 }, { "epoch": 4.01, "grad_norm": 3.1665453910827637, "learning_rate": 3.0463576158940398e-05, "loss": 0.3891, "step": 1840 }, { "epoch": 4.01, "grad_norm": 7.245667934417725, "learning_rate": 3.062913907284769e-05, "loss": 0.4019, "step": 1850 }, { "epoch": 4.01, "grad_norm": 10.771611213684082, "learning_rate": 3.079470198675497e-05, "loss": 0.4505, "step": 1860 }, { "epoch": 4.01, "grad_norm": 8.632268905639648, "learning_rate": 3.096026490066225e-05, "loss": 0.4706, "step": 1870 }, { "epoch": 4.01, "grad_norm": 7.70095682144165, "learning_rate": 3.1125827814569535e-05, "loss": 0.3641, "step": 1880 }, { "epoch": 4.01, "grad_norm": 9.165621757507324, "learning_rate": 3.1291390728476825e-05, "loss": 0.4147, "step": 1890 }, { "epoch": 4.01, "grad_norm": 7.8730058670043945, "learning_rate": 3.145695364238411e-05, "loss": 0.4285, "step": 1900 }, { "epoch": 4.01, "grad_norm": 11.200624465942383, "learning_rate": 3.162251655629139e-05, "loss": 0.436, "step": 1910 }, { "epoch": 4.01, "grad_norm": 12.971415519714355, "learning_rate": 3.178807947019868e-05, "loss": 0.4843, "step": 1920 }, { "epoch": 4.01, "grad_norm": 5.6903076171875, "learning_rate": 3.195364238410596e-05, "loss": 0.4054, "step": 1930 }, { "epoch": 4.01, "grad_norm": 6.505043029785156, "learning_rate": 3.2119205298013244e-05, "loss": 0.4321, "step": 1940 }, { "epoch": 4.01, "grad_norm": 12.978421211242676, "learning_rate": 3.228476821192053e-05, "loss": 0.4137, "step": 1950 }, { "epoch": 4.01, "grad_norm": 6.857866287231445, "learning_rate": 3.2450331125827816e-05, "loss": 0.365, "step": 1960 }, { "epoch": 4.01, "grad_norm": 5.074346542358398, "learning_rate": 3.26158940397351e-05, "loss": 0.5182, "step": 1970 }, { "epoch": 4.01, "grad_norm": 7.229710102081299, "learning_rate": 3.278145695364239e-05, "loss": 0.4266, "step": 1980 }, { "epoch": 4.01, "grad_norm": 18.746814727783203, "learning_rate": 3.294701986754967e-05, "loss": 0.4848, "step": 1990 }, { "epoch": 4.01, "grad_norm": 6.558658599853516, "learning_rate": 3.311258278145696e-05, "loss": 0.439, "step": 2000 }, { "epoch": 4.01, "grad_norm": 15.033697128295898, "learning_rate": 3.3278145695364236e-05, "loss": 0.368, "step": 2010 }, { "epoch": 4.01, "grad_norm": 12.098657608032227, "learning_rate": 3.3443708609271526e-05, "loss": 0.3267, "step": 2020 }, { "epoch": 4.01, "grad_norm": 5.617885112762451, "learning_rate": 3.360927152317881e-05, "loss": 0.334, "step": 2030 }, { "epoch": 4.01, "grad_norm": 19.70214080810547, "learning_rate": 3.37748344370861e-05, "loss": 0.4613, "step": 2040 }, { "epoch": 4.01, "grad_norm": 6.696241855621338, "learning_rate": 3.394039735099338e-05, "loss": 0.3222, "step": 2050 }, { "epoch": 4.01, "grad_norm": 9.129453659057617, "learning_rate": 3.410596026490066e-05, "loss": 0.3567, "step": 2060 }, { "epoch": 4.01, "grad_norm": 9.57669448852539, "learning_rate": 3.4271523178807945e-05, "loss": 0.469, "step": 2070 }, { "epoch": 4.01, "grad_norm": 9.590263366699219, "learning_rate": 3.4437086092715235e-05, "loss": 0.3506, "step": 2080 }, { "epoch": 4.01, "grad_norm": 6.603126049041748, "learning_rate": 3.460264900662252e-05, "loss": 0.4482, "step": 2090 }, { "epoch": 4.01, "grad_norm": 31.09659767150879, "learning_rate": 3.47682119205298e-05, "loss": 0.4654, "step": 2100 }, { "epoch": 4.01, "eval_accuracy": 0.6486486486486487, "eval_loss": 0.7019912600517273, "eval_runtime": 42.8716, "eval_samples_per_second": 20.713, "eval_steps_per_second": 1.726, "step": 2100 }, { "epoch": 5.0, "grad_norm": 5.189599990844727, "learning_rate": 3.493377483443709e-05, "loss": 0.3112, "step": 2110 }, { "epoch": 5.0, "grad_norm": 14.994132995605469, "learning_rate": 3.509933774834437e-05, "loss": 0.4296, "step": 2120 }, { "epoch": 5.0, "grad_norm": 5.689457893371582, "learning_rate": 3.526490066225166e-05, "loss": 0.2943, "step": 2130 }, { "epoch": 5.0, "grad_norm": 5.469167232513428, "learning_rate": 3.543046357615894e-05, "loss": 0.324, "step": 2140 }, { "epoch": 5.0, "grad_norm": 9.89396858215332, "learning_rate": 3.5596026490066226e-05, "loss": 0.3203, "step": 2150 }, { "epoch": 5.0, "grad_norm": 15.105340957641602, "learning_rate": 3.576158940397351e-05, "loss": 0.4793, "step": 2160 }, { "epoch": 5.0, "grad_norm": 15.274898529052734, "learning_rate": 3.59271523178808e-05, "loss": 0.3326, "step": 2170 }, { "epoch": 5.0, "grad_norm": 6.226844310760498, "learning_rate": 3.609271523178808e-05, "loss": 0.4929, "step": 2180 }, { "epoch": 5.0, "grad_norm": 5.726044654846191, "learning_rate": 3.625827814569537e-05, "loss": 0.3783, "step": 2190 }, { "epoch": 5.0, "grad_norm": 9.240283966064453, "learning_rate": 3.6423841059602646e-05, "loss": 0.4878, "step": 2200 }, { "epoch": 5.0, "grad_norm": 7.1570587158203125, "learning_rate": 3.6589403973509936e-05, "loss": 0.3352, "step": 2210 }, { "epoch": 5.0, "grad_norm": 8.965234756469727, "learning_rate": 3.675496688741722e-05, "loss": 0.4149, "step": 2220 }, { "epoch": 5.0, "grad_norm": 4.667807579040527, "learning_rate": 3.692052980132451e-05, "loss": 0.2894, "step": 2230 }, { "epoch": 5.0, "grad_norm": 13.140071868896484, "learning_rate": 3.708609271523179e-05, "loss": 0.3437, "step": 2240 }, { "epoch": 5.0, "grad_norm": 6.775124549865723, "learning_rate": 3.725165562913907e-05, "loss": 0.2888, "step": 2250 }, { "epoch": 5.01, "grad_norm": 4.836766719818115, "learning_rate": 3.741721854304636e-05, "loss": 0.3891, "step": 2260 }, { "epoch": 5.01, "grad_norm": 13.269258499145508, "learning_rate": 3.7582781456953645e-05, "loss": 0.324, "step": 2270 }, { "epoch": 5.01, "grad_norm": 9.380875587463379, "learning_rate": 3.774834437086093e-05, "loss": 0.4917, "step": 2280 }, { "epoch": 5.01, "grad_norm": 6.058047294616699, "learning_rate": 3.791390728476821e-05, "loss": 0.4402, "step": 2290 }, { "epoch": 5.01, "grad_norm": 10.489774703979492, "learning_rate": 3.80794701986755e-05, "loss": 0.4674, "step": 2300 }, { "epoch": 5.01, "grad_norm": 6.385855674743652, "learning_rate": 3.824503311258278e-05, "loss": 0.3657, "step": 2310 }, { "epoch": 5.01, "grad_norm": 11.147357940673828, "learning_rate": 3.841059602649007e-05, "loss": 0.3963, "step": 2320 }, { "epoch": 5.01, "grad_norm": 8.97010612487793, "learning_rate": 3.8576158940397354e-05, "loss": 0.3673, "step": 2330 }, { "epoch": 5.01, "grad_norm": 13.971797943115234, "learning_rate": 3.8741721854304637e-05, "loss": 0.3182, "step": 2340 }, { "epoch": 5.01, "grad_norm": 12.762605667114258, "learning_rate": 3.890728476821192e-05, "loss": 0.3467, "step": 2350 }, { "epoch": 5.01, "grad_norm": 18.55424690246582, "learning_rate": 3.907284768211921e-05, "loss": 0.3063, "step": 2360 }, { "epoch": 5.01, "grad_norm": 20.252593994140625, "learning_rate": 3.923841059602649e-05, "loss": 0.334, "step": 2370 }, { "epoch": 5.01, "grad_norm": 9.72482967376709, "learning_rate": 3.940397350993378e-05, "loss": 0.4933, "step": 2380 }, { "epoch": 5.01, "grad_norm": 5.5373663902282715, "learning_rate": 3.956953642384106e-05, "loss": 0.4095, "step": 2390 }, { "epoch": 5.01, "grad_norm": 5.389358043670654, "learning_rate": 3.9735099337748346e-05, "loss": 0.4269, "step": 2400 }, { "epoch": 5.01, "grad_norm": 9.32028865814209, "learning_rate": 3.990066225165563e-05, "loss": 0.2847, "step": 2410 }, { "epoch": 5.01, "grad_norm": 10.698759078979492, "learning_rate": 4.006622516556292e-05, "loss": 0.3018, "step": 2420 }, { "epoch": 5.01, "grad_norm": 9.361750602722168, "learning_rate": 4.02317880794702e-05, "loss": 0.3396, "step": 2430 }, { "epoch": 5.01, "grad_norm": 7.311129093170166, "learning_rate": 4.039735099337749e-05, "loss": 0.4454, "step": 2440 }, { "epoch": 5.01, "grad_norm": 8.57088851928711, "learning_rate": 4.056291390728477e-05, "loss": 0.3589, "step": 2450 }, { "epoch": 5.01, "grad_norm": 5.872406482696533, "learning_rate": 4.0728476821192055e-05, "loss": 0.3914, "step": 2460 }, { "epoch": 5.01, "grad_norm": 3.2365334033966064, "learning_rate": 4.089403973509934e-05, "loss": 0.4405, "step": 2470 }, { "epoch": 5.01, "grad_norm": 11.49345874786377, "learning_rate": 4.105960264900662e-05, "loss": 0.3505, "step": 2480 }, { "epoch": 5.01, "grad_norm": 11.595232963562012, "learning_rate": 4.122516556291391e-05, "loss": 0.3939, "step": 2490 }, { "epoch": 5.01, "grad_norm": 10.280562400817871, "learning_rate": 4.139072847682119e-05, "loss": 0.433, "step": 2500 }, { "epoch": 5.01, "grad_norm": 8.561470031738281, "learning_rate": 4.155629139072848e-05, "loss": 0.3621, "step": 2510 }, { "epoch": 5.01, "grad_norm": 11.474127769470215, "learning_rate": 4.1721854304635764e-05, "loss": 0.3897, "step": 2520 }, { "epoch": 5.01, "eval_accuracy": 0.6497747747747747, "eval_loss": 0.7635564208030701, "eval_runtime": 44.4945, "eval_samples_per_second": 19.958, "eval_steps_per_second": 1.663, "step": 2520 }, { "epoch": 6.0, "grad_norm": 10.609094619750977, "learning_rate": 4.1887417218543047e-05, "loss": 0.3992, "step": 2530 }, { "epoch": 6.0, "grad_norm": 3.468280076980591, "learning_rate": 4.205298013245033e-05, "loss": 0.3226, "step": 2540 }, { "epoch": 6.0, "grad_norm": 12.370600700378418, "learning_rate": 4.221854304635762e-05, "loss": 0.3053, "step": 2550 }, { "epoch": 6.0, "grad_norm": 9.363292694091797, "learning_rate": 4.23841059602649e-05, "loss": 0.4458, "step": 2560 }, { "epoch": 6.0, "grad_norm": 7.07127571105957, "learning_rate": 4.254966887417219e-05, "loss": 0.317, "step": 2570 }, { "epoch": 6.0, "grad_norm": 10.81989574432373, "learning_rate": 4.271523178807947e-05, "loss": 0.4011, "step": 2580 }, { "epoch": 6.0, "grad_norm": 11.262990951538086, "learning_rate": 4.288079470198676e-05, "loss": 0.2711, "step": 2590 }, { "epoch": 6.0, "grad_norm": 24.4318790435791, "learning_rate": 4.304635761589404e-05, "loss": 0.362, "step": 2600 }, { "epoch": 6.0, "grad_norm": 15.196390151977539, "learning_rate": 4.321192052980133e-05, "loss": 0.3314, "step": 2610 }, { "epoch": 6.0, "grad_norm": 10.590712547302246, "learning_rate": 4.337748344370861e-05, "loss": 0.3502, "step": 2620 }, { "epoch": 6.0, "grad_norm": 9.361169815063477, "learning_rate": 4.35430463576159e-05, "loss": 0.3697, "step": 2630 }, { "epoch": 6.0, "grad_norm": 11.022934913635254, "learning_rate": 4.370860927152318e-05, "loss": 0.3217, "step": 2640 }, { "epoch": 6.0, "grad_norm": 9.60159683227539, "learning_rate": 4.3874172185430465e-05, "loss": 0.368, "step": 2650 }, { "epoch": 6.0, "grad_norm": 15.481955528259277, "learning_rate": 4.403973509933775e-05, "loss": 0.3612, "step": 2660 }, { "epoch": 6.0, "grad_norm": 15.360217094421387, "learning_rate": 4.420529801324503e-05, "loss": 0.2848, "step": 2670 }, { "epoch": 6.01, "grad_norm": 10.333962440490723, "learning_rate": 4.437086092715232e-05, "loss": 0.2794, "step": 2680 }, { "epoch": 6.01, "grad_norm": 16.07330894470215, "learning_rate": 4.45364238410596e-05, "loss": 0.4981, "step": 2690 }, { "epoch": 6.01, "grad_norm": 3.810781955718994, "learning_rate": 4.470198675496689e-05, "loss": 0.3905, "step": 2700 }, { "epoch": 6.01, "grad_norm": 7.821160793304443, "learning_rate": 4.4867549668874174e-05, "loss": 0.3131, "step": 2710 }, { "epoch": 6.01, "grad_norm": 14.027177810668945, "learning_rate": 4.5033112582781463e-05, "loss": 0.3232, "step": 2720 }, { "epoch": 6.01, "grad_norm": 17.835529327392578, "learning_rate": 4.519867549668874e-05, "loss": 0.4435, "step": 2730 }, { "epoch": 6.01, "grad_norm": 7.726228713989258, "learning_rate": 4.536423841059603e-05, "loss": 0.2627, "step": 2740 }, { "epoch": 6.01, "grad_norm": 13.054713249206543, "learning_rate": 4.552980132450331e-05, "loss": 0.3224, "step": 2750 }, { "epoch": 6.01, "grad_norm": 8.975753784179688, "learning_rate": 4.56953642384106e-05, "loss": 0.3713, "step": 2760 }, { "epoch": 6.01, "grad_norm": 15.151286125183105, "learning_rate": 4.586092715231788e-05, "loss": 0.494, "step": 2770 }, { "epoch": 6.01, "grad_norm": 6.243801116943359, "learning_rate": 4.602649006622517e-05, "loss": 0.3353, "step": 2780 }, { "epoch": 6.01, "grad_norm": 8.108068466186523, "learning_rate": 4.6192052980132455e-05, "loss": 0.4029, "step": 2790 }, { "epoch": 6.01, "grad_norm": 13.38729190826416, "learning_rate": 4.635761589403974e-05, "loss": 0.3122, "step": 2800 }, { "epoch": 6.01, "grad_norm": 9.62626838684082, "learning_rate": 4.652317880794702e-05, "loss": 0.3107, "step": 2810 }, { "epoch": 6.01, "grad_norm": 8.30498218536377, "learning_rate": 4.668874172185431e-05, "loss": 0.3055, "step": 2820 }, { "epoch": 6.01, "grad_norm": 8.951045989990234, "learning_rate": 4.685430463576159e-05, "loss": 0.3859, "step": 2830 }, { "epoch": 6.01, "grad_norm": 5.538322448730469, "learning_rate": 4.7019867549668875e-05, "loss": 0.3647, "step": 2840 }, { "epoch": 6.01, "grad_norm": 6.897611618041992, "learning_rate": 4.7185430463576164e-05, "loss": 0.3203, "step": 2850 }, { "epoch": 6.01, "grad_norm": 5.254495620727539, "learning_rate": 4.735099337748345e-05, "loss": 0.4124, "step": 2860 }, { "epoch": 6.01, "grad_norm": 8.259984016418457, "learning_rate": 4.751655629139073e-05, "loss": 0.4248, "step": 2870 }, { "epoch": 6.01, "grad_norm": 12.430957794189453, "learning_rate": 4.768211920529801e-05, "loss": 0.2464, "step": 2880 }, { "epoch": 6.01, "grad_norm": 9.76378345489502, "learning_rate": 4.78476821192053e-05, "loss": 0.4417, "step": 2890 }, { "epoch": 6.01, "grad_norm": 10.517653465270996, "learning_rate": 4.8013245033112584e-05, "loss": 0.4046, "step": 2900 }, { "epoch": 6.01, "grad_norm": 13.035500526428223, "learning_rate": 4.8178807947019873e-05, "loss": 0.3439, "step": 2910 }, { "epoch": 6.01, "grad_norm": 7.6101603507995605, "learning_rate": 4.8344370860927156e-05, "loss": 0.3431, "step": 2920 }, { "epoch": 6.01, "grad_norm": 14.4600830078125, "learning_rate": 4.850993377483444e-05, "loss": 0.3835, "step": 2930 }, { "epoch": 6.01, "grad_norm": 19.38602066040039, "learning_rate": 4.867549668874172e-05, "loss": 0.3386, "step": 2940 }, { "epoch": 6.01, "eval_accuracy": 0.661036036036036, "eval_loss": 0.8876528143882751, "eval_runtime": 43.379, "eval_samples_per_second": 20.471, "eval_steps_per_second": 1.706, "step": 2940 }, { "epoch": 7.0, "grad_norm": 11.12556266784668, "learning_rate": 4.884105960264901e-05, "loss": 0.3609, "step": 2950 }, { "epoch": 7.0, "grad_norm": 9.327975273132324, "learning_rate": 4.900662251655629e-05, "loss": 0.1874, "step": 2960 }, { "epoch": 7.0, "grad_norm": 2.0008111000061035, "learning_rate": 4.917218543046358e-05, "loss": 0.2868, "step": 2970 }, { "epoch": 7.0, "grad_norm": 9.07735824584961, "learning_rate": 4.9337748344370865e-05, "loss": 0.3089, "step": 2980 }, { "epoch": 7.0, "grad_norm": 3.0386064052581787, "learning_rate": 4.950331125827815e-05, "loss": 0.272, "step": 2990 }, { "epoch": 7.0, "grad_norm": 5.1751532554626465, "learning_rate": 4.966887417218543e-05, "loss": 0.3166, "step": 3000 }, { "epoch": 7.0, "grad_norm": 4.185882568359375, "learning_rate": 4.983443708609272e-05, "loss": 0.3783, "step": 3010 }, { "epoch": 7.0, "grad_norm": 8.280274391174316, "learning_rate": 5e-05, "loss": 0.3463, "step": 3020 }, { "epoch": 7.0, "grad_norm": 15.407179832458496, "learning_rate": 4.99815987045488e-05, "loss": 0.3766, "step": 3030 }, { "epoch": 7.0, "grad_norm": 12.469446182250977, "learning_rate": 4.99631974090976e-05, "loss": 0.4571, "step": 3040 }, { "epoch": 7.0, "grad_norm": 12.229589462280273, "learning_rate": 4.99447961136464e-05, "loss": 0.3185, "step": 3050 }, { "epoch": 7.0, "grad_norm": 10.47951889038086, "learning_rate": 4.99263948181952e-05, "loss": 0.4482, "step": 3060 }, { "epoch": 7.0, "grad_norm": 3.1830942630767822, "learning_rate": 4.9907993522744e-05, "loss": 0.2548, "step": 3070 }, { "epoch": 7.0, "grad_norm": 11.870902061462402, "learning_rate": 4.98895922272928e-05, "loss": 0.2811, "step": 3080 }, { "epoch": 7.0, "grad_norm": 12.750068664550781, "learning_rate": 4.98711909318416e-05, "loss": 0.2607, "step": 3090 }, { "epoch": 7.01, "grad_norm": 10.34190559387207, "learning_rate": 4.985278963639041e-05, "loss": 0.3298, "step": 3100 }, { "epoch": 7.01, "grad_norm": 6.752932548522949, "learning_rate": 4.9834388340939203e-05, "loss": 0.3155, "step": 3110 }, { "epoch": 7.01, "grad_norm": 11.394509315490723, "learning_rate": 4.9815987045488004e-05, "loss": 0.3237, "step": 3120 }, { "epoch": 7.01, "grad_norm": 24.788921356201172, "learning_rate": 4.9797585750036804e-05, "loss": 0.3304, "step": 3130 }, { "epoch": 7.01, "grad_norm": 8.760883331298828, "learning_rate": 4.9779184454585604e-05, "loss": 0.3403, "step": 3140 }, { "epoch": 7.01, "grad_norm": 11.140789031982422, "learning_rate": 4.9760783159134404e-05, "loss": 0.3818, "step": 3150 }, { "epoch": 7.01, "grad_norm": 3.8733274936676025, "learning_rate": 4.9742381863683204e-05, "loss": 0.3388, "step": 3160 }, { "epoch": 7.01, "grad_norm": 5.976226806640625, "learning_rate": 4.9723980568232004e-05, "loss": 0.3663, "step": 3170 }, { "epoch": 7.01, "grad_norm": 6.937375068664551, "learning_rate": 4.9705579272780804e-05, "loss": 0.3696, "step": 3180 }, { "epoch": 7.01, "grad_norm": 9.472258567810059, "learning_rate": 4.9687177977329604e-05, "loss": 0.3954, "step": 3190 }, { "epoch": 7.01, "grad_norm": 10.504880905151367, "learning_rate": 4.9668776681878404e-05, "loss": 0.3662, "step": 3200 }, { "epoch": 7.01, "grad_norm": 2.160273313522339, "learning_rate": 4.9650375386427205e-05, "loss": 0.306, "step": 3210 }, { "epoch": 7.01, "grad_norm": 10.063016891479492, "learning_rate": 4.9631974090976005e-05, "loss": 0.4123, "step": 3220 }, { "epoch": 7.01, "grad_norm": 8.959742546081543, "learning_rate": 4.961357279552481e-05, "loss": 0.4683, "step": 3230 }, { "epoch": 7.01, "grad_norm": 11.134523391723633, "learning_rate": 4.9595171500073605e-05, "loss": 0.3067, "step": 3240 }, { "epoch": 7.01, "grad_norm": 6.577158451080322, "learning_rate": 4.9576770204622405e-05, "loss": 0.3519, "step": 3250 }, { "epoch": 7.01, "grad_norm": 6.515697479248047, "learning_rate": 4.955836890917121e-05, "loss": 0.3656, "step": 3260 }, { "epoch": 7.01, "grad_norm": 8.94619083404541, "learning_rate": 4.9539967613720005e-05, "loss": 0.3805, "step": 3270 }, { "epoch": 7.01, "grad_norm": 6.963715076446533, "learning_rate": 4.9521566318268805e-05, "loss": 0.3076, "step": 3280 }, { "epoch": 7.01, "grad_norm": 13.25515365600586, "learning_rate": 4.950316502281761e-05, "loss": 0.4114, "step": 3290 }, { "epoch": 7.01, "grad_norm": 7.593176364898682, "learning_rate": 4.9484763727366406e-05, "loss": 0.3189, "step": 3300 }, { "epoch": 7.01, "grad_norm": 5.747020244598389, "learning_rate": 4.9466362431915206e-05, "loss": 0.2708, "step": 3310 }, { "epoch": 7.01, "grad_norm": 9.585000038146973, "learning_rate": 4.944796113646401e-05, "loss": 0.3637, "step": 3320 }, { "epoch": 7.01, "grad_norm": 13.932437896728516, "learning_rate": 4.9429559841012806e-05, "loss": 0.3064, "step": 3330 }, { "epoch": 7.01, "grad_norm": 12.580643653869629, "learning_rate": 4.9411158545561606e-05, "loss": 0.3294, "step": 3340 }, { "epoch": 7.01, "grad_norm": 7.36806583404541, "learning_rate": 4.939275725011041e-05, "loss": 0.2394, "step": 3350 }, { "epoch": 7.01, "grad_norm": 34.076446533203125, "learning_rate": 4.937435595465921e-05, "loss": 0.3601, "step": 3360 }, { "epoch": 7.01, "eval_accuracy": 0.6486486486486487, "eval_loss": 0.8790870308876038, "eval_runtime": 41.5269, "eval_samples_per_second": 21.384, "eval_steps_per_second": 1.782, "step": 3360 }, { "epoch": 8.0, "grad_norm": 14.441483497619629, "learning_rate": 4.9355954659208006e-05, "loss": 0.2921, "step": 3370 }, { "epoch": 8.0, "grad_norm": 9.203411102294922, "learning_rate": 4.933755336375681e-05, "loss": 0.2638, "step": 3380 }, { "epoch": 8.0, "grad_norm": 0.4057348668575287, "learning_rate": 4.9319152068305613e-05, "loss": 0.2193, "step": 3390 }, { "epoch": 8.0, "grad_norm": 13.04542350769043, "learning_rate": 4.930075077285441e-05, "loss": 0.446, "step": 3400 }, { "epoch": 8.0, "grad_norm": 9.762777328491211, "learning_rate": 4.9282349477403214e-05, "loss": 0.3489, "step": 3410 }, { "epoch": 8.0, "grad_norm": 8.025638580322266, "learning_rate": 4.9263948181952014e-05, "loss": 0.2376, "step": 3420 }, { "epoch": 8.0, "grad_norm": 5.531264781951904, "learning_rate": 4.924554688650081e-05, "loss": 0.3041, "step": 3430 }, { "epoch": 8.0, "grad_norm": 7.828507900238037, "learning_rate": 4.9227145591049614e-05, "loss": 0.3604, "step": 3440 }, { "epoch": 8.0, "grad_norm": 8.740315437316895, "learning_rate": 4.9208744295598414e-05, "loss": 0.3292, "step": 3450 }, { "epoch": 8.0, "grad_norm": 12.079410552978516, "learning_rate": 4.9190343000147214e-05, "loss": 0.204, "step": 3460 }, { "epoch": 8.0, "grad_norm": 0.8909263610839844, "learning_rate": 4.9171941704696014e-05, "loss": 0.2667, "step": 3470 }, { "epoch": 8.0, "grad_norm": 15.108928680419922, "learning_rate": 4.9153540409244814e-05, "loss": 0.3396, "step": 3480 }, { "epoch": 8.0, "grad_norm": 13.09268569946289, "learning_rate": 4.9135139113793615e-05, "loss": 0.2719, "step": 3490 }, { "epoch": 8.0, "grad_norm": 12.801443099975586, "learning_rate": 4.9116737818342415e-05, "loss": 0.2951, "step": 3500 }, { "epoch": 8.0, "grad_norm": 15.295145034790039, "learning_rate": 4.9098336522891215e-05, "loss": 0.2027, "step": 3510 }, { "epoch": 8.01, "grad_norm": 10.882307052612305, "learning_rate": 4.9079935227440015e-05, "loss": 0.459, "step": 3520 }, { "epoch": 8.01, "grad_norm": 12.717984199523926, "learning_rate": 4.9061533931988815e-05, "loss": 0.3292, "step": 3530 }, { "epoch": 8.01, "grad_norm": 1.0784927606582642, "learning_rate": 4.9043132636537615e-05, "loss": 0.2172, "step": 3540 }, { "epoch": 8.01, "grad_norm": 1.1845206022262573, "learning_rate": 4.9024731341086415e-05, "loss": 0.2034, "step": 3550 }, { "epoch": 8.01, "grad_norm": 15.95457935333252, "learning_rate": 4.9006330045635215e-05, "loss": 0.3306, "step": 3560 }, { "epoch": 8.01, "grad_norm": 11.536294937133789, "learning_rate": 4.8987928750184016e-05, "loss": 0.3716, "step": 3570 }, { "epoch": 8.01, "grad_norm": 10.536565780639648, "learning_rate": 4.8969527454732816e-05, "loss": 0.2894, "step": 3580 }, { "epoch": 8.01, "grad_norm": 15.887523651123047, "learning_rate": 4.8951126159281616e-05, "loss": 0.324, "step": 3590 }, { "epoch": 8.01, "grad_norm": 8.051523208618164, "learning_rate": 4.8932724863830416e-05, "loss": 0.2115, "step": 3600 }, { "epoch": 8.01, "grad_norm": 12.734569549560547, "learning_rate": 4.8914323568379216e-05, "loss": 0.261, "step": 3610 }, { "epoch": 8.01, "grad_norm": 21.070165634155273, "learning_rate": 4.8895922272928016e-05, "loss": 0.26, "step": 3620 }, { "epoch": 8.01, "grad_norm": 14.227327346801758, "learning_rate": 4.8877520977476816e-05, "loss": 0.37, "step": 3630 }, { "epoch": 8.01, "grad_norm": 0.5862560868263245, "learning_rate": 4.8859119682025616e-05, "loss": 0.2208, "step": 3640 }, { "epoch": 8.01, "grad_norm": 7.712880611419678, "learning_rate": 4.8840718386574416e-05, "loss": 0.3196, "step": 3650 }, { "epoch": 8.01, "grad_norm": 8.337738037109375, "learning_rate": 4.8822317091123217e-05, "loss": 0.3664, "step": 3660 }, { "epoch": 8.01, "grad_norm": 8.144108772277832, "learning_rate": 4.880391579567202e-05, "loss": 0.395, "step": 3670 }, { "epoch": 8.01, "grad_norm": 5.335818290710449, "learning_rate": 4.878551450022082e-05, "loss": 0.208, "step": 3680 }, { "epoch": 8.01, "grad_norm": 1.657988429069519, "learning_rate": 4.876711320476962e-05, "loss": 0.2321, "step": 3690 }, { "epoch": 8.01, "grad_norm": 11.252223014831543, "learning_rate": 4.874871190931842e-05, "loss": 0.3122, "step": 3700 }, { "epoch": 8.01, "grad_norm": 15.258499145507812, "learning_rate": 4.873031061386722e-05, "loss": 0.4487, "step": 3710 }, { "epoch": 8.01, "grad_norm": 13.362289428710938, "learning_rate": 4.8711909318416024e-05, "loss": 0.4194, "step": 3720 }, { "epoch": 8.01, "grad_norm": 10.974387168884277, "learning_rate": 4.869350802296482e-05, "loss": 0.3243, "step": 3730 }, { "epoch": 8.01, "grad_norm": 7.41167688369751, "learning_rate": 4.867510672751362e-05, "loss": 0.3533, "step": 3740 }, { "epoch": 8.01, "grad_norm": 5.849924087524414, "learning_rate": 4.8656705432062424e-05, "loss": 0.3544, "step": 3750 }, { "epoch": 8.01, "grad_norm": 11.34356689453125, "learning_rate": 4.863830413661122e-05, "loss": 0.3534, "step": 3760 }, { "epoch": 8.01, "grad_norm": 16.076383590698242, "learning_rate": 4.861990284116002e-05, "loss": 0.2324, "step": 3770 }, { "epoch": 8.01, "grad_norm": 7.834190845489502, "learning_rate": 4.8601501545708825e-05, "loss": 0.3401, "step": 3780 }, { "epoch": 8.01, "eval_accuracy": 0.6632882882882883, "eval_loss": 0.740277111530304, "eval_runtime": 42.3365, "eval_samples_per_second": 20.975, "eval_steps_per_second": 1.748, "step": 3780 }, { "epoch": 9.0, "grad_norm": 10.141741752624512, "learning_rate": 4.858310025025762e-05, "loss": 0.2218, "step": 3790 }, { "epoch": 9.0, "grad_norm": 14.957062721252441, "learning_rate": 4.856469895480642e-05, "loss": 0.2532, "step": 3800 }, { "epoch": 9.0, "grad_norm": 6.903181552886963, "learning_rate": 4.8546297659355225e-05, "loss": 0.2675, "step": 3810 }, { "epoch": 9.0, "grad_norm": 8.146132469177246, "learning_rate": 4.852789636390402e-05, "loss": 0.278, "step": 3820 }, { "epoch": 9.0, "grad_norm": 16.355253219604492, "learning_rate": 4.850949506845282e-05, "loss": 0.2215, "step": 3830 }, { "epoch": 9.0, "grad_norm": 4.766119003295898, "learning_rate": 4.8491093773001625e-05, "loss": 0.2977, "step": 3840 }, { "epoch": 9.0, "grad_norm": 17.544775009155273, "learning_rate": 4.8472692477550426e-05, "loss": 0.2754, "step": 3850 }, { "epoch": 9.0, "grad_norm": 3.9578094482421875, "learning_rate": 4.845429118209922e-05, "loss": 0.2533, "step": 3860 }, { "epoch": 9.0, "grad_norm": 5.8706231117248535, "learning_rate": 4.8435889886648026e-05, "loss": 0.1435, "step": 3870 }, { "epoch": 9.0, "grad_norm": 5.420246124267578, "learning_rate": 4.8417488591196826e-05, "loss": 0.2288, "step": 3880 }, { "epoch": 9.0, "grad_norm": 23.27185821533203, "learning_rate": 4.839908729574562e-05, "loss": 0.4029, "step": 3890 }, { "epoch": 9.0, "grad_norm": 9.662599563598633, "learning_rate": 4.8380686000294426e-05, "loss": 0.4065, "step": 3900 }, { "epoch": 9.0, "grad_norm": 7.802966117858887, "learning_rate": 4.8362284704843226e-05, "loss": 0.2416, "step": 3910 }, { "epoch": 9.0, "grad_norm": 10.698823928833008, "learning_rate": 4.834388340939202e-05, "loss": 0.2022, "step": 3920 }, { "epoch": 9.0, "grad_norm": 7.557618141174316, "learning_rate": 4.8325482113940826e-05, "loss": 0.3592, "step": 3930 }, { "epoch": 9.01, "grad_norm": 14.383673667907715, "learning_rate": 4.8307080818489627e-05, "loss": 0.1569, "step": 3940 }, { "epoch": 9.01, "grad_norm": 14.874628067016602, "learning_rate": 4.828867952303842e-05, "loss": 0.3816, "step": 3950 }, { "epoch": 9.01, "grad_norm": 8.525997161865234, "learning_rate": 4.827027822758722e-05, "loss": 0.2609, "step": 3960 }, { "epoch": 9.01, "grad_norm": 11.662755966186523, "learning_rate": 4.825187693213603e-05, "loss": 0.2357, "step": 3970 }, { "epoch": 9.01, "grad_norm": 13.592899322509766, "learning_rate": 4.823347563668483e-05, "loss": 0.2936, "step": 3980 }, { "epoch": 9.01, "grad_norm": 7.432744026184082, "learning_rate": 4.821507434123362e-05, "loss": 0.2854, "step": 3990 }, { "epoch": 9.01, "grad_norm": 12.174053192138672, "learning_rate": 4.819667304578243e-05, "loss": 0.3413, "step": 4000 }, { "epoch": 9.01, "grad_norm": 20.847490310668945, "learning_rate": 4.817827175033123e-05, "loss": 0.1914, "step": 4010 }, { "epoch": 9.01, "grad_norm": 19.58496856689453, "learning_rate": 4.815987045488002e-05, "loss": 0.2485, "step": 4020 }, { "epoch": 9.01, "grad_norm": 11.753061294555664, "learning_rate": 4.814146915942883e-05, "loss": 0.4117, "step": 4030 }, { "epoch": 9.01, "grad_norm": 7.240084648132324, "learning_rate": 4.812306786397763e-05, "loss": 0.2516, "step": 4040 }, { "epoch": 9.01, "grad_norm": 8.860321044921875, "learning_rate": 4.810466656852642e-05, "loss": 0.3589, "step": 4050 }, { "epoch": 9.01, "grad_norm": 9.940979957580566, "learning_rate": 4.808626527307523e-05, "loss": 0.2426, "step": 4060 }, { "epoch": 9.01, "grad_norm": 5.785098552703857, "learning_rate": 4.806786397762403e-05, "loss": 0.2779, "step": 4070 }, { "epoch": 9.01, "grad_norm": 13.360100746154785, "learning_rate": 4.804946268217283e-05, "loss": 0.18, "step": 4080 }, { "epoch": 9.01, "grad_norm": 13.76761531829834, "learning_rate": 4.803106138672163e-05, "loss": 0.2789, "step": 4090 }, { "epoch": 9.01, "grad_norm": 20.905683517456055, "learning_rate": 4.801266009127043e-05, "loss": 0.2769, "step": 4100 }, { "epoch": 9.01, "grad_norm": 11.389440536499023, "learning_rate": 4.799425879581923e-05, "loss": 0.1383, "step": 4110 }, { "epoch": 9.01, "grad_norm": 22.627151489257812, "learning_rate": 4.797585750036803e-05, "loss": 0.3232, "step": 4120 }, { "epoch": 9.01, "grad_norm": 1.108852505683899, "learning_rate": 4.795745620491683e-05, "loss": 0.2509, "step": 4130 }, { "epoch": 9.01, "grad_norm": 11.086101531982422, "learning_rate": 4.793905490946563e-05, "loss": 0.2939, "step": 4140 }, { "epoch": 9.01, "grad_norm": 21.736812591552734, "learning_rate": 4.792065361401443e-05, "loss": 0.3108, "step": 4150 }, { "epoch": 9.01, "grad_norm": 12.673864364624023, "learning_rate": 4.790225231856323e-05, "loss": 0.2923, "step": 4160 }, { "epoch": 9.01, "grad_norm": 23.429868698120117, "learning_rate": 4.788385102311203e-05, "loss": 0.3268, "step": 4170 }, { "epoch": 9.01, "grad_norm": 20.748750686645508, "learning_rate": 4.786544972766083e-05, "loss": 0.3612, "step": 4180 }, { "epoch": 9.01, "grad_norm": 9.398347854614258, "learning_rate": 4.784704843220963e-05, "loss": 0.3637, "step": 4190 }, { "epoch": 9.01, "grad_norm": 7.497073650360107, "learning_rate": 4.782864713675843e-05, "loss": 0.3113, "step": 4200 }, { "epoch": 9.01, "eval_accuracy": 0.6959459459459459, "eval_loss": 0.7315611839294434, "eval_runtime": 41.8788, "eval_samples_per_second": 21.204, "eval_steps_per_second": 1.767, "step": 4200 }, { "epoch": 10.0, "grad_norm": 2.589111566543579, "learning_rate": 4.781024584130723e-05, "loss": 0.3071, "step": 4210 }, { "epoch": 10.0, "grad_norm": 2.229191541671753, "learning_rate": 4.779184454585603e-05, "loss": 0.1844, "step": 4220 }, { "epoch": 10.0, "grad_norm": 5.470048904418945, "learning_rate": 4.777344325040483e-05, "loss": 0.3186, "step": 4230 }, { "epoch": 10.0, "grad_norm": 2.975252389907837, "learning_rate": 4.775504195495363e-05, "loss": 0.2843, "step": 4240 }, { "epoch": 10.0, "grad_norm": 27.58317756652832, "learning_rate": 4.773664065950243e-05, "loss": 0.2342, "step": 4250 }, { "epoch": 10.0, "grad_norm": 10.897406578063965, "learning_rate": 4.771823936405123e-05, "loss": 0.3133, "step": 4260 }, { "epoch": 10.0, "grad_norm": 18.671857833862305, "learning_rate": 4.769983806860003e-05, "loss": 0.247, "step": 4270 }, { "epoch": 10.0, "grad_norm": 3.1605987548828125, "learning_rate": 4.768143677314883e-05, "loss": 0.2691, "step": 4280 }, { "epoch": 10.0, "grad_norm": 15.147821426391602, "learning_rate": 4.766303547769763e-05, "loss": 0.1998, "step": 4290 }, { "epoch": 10.0, "grad_norm": 9.628533363342285, "learning_rate": 4.764463418224643e-05, "loss": 0.4104, "step": 4300 }, { "epoch": 10.0, "grad_norm": 3.3812103271484375, "learning_rate": 4.762623288679523e-05, "loss": 0.3426, "step": 4310 }, { "epoch": 10.0, "grad_norm": 13.017431259155273, "learning_rate": 4.760783159134403e-05, "loss": 0.2946, "step": 4320 }, { "epoch": 10.0, "grad_norm": 12.399896621704102, "learning_rate": 4.758943029589283e-05, "loss": 0.1878, "step": 4330 }, { "epoch": 10.0, "grad_norm": 21.525562286376953, "learning_rate": 4.757102900044164e-05, "loss": 0.2973, "step": 4340 }, { "epoch": 10.0, "grad_norm": 3.3385937213897705, "learning_rate": 4.755262770499043e-05, "loss": 0.1394, "step": 4350 }, { "epoch": 10.01, "grad_norm": 1.1139613389968872, "learning_rate": 4.753422640953923e-05, "loss": 0.22, "step": 4360 }, { "epoch": 10.01, "grad_norm": 11.690268516540527, "learning_rate": 4.751582511408804e-05, "loss": 0.2399, "step": 4370 }, { "epoch": 10.01, "grad_norm": 17.56683921813965, "learning_rate": 4.749742381863683e-05, "loss": 0.4817, "step": 4380 }, { "epoch": 10.01, "grad_norm": 6.885412693023682, "learning_rate": 4.747902252318563e-05, "loss": 0.3016, "step": 4390 }, { "epoch": 10.01, "grad_norm": 16.683002471923828, "learning_rate": 4.746062122773444e-05, "loss": 0.2179, "step": 4400 }, { "epoch": 10.01, "grad_norm": 21.049219131469727, "learning_rate": 4.744221993228323e-05, "loss": 0.2264, "step": 4410 }, { "epoch": 10.01, "grad_norm": 0.32361775636672974, "learning_rate": 4.742381863683203e-05, "loss": 0.2258, "step": 4420 }, { "epoch": 10.01, "grad_norm": 0.522939920425415, "learning_rate": 4.740541734138084e-05, "loss": 0.4103, "step": 4430 }, { "epoch": 10.01, "grad_norm": 5.143355846405029, "learning_rate": 4.738701604592963e-05, "loss": 0.2144, "step": 4440 }, { "epoch": 10.01, "grad_norm": 12.447662353515625, "learning_rate": 4.736861475047843e-05, "loss": 0.3608, "step": 4450 }, { "epoch": 10.01, "grad_norm": 10.097562789916992, "learning_rate": 4.735021345502724e-05, "loss": 0.2905, "step": 4460 }, { "epoch": 10.01, "grad_norm": 7.1078362464904785, "learning_rate": 4.733181215957604e-05, "loss": 0.2133, "step": 4470 }, { "epoch": 10.01, "grad_norm": 11.503559112548828, "learning_rate": 4.731341086412483e-05, "loss": 0.1682, "step": 4480 }, { "epoch": 10.01, "grad_norm": 17.54301643371582, "learning_rate": 4.729500956867364e-05, "loss": 0.2728, "step": 4490 }, { "epoch": 10.01, "grad_norm": 0.5605434775352478, "learning_rate": 4.727660827322244e-05, "loss": 0.2068, "step": 4500 }, { "epoch": 10.01, "grad_norm": 29.69890022277832, "learning_rate": 4.725820697777123e-05, "loss": 0.2811, "step": 4510 }, { "epoch": 10.01, "grad_norm": 2.86535906791687, "learning_rate": 4.723980568232004e-05, "loss": 0.1564, "step": 4520 }, { "epoch": 10.01, "grad_norm": 16.985363006591797, "learning_rate": 4.722140438686884e-05, "loss": 0.3927, "step": 4530 }, { "epoch": 10.01, "grad_norm": 11.91352367401123, "learning_rate": 4.7203003091417633e-05, "loss": 0.232, "step": 4540 }, { "epoch": 10.01, "grad_norm": 4.534491539001465, "learning_rate": 4.718460179596644e-05, "loss": 0.2632, "step": 4550 }, { "epoch": 10.01, "grad_norm": 5.70693302154541, "learning_rate": 4.716620050051524e-05, "loss": 0.309, "step": 4560 }, { "epoch": 10.01, "grad_norm": 9.484837532043457, "learning_rate": 4.7147799205064034e-05, "loss": 0.2051, "step": 4570 }, { "epoch": 10.01, "grad_norm": 2.0406031608581543, "learning_rate": 4.712939790961284e-05, "loss": 0.2524, "step": 4580 }, { "epoch": 10.01, "grad_norm": 11.688041687011719, "learning_rate": 4.711099661416164e-05, "loss": 0.408, "step": 4590 }, { "epoch": 10.01, "grad_norm": 6.8777899742126465, "learning_rate": 4.709259531871044e-05, "loss": 0.2687, "step": 4600 }, { "epoch": 10.01, "grad_norm": 13.090034484863281, "learning_rate": 4.707419402325924e-05, "loss": 0.1895, "step": 4610 }, { "epoch": 10.01, "grad_norm": 1.0532723665237427, "learning_rate": 4.705579272780804e-05, "loss": 0.2096, "step": 4620 }, { "epoch": 10.01, "eval_accuracy": 0.6981981981981982, "eval_loss": 0.9519428610801697, "eval_runtime": 42.0352, "eval_samples_per_second": 21.125, "eval_steps_per_second": 1.76, "step": 4620 }, { "epoch": 11.0, "grad_norm": 10.391329765319824, "learning_rate": 4.703739143235684e-05, "loss": 0.2347, "step": 4630 }, { "epoch": 11.0, "grad_norm": 10.35151481628418, "learning_rate": 4.701899013690564e-05, "loss": 0.3039, "step": 4640 }, { "epoch": 11.0, "grad_norm": 15.39012622833252, "learning_rate": 4.700058884145444e-05, "loss": 0.1841, "step": 4650 }, { "epoch": 11.0, "grad_norm": 1.5340383052825928, "learning_rate": 4.698218754600324e-05, "loss": 0.1564, "step": 4660 }, { "epoch": 11.0, "grad_norm": 8.856043815612793, "learning_rate": 4.696378625055204e-05, "loss": 0.1238, "step": 4670 }, { "epoch": 11.0, "grad_norm": 6.075847148895264, "learning_rate": 4.694538495510084e-05, "loss": 0.15, "step": 4680 }, { "epoch": 11.0, "grad_norm": 0.534511923789978, "learning_rate": 4.692698365964964e-05, "loss": 0.2028, "step": 4690 }, { "epoch": 11.0, "grad_norm": 19.36056137084961, "learning_rate": 4.690858236419844e-05, "loss": 0.2122, "step": 4700 }, { "epoch": 11.0, "grad_norm": 0.39027050137519836, "learning_rate": 4.689018106874724e-05, "loss": 0.2455, "step": 4710 }, { "epoch": 11.0, "grad_norm": 16.614545822143555, "learning_rate": 4.687177977329604e-05, "loss": 0.291, "step": 4720 }, { "epoch": 11.0, "grad_norm": 7.031955242156982, "learning_rate": 4.685337847784484e-05, "loss": 0.1445, "step": 4730 }, { "epoch": 11.0, "grad_norm": 13.988171577453613, "learning_rate": 4.683497718239364e-05, "loss": 0.1988, "step": 4740 }, { "epoch": 11.0, "grad_norm": 11.631166458129883, "learning_rate": 4.681657588694244e-05, "loss": 0.2773, "step": 4750 }, { "epoch": 11.0, "grad_norm": 7.512355327606201, "learning_rate": 4.679817459149124e-05, "loss": 0.1132, "step": 4760 }, { "epoch": 11.0, "grad_norm": 0.6635879874229431, "learning_rate": 4.677977329604004e-05, "loss": 0.1799, "step": 4770 }, { "epoch": 11.01, "grad_norm": 59.61662292480469, "learning_rate": 4.676137200058884e-05, "loss": 0.2375, "step": 4780 }, { "epoch": 11.01, "grad_norm": 9.080729484558105, "learning_rate": 4.674297070513764e-05, "loss": 0.1571, "step": 4790 }, { "epoch": 11.01, "grad_norm": 11.073897361755371, "learning_rate": 4.672456940968644e-05, "loss": 0.2831, "step": 4800 }, { "epoch": 11.01, "grad_norm": 0.20718024671077728, "learning_rate": 4.6706168114235243e-05, "loss": 0.247, "step": 4810 }, { "epoch": 11.01, "grad_norm": 6.692288875579834, "learning_rate": 4.6687766818784044e-05, "loss": 0.1577, "step": 4820 }, { "epoch": 11.01, "grad_norm": 21.0302791595459, "learning_rate": 4.666936552333285e-05, "loss": 0.2697, "step": 4830 }, { "epoch": 11.01, "grad_norm": 18.954795837402344, "learning_rate": 4.6650964227881644e-05, "loss": 0.2863, "step": 4840 }, { "epoch": 11.01, "grad_norm": 0.38008683919906616, "learning_rate": 4.6632562932430444e-05, "loss": 0.2369, "step": 4850 }, { "epoch": 11.01, "grad_norm": 7.475613594055176, "learning_rate": 4.661416163697925e-05, "loss": 0.3686, "step": 4860 }, { "epoch": 11.01, "grad_norm": 8.689560890197754, "learning_rate": 4.6595760341528044e-05, "loss": 0.2117, "step": 4870 }, { "epoch": 11.01, "grad_norm": 9.824551582336426, "learning_rate": 4.6577359046076844e-05, "loss": 0.2091, "step": 4880 }, { "epoch": 11.01, "grad_norm": 0.5613351464271545, "learning_rate": 4.655895775062565e-05, "loss": 0.1785, "step": 4890 }, { "epoch": 11.01, "grad_norm": 15.86489486694336, "learning_rate": 4.6540556455174444e-05, "loss": 0.2719, "step": 4900 }, { "epoch": 11.01, "grad_norm": 13.821952819824219, "learning_rate": 4.6522155159723245e-05, "loss": 0.1034, "step": 4910 }, { "epoch": 11.01, "grad_norm": 22.148283004760742, "learning_rate": 4.650375386427205e-05, "loss": 0.3639, "step": 4920 }, { "epoch": 11.01, "grad_norm": 10.938385963439941, "learning_rate": 4.6485352568820845e-05, "loss": 0.3123, "step": 4930 }, { "epoch": 11.01, "grad_norm": 1.883072853088379, "learning_rate": 4.6466951273369645e-05, "loss": 0.3987, "step": 4940 }, { "epoch": 11.01, "grad_norm": 1.4499850273132324, "learning_rate": 4.644854997791845e-05, "loss": 0.1556, "step": 4950 }, { "epoch": 11.01, "grad_norm": 11.371675491333008, "learning_rate": 4.643014868246725e-05, "loss": 0.2313, "step": 4960 }, { "epoch": 11.01, "grad_norm": 9.29699993133545, "learning_rate": 4.6411747387016045e-05, "loss": 0.1583, "step": 4970 }, { "epoch": 11.01, "grad_norm": 18.879003524780273, "learning_rate": 4.639334609156485e-05, "loss": 0.3131, "step": 4980 }, { "epoch": 11.01, "grad_norm": 3.0703296661376953, "learning_rate": 4.637494479611365e-05, "loss": 0.3215, "step": 4990 }, { "epoch": 11.01, "grad_norm": 9.388489723205566, "learning_rate": 4.6356543500662446e-05, "loss": 0.2506, "step": 5000 }, { "epoch": 11.01, "grad_norm": 15.042057991027832, "learning_rate": 4.6338142205211246e-05, "loss": 0.3032, "step": 5010 }, { "epoch": 11.01, "grad_norm": 14.531025886535645, "learning_rate": 4.631974090976005e-05, "loss": 0.2766, "step": 5020 }, { "epoch": 11.01, "grad_norm": 13.1823148727417, "learning_rate": 4.6301339614308846e-05, "loss": 0.1502, "step": 5030 }, { "epoch": 11.01, "grad_norm": 12.649144172668457, "learning_rate": 4.6282938318857646e-05, "loss": 0.1537, "step": 5040 }, { "epoch": 11.01, "eval_accuracy": 0.7015765765765766, "eval_loss": 0.9116391539573669, "eval_runtime": 41.456, "eval_samples_per_second": 21.42, "eval_steps_per_second": 1.785, "step": 5040 }, { "epoch": 12.0, "grad_norm": 4.675528526306152, "learning_rate": 4.626453702340645e-05, "loss": 0.1384, "step": 5050 }, { "epoch": 12.0, "grad_norm": 15.369380950927734, "learning_rate": 4.6246135727955246e-05, "loss": 0.1938, "step": 5060 }, { "epoch": 12.0, "grad_norm": 9.795681953430176, "learning_rate": 4.6227734432504046e-05, "loss": 0.1583, "step": 5070 }, { "epoch": 12.0, "grad_norm": 16.371030807495117, "learning_rate": 4.620933313705285e-05, "loss": 0.2502, "step": 5080 }, { "epoch": 12.0, "grad_norm": 17.098554611206055, "learning_rate": 4.6190931841601653e-05, "loss": 0.1198, "step": 5090 }, { "epoch": 12.0, "grad_norm": 1.2527225017547607, "learning_rate": 4.617253054615045e-05, "loss": 0.0858, "step": 5100 }, { "epoch": 12.0, "grad_norm": 5.850952625274658, "learning_rate": 4.6154129250699254e-05, "loss": 0.2589, "step": 5110 }, { "epoch": 12.0, "grad_norm": 3.29551100730896, "learning_rate": 4.6135727955248054e-05, "loss": 0.1703, "step": 5120 }, { "epoch": 12.0, "grad_norm": 13.772591590881348, "learning_rate": 4.611732665979685e-05, "loss": 0.3824, "step": 5130 }, { "epoch": 12.0, "grad_norm": 14.904064178466797, "learning_rate": 4.6098925364345654e-05, "loss": 0.1953, "step": 5140 }, { "epoch": 12.0, "grad_norm": 21.28123664855957, "learning_rate": 4.6080524068894454e-05, "loss": 0.2385, "step": 5150 }, { "epoch": 12.0, "grad_norm": 0.9189083576202393, "learning_rate": 4.606212277344325e-05, "loss": 0.1396, "step": 5160 }, { "epoch": 12.0, "grad_norm": 12.280217170715332, "learning_rate": 4.6043721477992054e-05, "loss": 0.1751, "step": 5170 }, { "epoch": 12.0, "grad_norm": 7.960886478424072, "learning_rate": 4.6025320182540854e-05, "loss": 0.0838, "step": 5180 }, { "epoch": 12.0, "grad_norm": 3.011474370956421, "learning_rate": 4.600691888708965e-05, "loss": 0.1764, "step": 5190 }, { "epoch": 12.01, "grad_norm": 19.604820251464844, "learning_rate": 4.5988517591638455e-05, "loss": 0.2461, "step": 5200 }, { "epoch": 12.01, "grad_norm": 12.652880668640137, "learning_rate": 4.5970116296187255e-05, "loss": 0.2306, "step": 5210 }, { "epoch": 12.01, "grad_norm": 18.40096092224121, "learning_rate": 4.5951715000736055e-05, "loss": 0.1667, "step": 5220 }, { "epoch": 12.01, "grad_norm": 13.00020980834961, "learning_rate": 4.5933313705284855e-05, "loss": 0.2536, "step": 5230 }, { "epoch": 12.01, "grad_norm": 14.974309921264648, "learning_rate": 4.5914912409833655e-05, "loss": 0.2561, "step": 5240 }, { "epoch": 12.01, "grad_norm": 11.146673202514648, "learning_rate": 4.5896511114382455e-05, "loss": 0.2038, "step": 5250 }, { "epoch": 12.01, "grad_norm": 8.248503684997559, "learning_rate": 4.5878109818931255e-05, "loss": 0.2505, "step": 5260 }, { "epoch": 12.01, "grad_norm": 11.573139190673828, "learning_rate": 4.5859708523480055e-05, "loss": 0.1565, "step": 5270 }, { "epoch": 12.01, "grad_norm": 19.135541915893555, "learning_rate": 4.5841307228028856e-05, "loss": 0.2578, "step": 5280 }, { "epoch": 12.01, "grad_norm": 12.795113563537598, "learning_rate": 4.5822905932577656e-05, "loss": 0.2591, "step": 5290 }, { "epoch": 12.01, "grad_norm": 16.305692672729492, "learning_rate": 4.5804504637126456e-05, "loss": 0.163, "step": 5300 }, { "epoch": 12.01, "grad_norm": 6.531643390655518, "learning_rate": 4.5786103341675256e-05, "loss": 0.2605, "step": 5310 }, { "epoch": 12.01, "grad_norm": 13.617148399353027, "learning_rate": 4.5767702046224056e-05, "loss": 0.2031, "step": 5320 }, { "epoch": 12.01, "grad_norm": 5.859495162963867, "learning_rate": 4.5749300750772856e-05, "loss": 0.1609, "step": 5330 }, { "epoch": 12.01, "grad_norm": 0.675979733467102, "learning_rate": 4.5730899455321656e-05, "loss": 0.2702, "step": 5340 }, { "epoch": 12.01, "grad_norm": 16.337562561035156, "learning_rate": 4.5712498159870456e-05, "loss": 0.1415, "step": 5350 }, { "epoch": 12.01, "grad_norm": 4.218993186950684, "learning_rate": 4.5694096864419257e-05, "loss": 0.2667, "step": 5360 }, { "epoch": 12.01, "grad_norm": 20.66876983642578, "learning_rate": 4.567569556896806e-05, "loss": 0.2478, "step": 5370 }, { "epoch": 12.01, "grad_norm": 8.135565757751465, "learning_rate": 4.565729427351686e-05, "loss": 0.2205, "step": 5380 }, { "epoch": 12.01, "grad_norm": 9.30663776397705, "learning_rate": 4.563889297806566e-05, "loss": 0.328, "step": 5390 }, { "epoch": 12.01, "grad_norm": 16.911775588989258, "learning_rate": 4.562049168261446e-05, "loss": 0.2238, "step": 5400 }, { "epoch": 12.01, "grad_norm": 4.587623119354248, "learning_rate": 4.560209038716326e-05, "loss": 0.0763, "step": 5410 }, { "epoch": 12.01, "grad_norm": 17.55312728881836, "learning_rate": 4.558368909171206e-05, "loss": 0.2904, "step": 5420 }, { "epoch": 12.01, "grad_norm": 11.453413009643555, "learning_rate": 4.556528779626086e-05, "loss": 0.3298, "step": 5430 }, { "epoch": 12.01, "grad_norm": 3.5977048873901367, "learning_rate": 4.554688650080966e-05, "loss": 0.331, "step": 5440 }, { "epoch": 12.01, "grad_norm": 8.736082077026367, "learning_rate": 4.5528485205358464e-05, "loss": 0.1551, "step": 5450 }, { "epoch": 12.01, "grad_norm": 3.3765244483947754, "learning_rate": 4.551008390990726e-05, "loss": 0.1113, "step": 5460 }, { "epoch": 12.01, "eval_accuracy": 0.6970720720720721, "eval_loss": 1.00467848777771, "eval_runtime": 41.2332, "eval_samples_per_second": 21.536, "eval_steps_per_second": 1.795, "step": 5460 }, { "epoch": 13.0, "grad_norm": 23.630643844604492, "learning_rate": 4.549168261445606e-05, "loss": 0.083, "step": 5470 }, { "epoch": 13.0, "grad_norm": 2.3666303157806396, "learning_rate": 4.5473281319004865e-05, "loss": 0.1436, "step": 5480 }, { "epoch": 13.0, "grad_norm": 22.300064086914062, "learning_rate": 4.545488002355366e-05, "loss": 0.1631, "step": 5490 }, { "epoch": 13.0, "grad_norm": 14.8043212890625, "learning_rate": 4.543647872810246e-05, "loss": 0.1613, "step": 5500 }, { "epoch": 13.0, "grad_norm": 0.10374309122562408, "learning_rate": 4.5418077432651265e-05, "loss": 0.1761, "step": 5510 }, { "epoch": 13.0, "grad_norm": 8.43531322479248, "learning_rate": 4.539967613720006e-05, "loss": 0.1712, "step": 5520 }, { "epoch": 13.0, "grad_norm": 8.424771308898926, "learning_rate": 4.538127484174886e-05, "loss": 0.2752, "step": 5530 }, { "epoch": 13.0, "grad_norm": 12.37260913848877, "learning_rate": 4.5362873546297665e-05, "loss": 0.2324, "step": 5540 }, { "epoch": 13.0, "grad_norm": 9.709940910339355, "learning_rate": 4.534447225084646e-05, "loss": 0.2579, "step": 5550 }, { "epoch": 13.0, "grad_norm": 20.865863800048828, "learning_rate": 4.532607095539526e-05, "loss": 0.3222, "step": 5560 }, { "epoch": 13.0, "grad_norm": 12.145430564880371, "learning_rate": 4.5307669659944066e-05, "loss": 0.1261, "step": 5570 }, { "epoch": 13.0, "grad_norm": 7.941616058349609, "learning_rate": 4.5289268364492866e-05, "loss": 0.1429, "step": 5580 }, { "epoch": 13.0, "grad_norm": 4.826683521270752, "learning_rate": 4.527086706904166e-05, "loss": 0.2266, "step": 5590 }, { "epoch": 13.0, "grad_norm": 19.701143264770508, "learning_rate": 4.5252465773590466e-05, "loss": 0.2436, "step": 5600 }, { "epoch": 13.0, "grad_norm": 5.625741958618164, "learning_rate": 4.5234064478139266e-05, "loss": 0.1171, "step": 5610 }, { "epoch": 13.01, "grad_norm": 2.8478872776031494, "learning_rate": 4.521566318268806e-05, "loss": 0.3495, "step": 5620 }, { "epoch": 13.01, "grad_norm": 7.616860866546631, "learning_rate": 4.5197261887236866e-05, "loss": 0.177, "step": 5630 }, { "epoch": 13.01, "grad_norm": 35.53705978393555, "learning_rate": 4.5178860591785667e-05, "loss": 0.1834, "step": 5640 }, { "epoch": 13.01, "grad_norm": 0.2759915888309479, "learning_rate": 4.516045929633446e-05, "loss": 0.0909, "step": 5650 }, { "epoch": 13.01, "grad_norm": 5.422901630401611, "learning_rate": 4.514205800088327e-05, "loss": 0.0757, "step": 5660 }, { "epoch": 13.01, "grad_norm": 1.9962157011032104, "learning_rate": 4.512365670543207e-05, "loss": 0.0337, "step": 5670 }, { "epoch": 13.01, "grad_norm": 1.336719274520874, "learning_rate": 4.510525540998086e-05, "loss": 0.1988, "step": 5680 }, { "epoch": 13.01, "grad_norm": 19.26902961730957, "learning_rate": 4.508685411452967e-05, "loss": 0.2792, "step": 5690 }, { "epoch": 13.01, "grad_norm": 0.8596954941749573, "learning_rate": 4.506845281907847e-05, "loss": 0.192, "step": 5700 }, { "epoch": 13.01, "grad_norm": 2.8119475841522217, "learning_rate": 4.505005152362727e-05, "loss": 0.0473, "step": 5710 }, { "epoch": 13.01, "grad_norm": 1.8721100091934204, "learning_rate": 4.503165022817607e-05, "loss": 0.1026, "step": 5720 }, { "epoch": 13.01, "grad_norm": 40.60285568237305, "learning_rate": 4.501324893272487e-05, "loss": 0.2544, "step": 5730 }, { "epoch": 13.01, "grad_norm": 14.672572135925293, "learning_rate": 4.499484763727367e-05, "loss": 0.192, "step": 5740 }, { "epoch": 13.01, "grad_norm": 10.472712516784668, "learning_rate": 4.497644634182247e-05, "loss": 0.4008, "step": 5750 }, { "epoch": 13.01, "grad_norm": 4.290433406829834, "learning_rate": 4.495804504637127e-05, "loss": 0.3308, "step": 5760 }, { "epoch": 13.01, "grad_norm": 9.656917572021484, "learning_rate": 4.493964375092007e-05, "loss": 0.2224, "step": 5770 }, { "epoch": 13.01, "grad_norm": 8.347408294677734, "learning_rate": 4.492124245546887e-05, "loss": 0.1097, "step": 5780 }, { "epoch": 13.01, "grad_norm": 7.9891743659973145, "learning_rate": 4.490284116001767e-05, "loss": 0.1013, "step": 5790 }, { "epoch": 13.01, "grad_norm": 29.41997528076172, "learning_rate": 4.488443986456647e-05, "loss": 0.1906, "step": 5800 }, { "epoch": 13.01, "grad_norm": 0.2109886109828949, "learning_rate": 4.486603856911527e-05, "loss": 0.2282, "step": 5810 }, { "epoch": 13.01, "grad_norm": 4.410977840423584, "learning_rate": 4.484763727366407e-05, "loss": 0.2242, "step": 5820 }, { "epoch": 13.01, "grad_norm": 10.800416946411133, "learning_rate": 4.482923597821287e-05, "loss": 0.1689, "step": 5830 }, { "epoch": 13.01, "grad_norm": 15.845876693725586, "learning_rate": 4.481083468276167e-05, "loss": 0.2048, "step": 5840 }, { "epoch": 13.01, "grad_norm": 5.1937785148620605, "learning_rate": 4.479243338731047e-05, "loss": 0.1652, "step": 5850 }, { "epoch": 13.01, "grad_norm": 0.8186588883399963, "learning_rate": 4.477403209185927e-05, "loss": 0.1132, "step": 5860 }, { "epoch": 13.01, "grad_norm": 34.49995803833008, "learning_rate": 4.475563079640807e-05, "loss": 0.1222, "step": 5870 }, { "epoch": 13.01, "grad_norm": 4.207097053527832, "learning_rate": 4.473722950095687e-05, "loss": 0.3247, "step": 5880 }, { "epoch": 13.01, "eval_accuracy": 0.6846846846846847, "eval_loss": 1.2167141437530518, "eval_runtime": 40.5277, "eval_samples_per_second": 21.911, "eval_steps_per_second": 1.826, "step": 5880 }, { "epoch": 14.0, "grad_norm": 0.39019277691841125, "learning_rate": 4.471882820550567e-05, "loss": 0.1095, "step": 5890 }, { "epoch": 14.0, "grad_norm": 14.054343223571777, "learning_rate": 4.470042691005447e-05, "loss": 0.1827, "step": 5900 }, { "epoch": 14.0, "grad_norm": 18.15593147277832, "learning_rate": 4.468202561460327e-05, "loss": 0.1991, "step": 5910 }, { "epoch": 14.0, "grad_norm": 1.0667266845703125, "learning_rate": 4.466362431915207e-05, "loss": 0.0281, "step": 5920 }, { "epoch": 14.0, "grad_norm": 0.5443500876426697, "learning_rate": 4.464522302370087e-05, "loss": 0.2, "step": 5930 }, { "epoch": 14.0, "grad_norm": 1.438390851020813, "learning_rate": 4.462682172824968e-05, "loss": 0.1323, "step": 5940 }, { "epoch": 14.0, "grad_norm": 0.20778608322143555, "learning_rate": 4.460842043279847e-05, "loss": 0.192, "step": 5950 }, { "epoch": 14.0, "grad_norm": 21.353769302368164, "learning_rate": 4.459001913734727e-05, "loss": 0.1008, "step": 5960 }, { "epoch": 14.0, "grad_norm": 10.89686393737793, "learning_rate": 4.457161784189608e-05, "loss": 0.156, "step": 5970 }, { "epoch": 14.0, "grad_norm": 6.522188663482666, "learning_rate": 4.455321654644487e-05, "loss": 0.2051, "step": 5980 }, { "epoch": 14.0, "grad_norm": 17.12128257751465, "learning_rate": 4.453481525099367e-05, "loss": 0.2968, "step": 5990 }, { "epoch": 14.0, "grad_norm": 10.827217102050781, "learning_rate": 4.451641395554248e-05, "loss": 0.0437, "step": 6000 }, { "epoch": 14.0, "grad_norm": 29.041154861450195, "learning_rate": 4.449801266009127e-05, "loss": 0.1536, "step": 6010 }, { "epoch": 14.0, "grad_norm": 5.939600944519043, "learning_rate": 4.447961136464007e-05, "loss": 0.147, "step": 6020 }, { "epoch": 14.0, "grad_norm": 24.224306106567383, "learning_rate": 4.446121006918888e-05, "loss": 0.2939, "step": 6030 }, { "epoch": 14.01, "grad_norm": 7.354557991027832, "learning_rate": 4.444280877373767e-05, "loss": 0.132, "step": 6040 }, { "epoch": 14.01, "grad_norm": 14.240554809570312, "learning_rate": 4.442440747828647e-05, "loss": 0.0965, "step": 6050 }, { "epoch": 14.01, "grad_norm": 10.489018440246582, "learning_rate": 4.440600618283527e-05, "loss": 0.2011, "step": 6060 }, { "epoch": 14.01, "grad_norm": 17.361114501953125, "learning_rate": 4.438760488738408e-05, "loss": 0.2374, "step": 6070 }, { "epoch": 14.01, "grad_norm": 2.982257604598999, "learning_rate": 4.436920359193287e-05, "loss": 0.2142, "step": 6080 }, { "epoch": 14.01, "grad_norm": 0.11127086728811264, "learning_rate": 4.435080229648167e-05, "loss": 0.1552, "step": 6090 }, { "epoch": 14.01, "grad_norm": 2.568547487258911, "learning_rate": 4.433240100103048e-05, "loss": 0.2232, "step": 6100 }, { "epoch": 14.01, "grad_norm": 0.16568288207054138, "learning_rate": 4.431399970557927e-05, "loss": 0.2059, "step": 6110 }, { "epoch": 14.01, "grad_norm": 7.591290473937988, "learning_rate": 4.429559841012807e-05, "loss": 0.1485, "step": 6120 }, { "epoch": 14.01, "grad_norm": 11.021584510803223, "learning_rate": 4.427719711467688e-05, "loss": 0.149, "step": 6130 }, { "epoch": 14.01, "grad_norm": 0.6921817064285278, "learning_rate": 4.425879581922567e-05, "loss": 0.2096, "step": 6140 }, { "epoch": 14.01, "grad_norm": 5.8809661865234375, "learning_rate": 4.424039452377447e-05, "loss": 0.1371, "step": 6150 }, { "epoch": 14.01, "grad_norm": 2.2032387256622314, "learning_rate": 4.422199322832328e-05, "loss": 0.086, "step": 6160 }, { "epoch": 14.01, "grad_norm": 22.69715118408203, "learning_rate": 4.420359193287207e-05, "loss": 0.1365, "step": 6170 }, { "epoch": 14.01, "grad_norm": 20.44197654724121, "learning_rate": 4.418519063742087e-05, "loss": 0.0839, "step": 6180 }, { "epoch": 14.01, "grad_norm": 25.620283126831055, "learning_rate": 4.416678934196968e-05, "loss": 0.1959, "step": 6190 }, { "epoch": 14.01, "grad_norm": 42.260128021240234, "learning_rate": 4.414838804651848e-05, "loss": 0.2688, "step": 6200 }, { "epoch": 14.01, "grad_norm": 13.617227554321289, "learning_rate": 4.412998675106727e-05, "loss": 0.2601, "step": 6210 }, { "epoch": 14.01, "grad_norm": 9.865835189819336, "learning_rate": 4.411158545561608e-05, "loss": 0.3127, "step": 6220 }, { "epoch": 14.01, "grad_norm": 3.797058343887329, "learning_rate": 4.409318416016488e-05, "loss": 0.201, "step": 6230 }, { "epoch": 14.01, "grad_norm": 4.668950080871582, "learning_rate": 4.4074782864713673e-05, "loss": 0.1756, "step": 6240 }, { "epoch": 14.01, "grad_norm": 21.492088317871094, "learning_rate": 4.405638156926248e-05, "loss": 0.2038, "step": 6250 }, { "epoch": 14.01, "grad_norm": 11.907729148864746, "learning_rate": 4.403798027381128e-05, "loss": 0.235, "step": 6260 }, { "epoch": 14.01, "grad_norm": 13.219657897949219, "learning_rate": 4.4019578978360074e-05, "loss": 0.2846, "step": 6270 }, { "epoch": 14.01, "grad_norm": 3.140854597091675, "learning_rate": 4.400117768290888e-05, "loss": 0.4261, "step": 6280 }, { "epoch": 14.01, "grad_norm": 6.707206726074219, "learning_rate": 4.398277638745768e-05, "loss": 0.106, "step": 6290 }, { "epoch": 14.01, "grad_norm": 2.215766429901123, "learning_rate": 4.3964375092006474e-05, "loss": 0.171, "step": 6300 }, { "epoch": 14.01, "eval_accuracy": 0.7027027027027027, "eval_loss": 0.9336600303649902, "eval_runtime": 40.7056, "eval_samples_per_second": 21.815, "eval_steps_per_second": 1.818, "step": 6300 }, { "epoch": 15.0, "grad_norm": 1.4444656372070312, "learning_rate": 4.394597379655528e-05, "loss": 0.2148, "step": 6310 }, { "epoch": 15.0, "grad_norm": 14.322981834411621, "learning_rate": 4.392757250110408e-05, "loss": 0.1187, "step": 6320 }, { "epoch": 15.0, "grad_norm": 17.432512283325195, "learning_rate": 4.390917120565288e-05, "loss": 0.2358, "step": 6330 }, { "epoch": 15.0, "grad_norm": 10.29634952545166, "learning_rate": 4.389076991020168e-05, "loss": 0.1578, "step": 6340 }, { "epoch": 15.0, "grad_norm": 0.3197493255138397, "learning_rate": 4.387236861475048e-05, "loss": 0.1072, "step": 6350 }, { "epoch": 15.0, "grad_norm": 0.4721217751502991, "learning_rate": 4.385396731929928e-05, "loss": 0.3094, "step": 6360 }, { "epoch": 15.0, "grad_norm": 20.208770751953125, "learning_rate": 4.383556602384808e-05, "loss": 0.1157, "step": 6370 }, { "epoch": 15.0, "grad_norm": 15.79928207397461, "learning_rate": 4.381716472839688e-05, "loss": 0.251, "step": 6380 }, { "epoch": 15.0, "grad_norm": 0.06318643689155579, "learning_rate": 4.379876343294568e-05, "loss": 0.1641, "step": 6390 }, { "epoch": 15.0, "grad_norm": 18.822420120239258, "learning_rate": 4.378036213749448e-05, "loss": 0.1197, "step": 6400 }, { "epoch": 15.0, "grad_norm": 0.14875183999538422, "learning_rate": 4.376196084204328e-05, "loss": 0.1705, "step": 6410 }, { "epoch": 15.0, "grad_norm": 18.02042579650879, "learning_rate": 4.374355954659208e-05, "loss": 0.1298, "step": 6420 }, { "epoch": 15.0, "grad_norm": 14.635958671569824, "learning_rate": 4.372515825114088e-05, "loss": 0.1821, "step": 6430 }, { "epoch": 15.0, "grad_norm": 0.13969016075134277, "learning_rate": 4.370675695568968e-05, "loss": 0.1356, "step": 6440 }, { "epoch": 15.0, "grad_norm": 0.7694371342658997, "learning_rate": 4.368835566023848e-05, "loss": 0.1752, "step": 6450 }, { "epoch": 15.01, "grad_norm": 0.2148592621088028, "learning_rate": 4.366995436478728e-05, "loss": 0.117, "step": 6460 }, { "epoch": 15.01, "grad_norm": 11.844324111938477, "learning_rate": 4.365155306933608e-05, "loss": 0.2095, "step": 6470 }, { "epoch": 15.01, "grad_norm": 8.03518009185791, "learning_rate": 4.363315177388488e-05, "loss": 0.0854, "step": 6480 }, { "epoch": 15.01, "grad_norm": 1.3339135646820068, "learning_rate": 4.361475047843368e-05, "loss": 0.143, "step": 6490 }, { "epoch": 15.01, "grad_norm": 0.5493175387382507, "learning_rate": 4.359634918298248e-05, "loss": 0.1753, "step": 6500 }, { "epoch": 15.01, "grad_norm": 15.884575843811035, "learning_rate": 4.357794788753128e-05, "loss": 0.1704, "step": 6510 }, { "epoch": 15.01, "grad_norm": 19.157939910888672, "learning_rate": 4.3559546592080083e-05, "loss": 0.1545, "step": 6520 }, { "epoch": 15.01, "grad_norm": 22.01847267150879, "learning_rate": 4.3541145296628884e-05, "loss": 0.2814, "step": 6530 }, { "epoch": 15.01, "grad_norm": 3.600910186767578, "learning_rate": 4.3522744001177684e-05, "loss": 0.1142, "step": 6540 }, { "epoch": 15.01, "grad_norm": 0.21439813077449799, "learning_rate": 4.3504342705726484e-05, "loss": 0.2112, "step": 6550 }, { "epoch": 15.01, "grad_norm": 8.411886215209961, "learning_rate": 4.348594141027529e-05, "loss": 0.0611, "step": 6560 }, { "epoch": 15.01, "grad_norm": 21.956642150878906, "learning_rate": 4.3467540114824084e-05, "loss": 0.1059, "step": 6570 }, { "epoch": 15.01, "grad_norm": 0.07254786044359207, "learning_rate": 4.3449138819372884e-05, "loss": 0.0876, "step": 6580 }, { "epoch": 15.01, "grad_norm": 0.7494866847991943, "learning_rate": 4.343073752392169e-05, "loss": 0.1531, "step": 6590 }, { "epoch": 15.01, "grad_norm": 27.045339584350586, "learning_rate": 4.3412336228470484e-05, "loss": 0.2429, "step": 6600 }, { "epoch": 15.01, "grad_norm": 0.29273873567581177, "learning_rate": 4.3393934933019284e-05, "loss": 0.2683, "step": 6610 }, { "epoch": 15.01, "grad_norm": 0.04690911993384361, "learning_rate": 4.337553363756809e-05, "loss": 0.1066, "step": 6620 }, { "epoch": 15.01, "grad_norm": 0.5130358338356018, "learning_rate": 4.3357132342116885e-05, "loss": 0.111, "step": 6630 }, { "epoch": 15.01, "grad_norm": 0.7703613042831421, "learning_rate": 4.3338731046665685e-05, "loss": 0.097, "step": 6640 }, { "epoch": 15.01, "grad_norm": 25.8164005279541, "learning_rate": 4.332032975121449e-05, "loss": 0.2039, "step": 6650 }, { "epoch": 15.01, "grad_norm": 18.577009201049805, "learning_rate": 4.3301928455763285e-05, "loss": 0.2498, "step": 6660 }, { "epoch": 15.01, "grad_norm": 17.129587173461914, "learning_rate": 4.3283527160312085e-05, "loss": 0.207, "step": 6670 }, { "epoch": 15.01, "grad_norm": 22.360652923583984, "learning_rate": 4.326512586486089e-05, "loss": 0.1344, "step": 6680 }, { "epoch": 15.01, "grad_norm": 0.045382168143987656, "learning_rate": 4.324672456940969e-05, "loss": 0.1224, "step": 6690 }, { "epoch": 15.01, "grad_norm": 0.12095453590154648, "learning_rate": 4.3228323273958486e-05, "loss": 0.0751, "step": 6700 }, { "epoch": 15.01, "grad_norm": 0.04379770904779434, "learning_rate": 4.320992197850729e-05, "loss": 0.2142, "step": 6710 }, { "epoch": 15.01, "grad_norm": 0.022613519802689552, "learning_rate": 4.319152068305609e-05, "loss": 0.3076, "step": 6720 }, { "epoch": 15.01, "eval_accuracy": 0.7207207207207207, "eval_loss": 1.181077480316162, "eval_runtime": 41.2544, "eval_samples_per_second": 21.525, "eval_steps_per_second": 1.794, "step": 6720 }, { "epoch": 16.0, "grad_norm": 3.539224863052368, "learning_rate": 4.3173119387604886e-05, "loss": 0.0385, "step": 6730 }, { "epoch": 16.0, "grad_norm": 3.006033182144165, "learning_rate": 4.315471809215369e-05, "loss": 0.0355, "step": 6740 }, { "epoch": 16.0, "grad_norm": 0.09995649755001068, "learning_rate": 4.313631679670249e-05, "loss": 0.1871, "step": 6750 }, { "epoch": 16.0, "grad_norm": 6.6211347579956055, "learning_rate": 4.3117915501251286e-05, "loss": 0.0773, "step": 6760 }, { "epoch": 16.0, "grad_norm": 1.1593960523605347, "learning_rate": 4.309951420580009e-05, "loss": 0.0803, "step": 6770 }, { "epoch": 16.0, "grad_norm": 5.994491100311279, "learning_rate": 4.308111291034889e-05, "loss": 0.0459, "step": 6780 }, { "epoch": 16.0, "grad_norm": 11.333940505981445, "learning_rate": 4.3062711614897687e-05, "loss": 0.237, "step": 6790 }, { "epoch": 16.0, "grad_norm": 9.39922046661377, "learning_rate": 4.3044310319446493e-05, "loss": 0.3263, "step": 6800 }, { "epoch": 16.0, "grad_norm": 0.28990158438682556, "learning_rate": 4.3025909023995294e-05, "loss": 0.16, "step": 6810 }, { "epoch": 16.0, "grad_norm": 0.8307206034660339, "learning_rate": 4.3007507728544094e-05, "loss": 0.2574, "step": 6820 }, { "epoch": 16.0, "grad_norm": 0.03865053132176399, "learning_rate": 4.2989106433092894e-05, "loss": 0.0774, "step": 6830 }, { "epoch": 16.0, "grad_norm": 0.4578423500061035, "learning_rate": 4.2970705137641694e-05, "loss": 0.0989, "step": 6840 }, { "epoch": 16.0, "grad_norm": 0.41832372546195984, "learning_rate": 4.2952303842190494e-05, "loss": 0.0478, "step": 6850 }, { "epoch": 16.0, "grad_norm": 10.938180923461914, "learning_rate": 4.2933902546739294e-05, "loss": 0.1905, "step": 6860 }, { "epoch": 16.0, "grad_norm": 24.749025344848633, "learning_rate": 4.2915501251288094e-05, "loss": 0.1256, "step": 6870 }, { "epoch": 16.01, "grad_norm": 19.042219161987305, "learning_rate": 4.2897099955836894e-05, "loss": 0.1258, "step": 6880 }, { "epoch": 16.01, "grad_norm": 0.6163063645362854, "learning_rate": 4.287869866038569e-05, "loss": 0.07, "step": 6890 }, { "epoch": 16.01, "grad_norm": 17.089820861816406, "learning_rate": 4.2860297364934495e-05, "loss": 0.1107, "step": 6900 }, { "epoch": 16.01, "grad_norm": 0.10851157456636429, "learning_rate": 4.2841896069483295e-05, "loss": 0.3045, "step": 6910 }, { "epoch": 16.01, "grad_norm": 0.45974329113960266, "learning_rate": 4.282349477403209e-05, "loss": 0.1239, "step": 6920 }, { "epoch": 16.01, "grad_norm": 26.82357406616211, "learning_rate": 4.2805093478580895e-05, "loss": 0.1921, "step": 6930 }, { "epoch": 16.01, "grad_norm": 10.198452949523926, "learning_rate": 4.2786692183129695e-05, "loss": 0.1364, "step": 6940 }, { "epoch": 16.01, "grad_norm": 29.6612491607666, "learning_rate": 4.2768290887678495e-05, "loss": 0.167, "step": 6950 }, { "epoch": 16.01, "grad_norm": 7.377419948577881, "learning_rate": 4.2749889592227295e-05, "loss": 0.0909, "step": 6960 }, { "epoch": 16.01, "grad_norm": 2.213519334793091, "learning_rate": 4.2731488296776095e-05, "loss": 0.0531, "step": 6970 }, { "epoch": 16.01, "grad_norm": 0.029641279950737953, "learning_rate": 4.2713087001324896e-05, "loss": 0.1466, "step": 6980 }, { "epoch": 16.01, "grad_norm": 1.8791080713272095, "learning_rate": 4.2694685705873696e-05, "loss": 0.0484, "step": 6990 }, { "epoch": 16.01, "grad_norm": 0.746714174747467, "learning_rate": 4.2676284410422496e-05, "loss": 0.134, "step": 7000 }, { "epoch": 16.01, "grad_norm": 11.966108322143555, "learning_rate": 4.2657883114971296e-05, "loss": 0.1856, "step": 7010 }, { "epoch": 16.01, "grad_norm": 0.21873310208320618, "learning_rate": 4.2639481819520096e-05, "loss": 0.1875, "step": 7020 }, { "epoch": 16.01, "grad_norm": 8.706038475036621, "learning_rate": 4.2621080524068896e-05, "loss": 0.2378, "step": 7030 }, { "epoch": 16.01, "grad_norm": 0.31858113408088684, "learning_rate": 4.2602679228617696e-05, "loss": 0.1298, "step": 7040 }, { "epoch": 16.01, "grad_norm": 0.04492282494902611, "learning_rate": 4.2584277933166496e-05, "loss": 0.2082, "step": 7050 }, { "epoch": 16.01, "grad_norm": 0.04001040756702423, "learning_rate": 4.2565876637715296e-05, "loss": 0.2121, "step": 7060 }, { "epoch": 16.01, "grad_norm": 13.668713569641113, "learning_rate": 4.2547475342264097e-05, "loss": 0.1399, "step": 7070 }, { "epoch": 16.01, "grad_norm": 10.486247062683105, "learning_rate": 4.25290740468129e-05, "loss": 0.1461, "step": 7080 }, { "epoch": 16.01, "grad_norm": 11.53575611114502, "learning_rate": 4.25106727513617e-05, "loss": 0.2366, "step": 7090 }, { "epoch": 16.01, "grad_norm": 23.928285598754883, "learning_rate": 4.24922714559105e-05, "loss": 0.1941, "step": 7100 }, { "epoch": 16.01, "grad_norm": 3.083263874053955, "learning_rate": 4.24738701604593e-05, "loss": 0.3031, "step": 7110 }, { "epoch": 16.01, "grad_norm": 2.5412213802337646, "learning_rate": 4.24554688650081e-05, "loss": 0.1671, "step": 7120 }, { "epoch": 16.01, "grad_norm": 17.73686981201172, "learning_rate": 4.24370675695569e-05, "loss": 0.1783, "step": 7130 }, { "epoch": 16.01, "grad_norm": 0.07319161295890808, "learning_rate": 4.24186662741057e-05, "loss": 0.2927, "step": 7140 }, { "epoch": 16.01, "eval_accuracy": 0.7218468468468469, "eval_loss": 1.0953478813171387, "eval_runtime": 40.8306, "eval_samples_per_second": 21.748, "eval_steps_per_second": 1.812, "step": 7140 }, { "epoch": 17.0, "grad_norm": 0.11046060919761658, "learning_rate": 4.24002649786545e-05, "loss": 0.093, "step": 7150 }, { "epoch": 17.0, "grad_norm": 2.020051956176758, "learning_rate": 4.23818636832033e-05, "loss": 0.12, "step": 7160 }, { "epoch": 17.0, "grad_norm": 12.6734037399292, "learning_rate": 4.23634623877521e-05, "loss": 0.0306, "step": 7170 }, { "epoch": 17.0, "grad_norm": 3.399958610534668, "learning_rate": 4.23450610923009e-05, "loss": 0.1348, "step": 7180 }, { "epoch": 17.0, "grad_norm": 4.962296962738037, "learning_rate": 4.23266597968497e-05, "loss": 0.154, "step": 7190 }, { "epoch": 17.0, "grad_norm": 21.432941436767578, "learning_rate": 4.23082585013985e-05, "loss": 0.1807, "step": 7200 }, { "epoch": 17.0, "grad_norm": 2.730048179626465, "learning_rate": 4.2289857205947305e-05, "loss": 0.0698, "step": 7210 }, { "epoch": 17.0, "grad_norm": 5.443423271179199, "learning_rate": 4.22714559104961e-05, "loss": 0.1003, "step": 7220 }, { "epoch": 17.0, "grad_norm": 35.500003814697266, "learning_rate": 4.22530546150449e-05, "loss": 0.189, "step": 7230 }, { "epoch": 17.0, "grad_norm": 0.07223138213157654, "learning_rate": 4.2234653319593705e-05, "loss": 0.3029, "step": 7240 }, { "epoch": 17.0, "grad_norm": 0.2147696614265442, "learning_rate": 4.22162520241425e-05, "loss": 0.031, "step": 7250 }, { "epoch": 17.0, "grad_norm": 9.870826721191406, "learning_rate": 4.21978507286913e-05, "loss": 0.2191, "step": 7260 }, { "epoch": 17.0, "grad_norm": 16.39332389831543, "learning_rate": 4.2179449433240106e-05, "loss": 0.1711, "step": 7270 }, { "epoch": 17.0, "grad_norm": 2.216157913208008, "learning_rate": 4.21610481377889e-05, "loss": 0.0723, "step": 7280 }, { "epoch": 17.0, "grad_norm": 4.93229866027832, "learning_rate": 4.21426468423377e-05, "loss": 0.0542, "step": 7290 }, { "epoch": 17.01, "grad_norm": 5.814107894897461, "learning_rate": 4.2124245546886506e-05, "loss": 0.0757, "step": 7300 }, { "epoch": 17.01, "grad_norm": 29.41144561767578, "learning_rate": 4.2105844251435306e-05, "loss": 0.1158, "step": 7310 }, { "epoch": 17.01, "grad_norm": 40.187164306640625, "learning_rate": 4.20874429559841e-05, "loss": 0.2319, "step": 7320 }, { "epoch": 17.01, "grad_norm": 0.42633482813835144, "learning_rate": 4.2069041660532906e-05, "loss": 0.078, "step": 7330 }, { "epoch": 17.01, "grad_norm": 14.942743301391602, "learning_rate": 4.2050640365081706e-05, "loss": 0.0495, "step": 7340 }, { "epoch": 17.01, "grad_norm": 6.133449554443359, "learning_rate": 4.20322390696305e-05, "loss": 0.265, "step": 7350 }, { "epoch": 17.01, "grad_norm": 0.2768873870372772, "learning_rate": 4.201383777417931e-05, "loss": 0.1142, "step": 7360 }, { "epoch": 17.01, "grad_norm": 33.952964782714844, "learning_rate": 4.199543647872811e-05, "loss": 0.1683, "step": 7370 }, { "epoch": 17.01, "grad_norm": 11.31227970123291, "learning_rate": 4.19770351832769e-05, "loss": 0.024, "step": 7380 }, { "epoch": 17.01, "grad_norm": 7.327922821044922, "learning_rate": 4.195863388782571e-05, "loss": 0.045, "step": 7390 }, { "epoch": 17.01, "grad_norm": 26.489227294921875, "learning_rate": 4.194023259237451e-05, "loss": 0.2488, "step": 7400 }, { "epoch": 17.01, "grad_norm": 0.04616143926978111, "learning_rate": 4.19218312969233e-05, "loss": 0.2151, "step": 7410 }, { "epoch": 17.01, "grad_norm": 0.2145106941461563, "learning_rate": 4.190343000147211e-05, "loss": 0.0606, "step": 7420 }, { "epoch": 17.01, "grad_norm": 5.961447238922119, "learning_rate": 4.188502870602091e-05, "loss": 0.1122, "step": 7430 }, { "epoch": 17.01, "grad_norm": 10.214862823486328, "learning_rate": 4.186662741056971e-05, "loss": 0.2077, "step": 7440 }, { "epoch": 17.01, "grad_norm": 0.2347540557384491, "learning_rate": 4.184822611511851e-05, "loss": 0.1062, "step": 7450 }, { "epoch": 17.01, "grad_norm": 18.635656356811523, "learning_rate": 4.182982481966731e-05, "loss": 0.1868, "step": 7460 }, { "epoch": 17.01, "grad_norm": 22.011613845825195, "learning_rate": 4.181142352421611e-05, "loss": 0.1565, "step": 7470 }, { "epoch": 17.01, "grad_norm": 14.826783180236816, "learning_rate": 4.179302222876491e-05, "loss": 0.1408, "step": 7480 }, { "epoch": 17.01, "grad_norm": 11.025404930114746, "learning_rate": 4.177462093331371e-05, "loss": 0.2478, "step": 7490 }, { "epoch": 17.01, "grad_norm": 20.00482177734375, "learning_rate": 4.175621963786251e-05, "loss": 0.1929, "step": 7500 }, { "epoch": 17.01, "grad_norm": 0.18002435564994812, "learning_rate": 4.173781834241131e-05, "loss": 0.1091, "step": 7510 }, { "epoch": 17.01, "grad_norm": 17.47130584716797, "learning_rate": 4.171941704696011e-05, "loss": 0.2577, "step": 7520 }, { "epoch": 17.01, "grad_norm": 22.80291175842285, "learning_rate": 4.170101575150891e-05, "loss": 0.1132, "step": 7530 }, { "epoch": 17.01, "grad_norm": 0.3076806366443634, "learning_rate": 4.168261445605771e-05, "loss": 0.1184, "step": 7540 }, { "epoch": 17.01, "grad_norm": 48.81949234008789, "learning_rate": 4.166421316060651e-05, "loss": 0.2314, "step": 7550 }, { "epoch": 17.01, "grad_norm": 99.59304809570312, "learning_rate": 4.164581186515531e-05, "loss": 0.1679, "step": 7560 }, { "epoch": 17.01, "eval_accuracy": 0.7207207207207207, "eval_loss": 1.2947888374328613, "eval_runtime": 40.3013, "eval_samples_per_second": 22.034, "eval_steps_per_second": 1.836, "step": 7560 }, { "epoch": 18.0, "grad_norm": 35.903743743896484, "learning_rate": 4.162741056970411e-05, "loss": 0.1419, "step": 7570 }, { "epoch": 18.0, "grad_norm": 0.05569310858845711, "learning_rate": 4.160900927425291e-05, "loss": 0.0823, "step": 7580 }, { "epoch": 18.0, "grad_norm": 0.4801734685897827, "learning_rate": 4.159060797880171e-05, "loss": 0.2445, "step": 7590 }, { "epoch": 18.0, "grad_norm": 0.12532910704612732, "learning_rate": 4.157220668335051e-05, "loss": 0.103, "step": 7600 }, { "epoch": 18.0, "grad_norm": 0.5598722100257874, "learning_rate": 4.155380538789931e-05, "loss": 0.0963, "step": 7610 }, { "epoch": 18.0, "grad_norm": 0.05128243565559387, "learning_rate": 4.153540409244811e-05, "loss": 0.1159, "step": 7620 }, { "epoch": 18.0, "grad_norm": 0.0423787459731102, "learning_rate": 4.151700279699691e-05, "loss": 0.1454, "step": 7630 }, { "epoch": 18.0, "grad_norm": 29.96397590637207, "learning_rate": 4.149860150154571e-05, "loss": 0.1761, "step": 7640 }, { "epoch": 18.0, "grad_norm": 10.358722686767578, "learning_rate": 4.148020020609451e-05, "loss": 0.1922, "step": 7650 }, { "epoch": 18.0, "grad_norm": 4.160580158233643, "learning_rate": 4.146179891064331e-05, "loss": 0.0424, "step": 7660 }, { "epoch": 18.0, "grad_norm": 6.635777950286865, "learning_rate": 4.144339761519211e-05, "loss": 0.1561, "step": 7670 }, { "epoch": 18.0, "grad_norm": 10.608802795410156, "learning_rate": 4.142499631974091e-05, "loss": 0.0766, "step": 7680 }, { "epoch": 18.0, "grad_norm": 26.325162887573242, "learning_rate": 4.140659502428971e-05, "loss": 0.12, "step": 7690 }, { "epoch": 18.0, "grad_norm": 22.195999145507812, "learning_rate": 4.138819372883852e-05, "loss": 0.1439, "step": 7700 }, { "epoch": 18.0, "grad_norm": 15.824335098266602, "learning_rate": 4.136979243338731e-05, "loss": 0.1322, "step": 7710 }, { "epoch": 18.01, "grad_norm": 0.14473718404769897, "learning_rate": 4.135139113793611e-05, "loss": 0.0479, "step": 7720 }, { "epoch": 18.01, "grad_norm": 0.10849491506814957, "learning_rate": 4.133298984248492e-05, "loss": 0.0638, "step": 7730 }, { "epoch": 18.01, "grad_norm": 43.89323806762695, "learning_rate": 4.131458854703371e-05, "loss": 0.1261, "step": 7740 }, { "epoch": 18.01, "grad_norm": 35.16301345825195, "learning_rate": 4.129618725158251e-05, "loss": 0.0186, "step": 7750 }, { "epoch": 18.01, "grad_norm": 0.023130550980567932, "learning_rate": 4.127778595613132e-05, "loss": 0.1184, "step": 7760 }, { "epoch": 18.01, "grad_norm": 14.438142776489258, "learning_rate": 4.125938466068011e-05, "loss": 0.242, "step": 7770 }, { "epoch": 18.01, "grad_norm": 16.877521514892578, "learning_rate": 4.124098336522891e-05, "loss": 0.047, "step": 7780 }, { "epoch": 18.01, "grad_norm": 17.4776668548584, "learning_rate": 4.122258206977772e-05, "loss": 0.1285, "step": 7790 }, { "epoch": 18.01, "grad_norm": 0.09450684487819672, "learning_rate": 4.120418077432651e-05, "loss": 0.022, "step": 7800 }, { "epoch": 18.01, "grad_norm": 1.1784636974334717, "learning_rate": 4.118577947887531e-05, "loss": 0.1058, "step": 7810 }, { "epoch": 18.01, "grad_norm": 0.1115993857383728, "learning_rate": 4.116737818342412e-05, "loss": 0.1197, "step": 7820 }, { "epoch": 18.01, "grad_norm": 0.26653343439102173, "learning_rate": 4.114897688797292e-05, "loss": 0.1181, "step": 7830 }, { "epoch": 18.01, "grad_norm": 10.61689567565918, "learning_rate": 4.113057559252171e-05, "loss": 0.1226, "step": 7840 }, { "epoch": 18.01, "grad_norm": 1.0771710872650146, "learning_rate": 4.111217429707052e-05, "loss": 0.0459, "step": 7850 }, { "epoch": 18.01, "grad_norm": 2.459402561187744, "learning_rate": 4.109377300161932e-05, "loss": 0.0604, "step": 7860 }, { "epoch": 18.01, "grad_norm": 0.12830379605293274, "learning_rate": 4.107537170616811e-05, "loss": 0.3332, "step": 7870 }, { "epoch": 18.01, "grad_norm": 20.693988800048828, "learning_rate": 4.105697041071692e-05, "loss": 0.162, "step": 7880 }, { "epoch": 18.01, "grad_norm": 27.03666114807129, "learning_rate": 4.103856911526572e-05, "loss": 0.2202, "step": 7890 }, { "epoch": 18.01, "grad_norm": 0.8211888670921326, "learning_rate": 4.102016781981451e-05, "loss": 0.0964, "step": 7900 }, { "epoch": 18.01, "grad_norm": 40.739036560058594, "learning_rate": 4.100176652436331e-05, "loss": 0.1921, "step": 7910 }, { "epoch": 18.01, "grad_norm": 7.381683349609375, "learning_rate": 4.098336522891212e-05, "loss": 0.3249, "step": 7920 }, { "epoch": 18.01, "grad_norm": 6.641468524932861, "learning_rate": 4.096496393346092e-05, "loss": 0.0733, "step": 7930 }, { "epoch": 18.01, "grad_norm": 23.581151962280273, "learning_rate": 4.0946562638009713e-05, "loss": 0.1884, "step": 7940 }, { "epoch": 18.01, "grad_norm": 25.609771728515625, "learning_rate": 4.092816134255852e-05, "loss": 0.2263, "step": 7950 }, { "epoch": 18.01, "grad_norm": 20.769895553588867, "learning_rate": 4.090976004710732e-05, "loss": 0.1547, "step": 7960 }, { "epoch": 18.01, "grad_norm": 0.11588778346776962, "learning_rate": 4.0891358751656114e-05, "loss": 0.0908, "step": 7970 }, { "epoch": 18.01, "grad_norm": 0.04875311255455017, "learning_rate": 4.087295745620492e-05, "loss": 0.1523, "step": 7980 }, { "epoch": 18.01, "eval_accuracy": 0.7015765765765766, "eval_loss": 1.3631926774978638, "eval_runtime": 40.2124, "eval_samples_per_second": 22.083, "eval_steps_per_second": 1.84, "step": 7980 }, { "epoch": 19.0, "grad_norm": 21.000314712524414, "learning_rate": 4.085455616075372e-05, "loss": 0.1682, "step": 7990 }, { "epoch": 19.0, "grad_norm": 1.4022904634475708, "learning_rate": 4.0836154865302514e-05, "loss": 0.1087, "step": 8000 }, { "epoch": 19.0, "grad_norm": 0.13801883161067963, "learning_rate": 4.081775356985132e-05, "loss": 0.1159, "step": 8010 }, { "epoch": 19.0, "grad_norm": 16.723587036132812, "learning_rate": 4.079935227440012e-05, "loss": 0.1965, "step": 8020 }, { "epoch": 19.0, "grad_norm": 0.8381214737892151, "learning_rate": 4.0780950978948914e-05, "loss": 0.0861, "step": 8030 }, { "epoch": 19.0, "grad_norm": 0.10108273476362228, "learning_rate": 4.076254968349772e-05, "loss": 0.046, "step": 8040 }, { "epoch": 19.0, "grad_norm": 8.169659614562988, "learning_rate": 4.074414838804652e-05, "loss": 0.1479, "step": 8050 }, { "epoch": 19.0, "grad_norm": 0.968652069568634, "learning_rate": 4.072574709259532e-05, "loss": 0.0547, "step": 8060 }, { "epoch": 19.0, "grad_norm": 29.368057250976562, "learning_rate": 4.070734579714412e-05, "loss": 0.147, "step": 8070 }, { "epoch": 19.0, "grad_norm": 39.67902755737305, "learning_rate": 4.068894450169292e-05, "loss": 0.3396, "step": 8080 }, { "epoch": 19.0, "grad_norm": 0.5779815912246704, "learning_rate": 4.067054320624172e-05, "loss": 0.2295, "step": 8090 }, { "epoch": 19.0, "grad_norm": 0.8270196318626404, "learning_rate": 4.065214191079052e-05, "loss": 0.0877, "step": 8100 }, { "epoch": 19.0, "grad_norm": 12.409184455871582, "learning_rate": 4.063374061533932e-05, "loss": 0.1544, "step": 8110 }, { "epoch": 19.0, "grad_norm": 11.662062644958496, "learning_rate": 4.061533931988812e-05, "loss": 0.0856, "step": 8120 }, { "epoch": 19.0, "grad_norm": 11.891218185424805, "learning_rate": 4.059693802443692e-05, "loss": 0.0513, "step": 8130 }, { "epoch": 19.01, "grad_norm": 4.517670631408691, "learning_rate": 4.057853672898572e-05, "loss": 0.1357, "step": 8140 }, { "epoch": 19.01, "grad_norm": 6.784265995025635, "learning_rate": 4.056013543353452e-05, "loss": 0.0232, "step": 8150 }, { "epoch": 19.01, "grad_norm": 0.07984334230422974, "learning_rate": 4.054173413808332e-05, "loss": 0.0557, "step": 8160 }, { "epoch": 19.01, "grad_norm": 0.8104623556137085, "learning_rate": 4.052333284263212e-05, "loss": 0.1124, "step": 8170 }, { "epoch": 19.01, "grad_norm": 0.036781515926122665, "learning_rate": 4.050493154718092e-05, "loss": 0.026, "step": 8180 }, { "epoch": 19.01, "grad_norm": 26.195846557617188, "learning_rate": 4.048653025172972e-05, "loss": 0.0411, "step": 8190 }, { "epoch": 19.01, "grad_norm": 46.0142936706543, "learning_rate": 4.046812895627852e-05, "loss": 0.1791, "step": 8200 }, { "epoch": 19.01, "grad_norm": 0.6812270879745483, "learning_rate": 4.044972766082732e-05, "loss": 0.114, "step": 8210 }, { "epoch": 19.01, "grad_norm": 0.040096819400787354, "learning_rate": 4.0431326365376123e-05, "loss": 0.2042, "step": 8220 }, { "epoch": 19.01, "grad_norm": 0.17151233553886414, "learning_rate": 4.0412925069924924e-05, "loss": 0.0718, "step": 8230 }, { "epoch": 19.01, "grad_norm": 1.1165339946746826, "learning_rate": 4.0394523774473724e-05, "loss": 0.1602, "step": 8240 }, { "epoch": 19.01, "grad_norm": 0.42848771810531616, "learning_rate": 4.0376122479022524e-05, "loss": 0.0959, "step": 8250 }, { "epoch": 19.01, "grad_norm": 0.10783377289772034, "learning_rate": 4.0357721183571324e-05, "loss": 0.0306, "step": 8260 }, { "epoch": 19.01, "grad_norm": 65.03582763671875, "learning_rate": 4.0339319888120124e-05, "loss": 0.1638, "step": 8270 }, { "epoch": 19.01, "grad_norm": 0.21185167133808136, "learning_rate": 4.0320918592668924e-05, "loss": 0.0511, "step": 8280 }, { "epoch": 19.01, "grad_norm": 14.06902027130127, "learning_rate": 4.0302517297217724e-05, "loss": 0.0355, "step": 8290 }, { "epoch": 19.01, "grad_norm": 29.087316513061523, "learning_rate": 4.0284116001766524e-05, "loss": 0.2198, "step": 8300 }, { "epoch": 19.01, "grad_norm": 12.470526695251465, "learning_rate": 4.0265714706315324e-05, "loss": 0.068, "step": 8310 }, { "epoch": 19.01, "grad_norm": 0.05231478437781334, "learning_rate": 4.024731341086413e-05, "loss": 0.1933, "step": 8320 }, { "epoch": 19.01, "grad_norm": 0.028017813339829445, "learning_rate": 4.0228912115412925e-05, "loss": 0.1588, "step": 8330 }, { "epoch": 19.01, "grad_norm": 10.386894226074219, "learning_rate": 4.0210510819961725e-05, "loss": 0.0583, "step": 8340 }, { "epoch": 19.01, "grad_norm": 41.2181396484375, "learning_rate": 4.019210952451053e-05, "loss": 0.1404, "step": 8350 }, { "epoch": 19.01, "grad_norm": 11.828994750976562, "learning_rate": 4.0173708229059325e-05, "loss": 0.1395, "step": 8360 }, { "epoch": 19.01, "grad_norm": 56.479366302490234, "learning_rate": 4.0155306933608125e-05, "loss": 0.1967, "step": 8370 }, { "epoch": 19.01, "grad_norm": 0.19793842732906342, "learning_rate": 4.013690563815693e-05, "loss": 0.1509, "step": 8380 }, { "epoch": 19.01, "grad_norm": 0.04642114043235779, "learning_rate": 4.0118504342705725e-05, "loss": 0.171, "step": 8390 }, { "epoch": 19.01, "grad_norm": 0.028800005093216896, "learning_rate": 4.0100103047254525e-05, "loss": 0.1059, "step": 8400 }, { "epoch": 19.01, "eval_accuracy": 0.7184684684684685, "eval_loss": 1.2914650440216064, "eval_runtime": 39.1072, "eval_samples_per_second": 22.707, "eval_steps_per_second": 1.892, "step": 8400 }, { "epoch": 20.0, "grad_norm": 0.05289442837238312, "learning_rate": 4.008170175180333e-05, "loss": 0.0056, "step": 8410 }, { "epoch": 20.0, "grad_norm": 18.77003288269043, "learning_rate": 4.0063300456352126e-05, "loss": 0.0863, "step": 8420 }, { "epoch": 20.0, "grad_norm": 0.9093281030654907, "learning_rate": 4.0044899160900926e-05, "loss": 0.0118, "step": 8430 }, { "epoch": 20.0, "grad_norm": 0.07237890362739563, "learning_rate": 4.002649786544973e-05, "loss": 0.0576, "step": 8440 }, { "epoch": 20.0, "grad_norm": 9.122472763061523, "learning_rate": 4.000809656999853e-05, "loss": 0.0942, "step": 8450 }, { "epoch": 20.0, "grad_norm": 0.02487068995833397, "learning_rate": 3.9989695274547326e-05, "loss": 0.0443, "step": 8460 }, { "epoch": 20.0, "grad_norm": 0.02156475931406021, "learning_rate": 3.997129397909613e-05, "loss": 0.1681, "step": 8470 }, { "epoch": 20.0, "grad_norm": 0.025435185059905052, "learning_rate": 3.995289268364493e-05, "loss": 0.1307, "step": 8480 }, { "epoch": 20.0, "grad_norm": 0.6154837608337402, "learning_rate": 3.9934491388193727e-05, "loss": 0.1182, "step": 8490 }, { "epoch": 20.0, "grad_norm": 0.3116990327835083, "learning_rate": 3.9916090092742533e-05, "loss": 0.0448, "step": 8500 }, { "epoch": 20.0, "grad_norm": 11.546723365783691, "learning_rate": 3.9897688797291334e-05, "loss": 0.1507, "step": 8510 }, { "epoch": 20.0, "grad_norm": 9.730113983154297, "learning_rate": 3.987928750184013e-05, "loss": 0.1857, "step": 8520 }, { "epoch": 20.0, "grad_norm": 0.044611260294914246, "learning_rate": 3.9860886206388934e-05, "loss": 0.1907, "step": 8530 }, { "epoch": 20.0, "grad_norm": 0.4382248818874359, "learning_rate": 3.9842484910937734e-05, "loss": 0.0964, "step": 8540 }, { "epoch": 20.0, "grad_norm": 0.0616886205971241, "learning_rate": 3.9824083615486534e-05, "loss": 0.0944, "step": 8550 }, { "epoch": 20.01, "grad_norm": 28.374303817749023, "learning_rate": 3.9805682320035334e-05, "loss": 0.0524, "step": 8560 }, { "epoch": 20.01, "grad_norm": 0.11457622051239014, "learning_rate": 3.9787281024584134e-05, "loss": 0.1207, "step": 8570 }, { "epoch": 20.01, "grad_norm": 1.5453132390975952, "learning_rate": 3.9768879729132934e-05, "loss": 0.0961, "step": 8580 }, { "epoch": 20.01, "grad_norm": 0.6138853430747986, "learning_rate": 3.9750478433681734e-05, "loss": 0.0976, "step": 8590 }, { "epoch": 20.01, "grad_norm": 0.33729997277259827, "learning_rate": 3.9732077138230535e-05, "loss": 0.0299, "step": 8600 }, { "epoch": 20.01, "grad_norm": 0.14739558100700378, "learning_rate": 3.9713675842779335e-05, "loss": 0.0697, "step": 8610 }, { "epoch": 20.01, "grad_norm": 0.4316738247871399, "learning_rate": 3.9695274547328135e-05, "loss": 0.2268, "step": 8620 }, { "epoch": 20.01, "grad_norm": 0.3644164502620697, "learning_rate": 3.9676873251876935e-05, "loss": 0.0166, "step": 8630 }, { "epoch": 20.01, "grad_norm": 29.693737030029297, "learning_rate": 3.9658471956425735e-05, "loss": 0.2011, "step": 8640 }, { "epoch": 20.01, "grad_norm": 0.11193890124559402, "learning_rate": 3.9640070660974535e-05, "loss": 0.163, "step": 8650 }, { "epoch": 20.01, "grad_norm": 0.500665545463562, "learning_rate": 3.9621669365523335e-05, "loss": 0.1297, "step": 8660 }, { "epoch": 20.01, "grad_norm": 1.7424372434616089, "learning_rate": 3.9603268070072135e-05, "loss": 0.3028, "step": 8670 }, { "epoch": 20.01, "grad_norm": 0.35591185092926025, "learning_rate": 3.9584866774620936e-05, "loss": 0.1396, "step": 8680 }, { "epoch": 20.01, "grad_norm": 0.6528427600860596, "learning_rate": 3.9566465479169736e-05, "loss": 0.2106, "step": 8690 }, { "epoch": 20.01, "grad_norm": 8.507621765136719, "learning_rate": 3.9548064183718536e-05, "loss": 0.0937, "step": 8700 }, { "epoch": 20.01, "grad_norm": 0.22332046926021576, "learning_rate": 3.9529662888267336e-05, "loss": 0.098, "step": 8710 }, { "epoch": 20.01, "grad_norm": 0.1772357076406479, "learning_rate": 3.9511261592816136e-05, "loss": 0.1744, "step": 8720 }, { "epoch": 20.01, "grad_norm": 18.10626792907715, "learning_rate": 3.9492860297364936e-05, "loss": 0.0796, "step": 8730 }, { "epoch": 20.01, "grad_norm": 0.09350816160440445, "learning_rate": 3.9474459001913736e-05, "loss": 0.0159, "step": 8740 }, { "epoch": 20.01, "grad_norm": 0.45954638719558716, "learning_rate": 3.9456057706462536e-05, "loss": 0.081, "step": 8750 }, { "epoch": 20.01, "grad_norm": 0.6258265972137451, "learning_rate": 3.9437656411011336e-05, "loss": 0.2643, "step": 8760 }, { "epoch": 20.01, "grad_norm": 48.40532684326172, "learning_rate": 3.9419255115560137e-05, "loss": 0.1233, "step": 8770 }, { "epoch": 20.01, "grad_norm": 1.5936936140060425, "learning_rate": 3.940085382010894e-05, "loss": 0.0819, "step": 8780 }, { "epoch": 20.01, "grad_norm": 0.1590225249528885, "learning_rate": 3.938245252465774e-05, "loss": 0.1419, "step": 8790 }, { "epoch": 20.01, "grad_norm": 0.9333840012550354, "learning_rate": 3.936405122920654e-05, "loss": 0.0307, "step": 8800 }, { "epoch": 20.01, "grad_norm": 3.0169289112091064, "learning_rate": 3.9345649933755344e-05, "loss": 0.3203, "step": 8810 }, { "epoch": 20.01, "grad_norm": 0.12335663288831711, "learning_rate": 3.932724863830414e-05, "loss": 0.1741, "step": 8820 }, { "epoch": 20.01, "eval_accuracy": 0.7432432432432432, "eval_loss": 1.231528401374817, "eval_runtime": 39.1219, "eval_samples_per_second": 22.698, "eval_steps_per_second": 1.892, "step": 8820 }, { "epoch": 21.0, "grad_norm": 0.8017550110816956, "learning_rate": 3.930884734285294e-05, "loss": 0.1648, "step": 8830 }, { "epoch": 21.0, "grad_norm": 0.028409045189619064, "learning_rate": 3.9290446047401744e-05, "loss": 0.1764, "step": 8840 }, { "epoch": 21.0, "grad_norm": 0.8386111259460449, "learning_rate": 3.927204475195054e-05, "loss": 0.3142, "step": 8850 }, { "epoch": 21.0, "grad_norm": 5.57110595703125, "learning_rate": 3.925364345649934e-05, "loss": 0.2943, "step": 8860 }, { "epoch": 21.0, "grad_norm": 18.612104415893555, "learning_rate": 3.9235242161048144e-05, "loss": 0.0672, "step": 8870 }, { "epoch": 21.0, "grad_norm": 1.4769333600997925, "learning_rate": 3.921684086559694e-05, "loss": 0.0753, "step": 8880 }, { "epoch": 21.0, "grad_norm": 41.26362991333008, "learning_rate": 3.919843957014574e-05, "loss": 0.1466, "step": 8890 }, { "epoch": 21.0, "grad_norm": 0.1891375631093979, "learning_rate": 3.9180038274694545e-05, "loss": 0.0904, "step": 8900 }, { "epoch": 21.0, "grad_norm": 0.03610742464661598, "learning_rate": 3.916163697924334e-05, "loss": 0.0738, "step": 8910 }, { "epoch": 21.0, "grad_norm": 36.820213317871094, "learning_rate": 3.914323568379214e-05, "loss": 0.1528, "step": 8920 }, { "epoch": 21.0, "grad_norm": 42.04949951171875, "learning_rate": 3.9124834388340945e-05, "loss": 0.1982, "step": 8930 }, { "epoch": 21.0, "grad_norm": 7.4204864501953125, "learning_rate": 3.9106433092889745e-05, "loss": 0.0767, "step": 8940 }, { "epoch": 21.0, "grad_norm": 3.0327413082122803, "learning_rate": 3.908803179743854e-05, "loss": 0.11, "step": 8950 }, { "epoch": 21.0, "grad_norm": 1.1203861236572266, "learning_rate": 3.906963050198734e-05, "loss": 0.0456, "step": 8960 }, { "epoch": 21.0, "grad_norm": 4.915249347686768, "learning_rate": 3.9051229206536146e-05, "loss": 0.2851, "step": 8970 }, { "epoch": 21.01, "grad_norm": 0.033022407442331314, "learning_rate": 3.903282791108494e-05, "loss": 0.1003, "step": 8980 }, { "epoch": 21.01, "grad_norm": 34.32133102416992, "learning_rate": 3.901442661563374e-05, "loss": 0.0292, "step": 8990 }, { "epoch": 21.01, "grad_norm": 2.097433090209961, "learning_rate": 3.8996025320182546e-05, "loss": 0.0478, "step": 9000 }, { "epoch": 21.01, "grad_norm": 0.08751551806926727, "learning_rate": 3.897762402473134e-05, "loss": 0.0678, "step": 9010 }, { "epoch": 21.01, "grad_norm": 11.18073558807373, "learning_rate": 3.895922272928014e-05, "loss": 0.0649, "step": 9020 }, { "epoch": 21.01, "grad_norm": 35.868499755859375, "learning_rate": 3.8940821433828946e-05, "loss": 0.0511, "step": 9030 }, { "epoch": 21.01, "grad_norm": 9.828630447387695, "learning_rate": 3.892242013837774e-05, "loss": 0.2034, "step": 9040 }, { "epoch": 21.01, "grad_norm": 12.086048126220703, "learning_rate": 3.890401884292654e-05, "loss": 0.0642, "step": 9050 }, { "epoch": 21.01, "grad_norm": 8.521302223205566, "learning_rate": 3.888561754747535e-05, "loss": 0.2202, "step": 9060 }, { "epoch": 21.01, "grad_norm": 13.418307304382324, "learning_rate": 3.886721625202415e-05, "loss": 0.1072, "step": 9070 }, { "epoch": 21.01, "grad_norm": 0.07701459527015686, "learning_rate": 3.884881495657294e-05, "loss": 0.0174, "step": 9080 }, { "epoch": 21.01, "grad_norm": 6.8916192054748535, "learning_rate": 3.883041366112175e-05, "loss": 0.2348, "step": 9090 }, { "epoch": 21.01, "grad_norm": 0.03478574380278587, "learning_rate": 3.881201236567055e-05, "loss": 0.0187, "step": 9100 }, { "epoch": 21.01, "grad_norm": 19.356990814208984, "learning_rate": 3.879361107021934e-05, "loss": 0.131, "step": 9110 }, { "epoch": 21.01, "grad_norm": 0.17833730578422546, "learning_rate": 3.877520977476815e-05, "loss": 0.0998, "step": 9120 }, { "epoch": 21.01, "grad_norm": 0.059898946434259415, "learning_rate": 3.875680847931695e-05, "loss": 0.0682, "step": 9130 }, { "epoch": 21.01, "grad_norm": 15.192434310913086, "learning_rate": 3.873840718386574e-05, "loss": 0.0871, "step": 9140 }, { "epoch": 21.01, "grad_norm": 0.03356494382023811, "learning_rate": 3.872000588841455e-05, "loss": 0.082, "step": 9150 }, { "epoch": 21.01, "grad_norm": 10.07889175415039, "learning_rate": 3.870160459296335e-05, "loss": 0.1058, "step": 9160 }, { "epoch": 21.01, "grad_norm": 0.051915716379880905, "learning_rate": 3.868320329751215e-05, "loss": 0.0869, "step": 9170 }, { "epoch": 21.01, "grad_norm": 21.866188049316406, "learning_rate": 3.866480200206095e-05, "loss": 0.1787, "step": 9180 }, { "epoch": 21.01, "grad_norm": 0.09002427756786346, "learning_rate": 3.864640070660975e-05, "loss": 0.0945, "step": 9190 }, { "epoch": 21.01, "grad_norm": 36.970481872558594, "learning_rate": 3.862799941115855e-05, "loss": 0.1424, "step": 9200 }, { "epoch": 21.01, "grad_norm": 0.6947605013847351, "learning_rate": 3.860959811570735e-05, "loss": 0.1761, "step": 9210 }, { "epoch": 21.01, "grad_norm": 0.016832459717988968, "learning_rate": 3.859119682025615e-05, "loss": 0.0497, "step": 9220 }, { "epoch": 21.01, "grad_norm": 3.5753579139709473, "learning_rate": 3.857279552480495e-05, "loss": 0.1527, "step": 9230 }, { "epoch": 21.01, "grad_norm": 0.09591725468635559, "learning_rate": 3.855439422935375e-05, "loss": 0.0629, "step": 9240 }, { "epoch": 21.01, "eval_accuracy": 0.722972972972973, "eval_loss": 1.394756555557251, "eval_runtime": 38.8213, "eval_samples_per_second": 22.874, "eval_steps_per_second": 1.906, "step": 9240 }, { "epoch": 22.0, "grad_norm": 0.05974861979484558, "learning_rate": 3.853599293390255e-05, "loss": 0.0187, "step": 9250 }, { "epoch": 22.0, "grad_norm": 0.03310147300362587, "learning_rate": 3.851759163845135e-05, "loss": 0.0019, "step": 9260 }, { "epoch": 22.0, "grad_norm": 28.746809005737305, "learning_rate": 3.849919034300015e-05, "loss": 0.115, "step": 9270 }, { "epoch": 22.0, "grad_norm": 1.6740992069244385, "learning_rate": 3.848078904754895e-05, "loss": 0.2023, "step": 9280 }, { "epoch": 22.0, "grad_norm": 40.7819938659668, "learning_rate": 3.846238775209775e-05, "loss": 0.1216, "step": 9290 }, { "epoch": 22.0, "grad_norm": 0.050575271248817444, "learning_rate": 3.844398645664655e-05, "loss": 0.074, "step": 9300 }, { "epoch": 22.0, "grad_norm": 87.55679321289062, "learning_rate": 3.842558516119535e-05, "loss": 0.2161, "step": 9310 }, { "epoch": 22.0, "grad_norm": 0.12832655012607574, "learning_rate": 3.840718386574415e-05, "loss": 0.0803, "step": 9320 }, { "epoch": 22.0, "grad_norm": 42.25579833984375, "learning_rate": 3.838878257029295e-05, "loss": 0.1586, "step": 9330 }, { "epoch": 22.0, "grad_norm": 28.92885971069336, "learning_rate": 3.837038127484175e-05, "loss": 0.0364, "step": 9340 }, { "epoch": 22.0, "grad_norm": 1.6303467750549316, "learning_rate": 3.835197997939055e-05, "loss": 0.0356, "step": 9350 }, { "epoch": 22.0, "grad_norm": 9.19245719909668, "learning_rate": 3.833357868393935e-05, "loss": 0.107, "step": 9360 }, { "epoch": 22.0, "grad_norm": 2.077812671661377, "learning_rate": 3.831517738848815e-05, "loss": 0.0545, "step": 9370 }, { "epoch": 22.0, "grad_norm": 0.058572422713041306, "learning_rate": 3.829677609303695e-05, "loss": 0.0071, "step": 9380 }, { "epoch": 22.0, "grad_norm": 50.71233367919922, "learning_rate": 3.827837479758575e-05, "loss": 0.0593, "step": 9390 }, { "epoch": 22.01, "grad_norm": 31.309873580932617, "learning_rate": 3.825997350213455e-05, "loss": 0.1554, "step": 9400 }, { "epoch": 22.01, "grad_norm": 9.297453880310059, "learning_rate": 3.824157220668335e-05, "loss": 0.0624, "step": 9410 }, { "epoch": 22.01, "grad_norm": 0.3019231855869293, "learning_rate": 3.822317091123215e-05, "loss": 0.0708, "step": 9420 }, { "epoch": 22.01, "grad_norm": 16.121776580810547, "learning_rate": 3.820476961578096e-05, "loss": 0.0671, "step": 9430 }, { "epoch": 22.01, "grad_norm": 8.922002792358398, "learning_rate": 3.818636832032975e-05, "loss": 0.1623, "step": 9440 }, { "epoch": 22.01, "grad_norm": 5.869600296020508, "learning_rate": 3.816796702487855e-05, "loss": 0.1411, "step": 9450 }, { "epoch": 22.01, "grad_norm": 0.02841232158243656, "learning_rate": 3.814956572942736e-05, "loss": 0.1246, "step": 9460 }, { "epoch": 22.01, "grad_norm": 20.73468589782715, "learning_rate": 3.813116443397615e-05, "loss": 0.1599, "step": 9470 }, { "epoch": 22.01, "grad_norm": 33.60063171386719, "learning_rate": 3.811276313852495e-05, "loss": 0.1519, "step": 9480 }, { "epoch": 22.01, "grad_norm": 39.62193298339844, "learning_rate": 3.809436184307376e-05, "loss": 0.0782, "step": 9490 }, { "epoch": 22.01, "grad_norm": 13.764589309692383, "learning_rate": 3.807596054762255e-05, "loss": 0.2313, "step": 9500 }, { "epoch": 22.01, "grad_norm": 0.05422932282090187, "learning_rate": 3.805755925217135e-05, "loss": 0.0522, "step": 9510 }, { "epoch": 22.01, "grad_norm": 0.43672868609428406, "learning_rate": 3.803915795672016e-05, "loss": 0.0095, "step": 9520 }, { "epoch": 22.01, "grad_norm": 5.034006595611572, "learning_rate": 3.802075666126895e-05, "loss": 0.2069, "step": 9530 }, { "epoch": 22.01, "grad_norm": 0.8128895163536072, "learning_rate": 3.800235536581775e-05, "loss": 0.0059, "step": 9540 }, { "epoch": 22.01, "grad_norm": 0.04529860243201256, "learning_rate": 3.798395407036656e-05, "loss": 0.0623, "step": 9550 }, { "epoch": 22.01, "grad_norm": 0.05076577514410019, "learning_rate": 3.796555277491536e-05, "loss": 0.0635, "step": 9560 }, { "epoch": 22.01, "grad_norm": 0.09839289635419846, "learning_rate": 3.794715147946415e-05, "loss": 0.1778, "step": 9570 }, { "epoch": 22.01, "grad_norm": 3.5754637718200684, "learning_rate": 3.792875018401296e-05, "loss": 0.1769, "step": 9580 }, { "epoch": 22.01, "grad_norm": 22.517118453979492, "learning_rate": 3.791034888856176e-05, "loss": 0.1084, "step": 9590 }, { "epoch": 22.01, "grad_norm": 28.474924087524414, "learning_rate": 3.789194759311055e-05, "loss": 0.1328, "step": 9600 }, { "epoch": 22.01, "grad_norm": 0.3125380277633667, "learning_rate": 3.787354629765936e-05, "loss": 0.1563, "step": 9610 }, { "epoch": 22.01, "grad_norm": 5.8578104972839355, "learning_rate": 3.785514500220816e-05, "loss": 0.1266, "step": 9620 }, { "epoch": 22.01, "grad_norm": 0.23546111583709717, "learning_rate": 3.783674370675695e-05, "loss": 0.1167, "step": 9630 }, { "epoch": 22.01, "grad_norm": 0.32572802901268005, "learning_rate": 3.781834241130576e-05, "loss": 0.1068, "step": 9640 }, { "epoch": 22.01, "grad_norm": 0.057987648993730545, "learning_rate": 3.779994111585456e-05, "loss": 0.0996, "step": 9650 }, { "epoch": 22.01, "grad_norm": 0.020183347165584564, "learning_rate": 3.7781539820403354e-05, "loss": 0.0075, "step": 9660 }, { "epoch": 22.01, "eval_accuracy": 0.7376126126126126, "eval_loss": 1.1434566974639893, "eval_runtime": 38.9969, "eval_samples_per_second": 22.771, "eval_steps_per_second": 1.898, "step": 9660 }, { "epoch": 23.0, "grad_norm": 1.9055886268615723, "learning_rate": 3.776313852495216e-05, "loss": 0.0124, "step": 9670 }, { "epoch": 23.0, "grad_norm": 3.3849780559539795, "learning_rate": 3.774473722950096e-05, "loss": 0.1073, "step": 9680 }, { "epoch": 23.0, "grad_norm": 24.8551025390625, "learning_rate": 3.772633593404976e-05, "loss": 0.0921, "step": 9690 }, { "epoch": 23.0, "grad_norm": 0.3700391948223114, "learning_rate": 3.770793463859856e-05, "loss": 0.1239, "step": 9700 }, { "epoch": 23.0, "grad_norm": 0.06266848742961884, "learning_rate": 3.768953334314736e-05, "loss": 0.0568, "step": 9710 }, { "epoch": 23.0, "grad_norm": 3.016946315765381, "learning_rate": 3.767113204769616e-05, "loss": 0.0159, "step": 9720 }, { "epoch": 23.0, "grad_norm": 0.05134722962975502, "learning_rate": 3.765273075224496e-05, "loss": 0.0665, "step": 9730 }, { "epoch": 23.0, "grad_norm": 0.021403346210718155, "learning_rate": 3.763432945679376e-05, "loss": 0.0372, "step": 9740 }, { "epoch": 23.0, "grad_norm": 0.06313654035329819, "learning_rate": 3.761592816134256e-05, "loss": 0.0333, "step": 9750 }, { "epoch": 23.0, "grad_norm": 0.017108239233493805, "learning_rate": 3.759752686589136e-05, "loss": 0.0688, "step": 9760 }, { "epoch": 23.0, "grad_norm": 36.144142150878906, "learning_rate": 3.757912557044016e-05, "loss": 0.1642, "step": 9770 }, { "epoch": 23.0, "grad_norm": 0.028163446113467216, "learning_rate": 3.756072427498896e-05, "loss": 0.0751, "step": 9780 }, { "epoch": 23.0, "grad_norm": 0.11214728653430939, "learning_rate": 3.754232297953776e-05, "loss": 0.0801, "step": 9790 }, { "epoch": 23.0, "grad_norm": 0.01722540520131588, "learning_rate": 3.752392168408656e-05, "loss": 0.0971, "step": 9800 }, { "epoch": 23.0, "grad_norm": 8.70751667022705, "learning_rate": 3.750552038863536e-05, "loss": 0.2075, "step": 9810 }, { "epoch": 23.01, "grad_norm": 0.03713701665401459, "learning_rate": 3.748711909318416e-05, "loss": 0.02, "step": 9820 }, { "epoch": 23.01, "grad_norm": 0.012665356509387493, "learning_rate": 3.746871779773296e-05, "loss": 0.0393, "step": 9830 }, { "epoch": 23.01, "grad_norm": 0.6546823382377625, "learning_rate": 3.745031650228176e-05, "loss": 0.1762, "step": 9840 }, { "epoch": 23.01, "grad_norm": 0.022101113572716713, "learning_rate": 3.743191520683056e-05, "loss": 0.0973, "step": 9850 }, { "epoch": 23.01, "grad_norm": 80.03093719482422, "learning_rate": 3.741351391137936e-05, "loss": 0.1646, "step": 9860 }, { "epoch": 23.01, "grad_norm": 13.314861297607422, "learning_rate": 3.739511261592816e-05, "loss": 0.1341, "step": 9870 }, { "epoch": 23.01, "grad_norm": 0.23819276690483093, "learning_rate": 3.737671132047696e-05, "loss": 0.1729, "step": 9880 }, { "epoch": 23.01, "grad_norm": 0.4850609600543976, "learning_rate": 3.735831002502576e-05, "loss": 0.1631, "step": 9890 }, { "epoch": 23.01, "grad_norm": 0.18072141706943512, "learning_rate": 3.733990872957456e-05, "loss": 0.0321, "step": 9900 }, { "epoch": 23.01, "grad_norm": 0.23707066476345062, "learning_rate": 3.732150743412336e-05, "loss": 0.0487, "step": 9910 }, { "epoch": 23.01, "grad_norm": 0.026308251544833183, "learning_rate": 3.730310613867217e-05, "loss": 0.0343, "step": 9920 }, { "epoch": 23.01, "grad_norm": 0.01976301707327366, "learning_rate": 3.7284704843220963e-05, "loss": 0.0336, "step": 9930 }, { "epoch": 23.01, "grad_norm": 0.024655556306242943, "learning_rate": 3.7266303547769764e-05, "loss": 0.1337, "step": 9940 }, { "epoch": 23.01, "grad_norm": 0.06664633005857468, "learning_rate": 3.724790225231857e-05, "loss": 0.1152, "step": 9950 }, { "epoch": 23.01, "grad_norm": 6.069345474243164, "learning_rate": 3.7229500956867364e-05, "loss": 0.0569, "step": 9960 }, { "epoch": 23.01, "grad_norm": 0.15154345333576202, "learning_rate": 3.7211099661416164e-05, "loss": 0.114, "step": 9970 }, { "epoch": 23.01, "grad_norm": 6.500603675842285, "learning_rate": 3.719269836596497e-05, "loss": 0.1888, "step": 9980 }, { "epoch": 23.01, "grad_norm": 0.02962004393339157, "learning_rate": 3.7174297070513764e-05, "loss": 0.1188, "step": 9990 }, { "epoch": 23.01, "grad_norm": 30.147991180419922, "learning_rate": 3.7155895775062564e-05, "loss": 0.126, "step": 10000 }, { "epoch": 23.01, "grad_norm": 34.535369873046875, "learning_rate": 3.7137494479611364e-05, "loss": 0.1595, "step": 10010 }, { "epoch": 23.01, "grad_norm": 0.49603599309921265, "learning_rate": 3.7119093184160165e-05, "loss": 0.0493, "step": 10020 }, { "epoch": 23.01, "grad_norm": 1.476478934288025, "learning_rate": 3.7100691888708965e-05, "loss": 0.2013, "step": 10030 }, { "epoch": 23.01, "grad_norm": 1.7116336822509766, "learning_rate": 3.7082290593257765e-05, "loss": 0.1436, "step": 10040 }, { "epoch": 23.01, "grad_norm": 0.0445764921605587, "learning_rate": 3.706388929780657e-05, "loss": 0.0622, "step": 10050 }, { "epoch": 23.01, "grad_norm": 0.06141388788819313, "learning_rate": 3.7045488002355365e-05, "loss": 0.0872, "step": 10060 }, { "epoch": 23.01, "grad_norm": 0.41972872614860535, "learning_rate": 3.7027086706904165e-05, "loss": 0.1058, "step": 10070 }, { "epoch": 23.01, "grad_norm": 96.0462646484375, "learning_rate": 3.700868541145297e-05, "loss": 0.1692, "step": 10080 }, { "epoch": 23.01, "eval_accuracy": 0.7128378378378378, "eval_loss": 1.39983069896698, "eval_runtime": 38.9582, "eval_samples_per_second": 22.794, "eval_steps_per_second": 1.899, "step": 10080 }, { "epoch": 24.0, "grad_norm": 0.01138946134597063, "learning_rate": 3.6990284116001765e-05, "loss": 0.1309, "step": 10090 }, { "epoch": 24.0, "grad_norm": 3.2111175060272217, "learning_rate": 3.6971882820550565e-05, "loss": 0.0148, "step": 10100 }, { "epoch": 24.0, "grad_norm": 0.03468929976224899, "learning_rate": 3.695348152509937e-05, "loss": 0.0766, "step": 10110 }, { "epoch": 24.0, "grad_norm": 0.09320088475942612, "learning_rate": 3.6935080229648166e-05, "loss": 0.0566, "step": 10120 }, { "epoch": 24.0, "grad_norm": 0.05267590656876564, "learning_rate": 3.6916678934196966e-05, "loss": 0.0825, "step": 10130 }, { "epoch": 24.0, "grad_norm": 0.060753244906663895, "learning_rate": 3.689827763874577e-05, "loss": 0.0806, "step": 10140 }, { "epoch": 24.0, "grad_norm": 0.0423295758664608, "learning_rate": 3.6879876343294566e-05, "loss": 0.0056, "step": 10150 }, { "epoch": 24.0, "grad_norm": 0.01127055287361145, "learning_rate": 3.6861475047843366e-05, "loss": 0.1037, "step": 10160 }, { "epoch": 24.0, "grad_norm": 0.08633752912282944, "learning_rate": 3.684307375239217e-05, "loss": 0.1213, "step": 10170 }, { "epoch": 24.0, "grad_norm": 30.071001052856445, "learning_rate": 3.682467245694097e-05, "loss": 0.1545, "step": 10180 }, { "epoch": 24.0, "grad_norm": 0.028279367834329605, "learning_rate": 3.6806271161489766e-05, "loss": 0.0697, "step": 10190 }, { "epoch": 24.0, "grad_norm": 0.04578516632318497, "learning_rate": 3.678786986603857e-05, "loss": 0.1627, "step": 10200 }, { "epoch": 24.0, "grad_norm": 0.11059720069169998, "learning_rate": 3.6769468570587373e-05, "loss": 0.0776, "step": 10210 }, { "epoch": 24.0, "grad_norm": 53.34998321533203, "learning_rate": 3.675106727513617e-05, "loss": 0.0741, "step": 10220 }, { "epoch": 24.0, "grad_norm": 0.013538829982280731, "learning_rate": 3.6732665979684974e-05, "loss": 0.1207, "step": 10230 }, { "epoch": 24.01, "grad_norm": 0.0954127162694931, "learning_rate": 3.6714264684233774e-05, "loss": 0.1385, "step": 10240 }, { "epoch": 24.01, "grad_norm": 31.611495971679688, "learning_rate": 3.669586338878257e-05, "loss": 0.1752, "step": 10250 }, { "epoch": 24.01, "grad_norm": 0.2513565719127655, "learning_rate": 3.6677462093331374e-05, "loss": 0.0656, "step": 10260 }, { "epoch": 24.01, "grad_norm": 0.0673178881406784, "learning_rate": 3.6659060797880174e-05, "loss": 0.0656, "step": 10270 }, { "epoch": 24.01, "grad_norm": 0.04620659723877907, "learning_rate": 3.664065950242897e-05, "loss": 0.102, "step": 10280 }, { "epoch": 24.01, "grad_norm": 11.861926078796387, "learning_rate": 3.6622258206977774e-05, "loss": 0.0844, "step": 10290 }, { "epoch": 24.01, "grad_norm": 8.202778816223145, "learning_rate": 3.6603856911526575e-05, "loss": 0.1471, "step": 10300 }, { "epoch": 24.01, "grad_norm": 2.2646937370300293, "learning_rate": 3.6585455616075375e-05, "loss": 0.0442, "step": 10310 }, { "epoch": 24.01, "grad_norm": 0.02684628963470459, "learning_rate": 3.6567054320624175e-05, "loss": 0.0659, "step": 10320 }, { "epoch": 24.01, "grad_norm": 0.07974495738744736, "learning_rate": 3.6548653025172975e-05, "loss": 0.0989, "step": 10330 }, { "epoch": 24.01, "grad_norm": 12.994035720825195, "learning_rate": 3.6530251729721775e-05, "loss": 0.0064, "step": 10340 }, { "epoch": 24.01, "grad_norm": 0.007294784765690565, "learning_rate": 3.6511850434270575e-05, "loss": 0.0494, "step": 10350 }, { "epoch": 24.01, "grad_norm": 0.032260965555906296, "learning_rate": 3.6493449138819375e-05, "loss": 0.0361, "step": 10360 }, { "epoch": 24.01, "grad_norm": 51.3294677734375, "learning_rate": 3.6475047843368175e-05, "loss": 0.2203, "step": 10370 }, { "epoch": 24.01, "grad_norm": 0.16499635577201843, "learning_rate": 3.6456646547916975e-05, "loss": 0.0403, "step": 10380 }, { "epoch": 24.01, "grad_norm": 0.5237520337104797, "learning_rate": 3.6438245252465776e-05, "loss": 0.1665, "step": 10390 }, { "epoch": 24.01, "grad_norm": 30.010053634643555, "learning_rate": 3.6419843957014576e-05, "loss": 0.0648, "step": 10400 }, { "epoch": 24.01, "grad_norm": 0.030312929302453995, "learning_rate": 3.6401442661563376e-05, "loss": 0.127, "step": 10410 }, { "epoch": 24.01, "grad_norm": 0.0383584164083004, "learning_rate": 3.6383041366112176e-05, "loss": 0.0259, "step": 10420 }, { "epoch": 24.01, "grad_norm": 26.257125854492188, "learning_rate": 3.6364640070660976e-05, "loss": 0.0096, "step": 10430 }, { "epoch": 24.01, "grad_norm": 23.165929794311523, "learning_rate": 3.6346238775209776e-05, "loss": 0.1543, "step": 10440 }, { "epoch": 24.01, "grad_norm": 18.72815704345703, "learning_rate": 3.6327837479758576e-05, "loss": 0.1438, "step": 10450 }, { "epoch": 24.01, "grad_norm": 0.012924473732709885, "learning_rate": 3.6309436184307376e-05, "loss": 0.0911, "step": 10460 }, { "epoch": 24.01, "grad_norm": 16.28953742980957, "learning_rate": 3.6291034888856176e-05, "loss": 0.1934, "step": 10470 }, { "epoch": 24.01, "grad_norm": 2.0924813747406006, "learning_rate": 3.627263359340498e-05, "loss": 0.1541, "step": 10480 }, { "epoch": 24.01, "grad_norm": 10.997538566589355, "learning_rate": 3.625423229795378e-05, "loss": 0.1034, "step": 10490 }, { "epoch": 24.01, "grad_norm": 0.01888904720544815, "learning_rate": 3.623583100250258e-05, "loss": 0.0347, "step": 10500 }, { "epoch": 24.01, "eval_accuracy": 0.7027027027027027, "eval_loss": 1.4803065061569214, "eval_runtime": 38.8312, "eval_samples_per_second": 22.868, "eval_steps_per_second": 1.906, "step": 10500 }, { "epoch": 25.0, "grad_norm": 32.71762466430664, "learning_rate": 3.621742970705138e-05, "loss": 0.0376, "step": 10510 }, { "epoch": 25.0, "grad_norm": 1.0197267532348633, "learning_rate": 3.619902841160018e-05, "loss": 0.0717, "step": 10520 }, { "epoch": 25.0, "grad_norm": 0.022604642435908318, "learning_rate": 3.618062711614898e-05, "loss": 0.1172, "step": 10530 }, { "epoch": 25.0, "grad_norm": 25.57185173034668, "learning_rate": 3.6162225820697784e-05, "loss": 0.1384, "step": 10540 }, { "epoch": 25.0, "grad_norm": 0.04132658615708351, "learning_rate": 3.614382452524658e-05, "loss": 0.0613, "step": 10550 }, { "epoch": 25.0, "grad_norm": 0.005044014658778906, "learning_rate": 3.612542322979538e-05, "loss": 0.0643, "step": 10560 }, { "epoch": 25.0, "grad_norm": 32.0972900390625, "learning_rate": 3.6107021934344184e-05, "loss": 0.1037, "step": 10570 }, { "epoch": 25.0, "grad_norm": 0.08468390256166458, "learning_rate": 3.608862063889298e-05, "loss": 0.0824, "step": 10580 }, { "epoch": 25.0, "grad_norm": 0.014037560671567917, "learning_rate": 3.607021934344178e-05, "loss": 0.0706, "step": 10590 }, { "epoch": 25.0, "grad_norm": 0.02923491969704628, "learning_rate": 3.6051818047990585e-05, "loss": 0.0577, "step": 10600 }, { "epoch": 25.0, "grad_norm": 0.03039034642279148, "learning_rate": 3.603341675253938e-05, "loss": 0.0432, "step": 10610 }, { "epoch": 25.0, "grad_norm": 0.15451429784297943, "learning_rate": 3.601501545708818e-05, "loss": 0.1319, "step": 10620 }, { "epoch": 25.0, "grad_norm": 37.398616790771484, "learning_rate": 3.5996614161636985e-05, "loss": 0.1519, "step": 10630 }, { "epoch": 25.0, "grad_norm": 1.254022240638733, "learning_rate": 3.597821286618578e-05, "loss": 0.0676, "step": 10640 }, { "epoch": 25.0, "grad_norm": 18.051490783691406, "learning_rate": 3.595981157073458e-05, "loss": 0.057, "step": 10650 }, { "epoch": 25.01, "grad_norm": 0.7600337266921997, "learning_rate": 3.5941410275283385e-05, "loss": 0.1598, "step": 10660 }, { "epoch": 25.01, "grad_norm": 20.58978843688965, "learning_rate": 3.5923008979832186e-05, "loss": 0.3004, "step": 10670 }, { "epoch": 25.01, "grad_norm": 0.7854387164115906, "learning_rate": 3.590460768438098e-05, "loss": 0.1261, "step": 10680 }, { "epoch": 25.01, "grad_norm": 31.682432174682617, "learning_rate": 3.5886206388929786e-05, "loss": 0.1037, "step": 10690 }, { "epoch": 25.01, "grad_norm": 1.783712387084961, "learning_rate": 3.5867805093478586e-05, "loss": 0.0997, "step": 10700 }, { "epoch": 25.01, "grad_norm": 9.035534858703613, "learning_rate": 3.584940379802738e-05, "loss": 0.0946, "step": 10710 }, { "epoch": 25.01, "grad_norm": 21.357057571411133, "learning_rate": 3.5831002502576186e-05, "loss": 0.0336, "step": 10720 }, { "epoch": 25.01, "grad_norm": 7.840404033660889, "learning_rate": 3.5812601207124986e-05, "loss": 0.1378, "step": 10730 }, { "epoch": 25.01, "grad_norm": 10.447696685791016, "learning_rate": 3.579419991167378e-05, "loss": 0.0945, "step": 10740 }, { "epoch": 25.01, "grad_norm": 0.049855832010507584, "learning_rate": 3.5775798616222587e-05, "loss": 0.0878, "step": 10750 }, { "epoch": 25.01, "grad_norm": 16.85457992553711, "learning_rate": 3.575739732077139e-05, "loss": 0.0705, "step": 10760 }, { "epoch": 25.01, "grad_norm": 0.26030638813972473, "learning_rate": 3.573899602532018e-05, "loss": 0.0213, "step": 10770 }, { "epoch": 25.01, "grad_norm": 5.856488227844238, "learning_rate": 3.572059472986899e-05, "loss": 0.0092, "step": 10780 }, { "epoch": 25.01, "grad_norm": 0.013858279213309288, "learning_rate": 3.570219343441779e-05, "loss": 0.1014, "step": 10790 }, { "epoch": 25.01, "grad_norm": 12.681320190429688, "learning_rate": 3.568379213896659e-05, "loss": 0.0263, "step": 10800 }, { "epoch": 25.01, "grad_norm": 12.907841682434082, "learning_rate": 3.566539084351539e-05, "loss": 0.1036, "step": 10810 }, { "epoch": 25.01, "grad_norm": 0.20276179909706116, "learning_rate": 3.564698954806419e-05, "loss": 0.0841, "step": 10820 }, { "epoch": 25.01, "grad_norm": 0.01858721859753132, "learning_rate": 3.562858825261299e-05, "loss": 0.191, "step": 10830 }, { "epoch": 25.01, "grad_norm": 0.039093319326639175, "learning_rate": 3.561018695716178e-05, "loss": 0.0442, "step": 10840 }, { "epoch": 25.01, "grad_norm": 35.638572692871094, "learning_rate": 3.559178566171059e-05, "loss": 0.0427, "step": 10850 }, { "epoch": 25.01, "grad_norm": 0.050236549228429794, "learning_rate": 3.557338436625939e-05, "loss": 0.0294, "step": 10860 }, { "epoch": 25.01, "grad_norm": 0.2372741997241974, "learning_rate": 3.555498307080818e-05, "loss": 0.0848, "step": 10870 }, { "epoch": 25.01, "grad_norm": 0.03720271214842796, "learning_rate": 3.553658177535699e-05, "loss": 0.022, "step": 10880 }, { "epoch": 25.01, "grad_norm": 19.287675857543945, "learning_rate": 3.551818047990579e-05, "loss": 0.083, "step": 10890 }, { "epoch": 25.01, "grad_norm": 0.08347965776920319, "learning_rate": 3.549977918445458e-05, "loss": 0.0453, "step": 10900 }, { "epoch": 25.01, "grad_norm": 22.4985294342041, "learning_rate": 3.548137788900339e-05, "loss": 0.1279, "step": 10910 }, { "epoch": 25.01, "grad_norm": 0.03293849155306816, "learning_rate": 3.546297659355219e-05, "loss": 0.0396, "step": 10920 }, { "epoch": 25.01, "eval_accuracy": 0.7004504504504504, "eval_loss": 1.6456875801086426, "eval_runtime": 38.6971, "eval_samples_per_second": 22.947, "eval_steps_per_second": 1.912, "step": 10920 }, { "epoch": 26.0, "grad_norm": 0.10634168982505798, "learning_rate": 3.544457529810099e-05, "loss": 0.1908, "step": 10930 }, { "epoch": 26.0, "grad_norm": 0.012839309871196747, "learning_rate": 3.542617400264979e-05, "loss": 0.1257, "step": 10940 }, { "epoch": 26.0, "grad_norm": 20.624330520629883, "learning_rate": 3.540777270719859e-05, "loss": 0.1339, "step": 10950 }, { "epoch": 26.0, "grad_norm": 0.8575695753097534, "learning_rate": 3.538937141174739e-05, "loss": 0.1957, "step": 10960 }, { "epoch": 26.0, "grad_norm": 0.24717579782009125, "learning_rate": 3.537097011629619e-05, "loss": 0.0824, "step": 10970 }, { "epoch": 26.0, "grad_norm": 0.057337842881679535, "learning_rate": 3.535256882084499e-05, "loss": 0.0173, "step": 10980 }, { "epoch": 26.0, "grad_norm": 1.2183499336242676, "learning_rate": 3.533416752539379e-05, "loss": 0.0373, "step": 10990 }, { "epoch": 26.0, "grad_norm": 0.007798346225172281, "learning_rate": 3.531576622994259e-05, "loss": 0.0586, "step": 11000 }, { "epoch": 26.0, "grad_norm": 14.570333480834961, "learning_rate": 3.529736493449139e-05, "loss": 0.0178, "step": 11010 }, { "epoch": 26.0, "grad_norm": 17.163837432861328, "learning_rate": 3.527896363904019e-05, "loss": 0.1456, "step": 11020 }, { "epoch": 26.0, "grad_norm": 0.017764287069439888, "learning_rate": 3.526056234358899e-05, "loss": 0.0556, "step": 11030 }, { "epoch": 26.0, "grad_norm": 0.01081930659711361, "learning_rate": 3.524216104813779e-05, "loss": 0.1175, "step": 11040 }, { "epoch": 26.0, "grad_norm": 0.011412628926336765, "learning_rate": 3.522375975268659e-05, "loss": 0.039, "step": 11050 }, { "epoch": 26.0, "grad_norm": 0.015809055417776108, "learning_rate": 3.520535845723539e-05, "loss": 0.0942, "step": 11060 }, { "epoch": 26.0, "grad_norm": 0.015211720019578934, "learning_rate": 3.518695716178419e-05, "loss": 0.1319, "step": 11070 }, { "epoch": 26.01, "grad_norm": 47.66071319580078, "learning_rate": 3.516855586633299e-05, "loss": 0.1854, "step": 11080 }, { "epoch": 26.01, "grad_norm": 0.013697043061256409, "learning_rate": 3.515015457088179e-05, "loss": 0.0635, "step": 11090 }, { "epoch": 26.01, "grad_norm": 0.049156658351421356, "learning_rate": 3.513175327543059e-05, "loss": 0.0205, "step": 11100 }, { "epoch": 26.01, "grad_norm": 1.8962242603302002, "learning_rate": 3.511335197997939e-05, "loss": 0.0018, "step": 11110 }, { "epoch": 26.01, "grad_norm": 0.009972991421818733, "learning_rate": 3.509495068452819e-05, "loss": 0.03, "step": 11120 }, { "epoch": 26.01, "grad_norm": 0.1715347170829773, "learning_rate": 3.507654938907699e-05, "loss": 0.1898, "step": 11130 }, { "epoch": 26.01, "grad_norm": 0.05813472345471382, "learning_rate": 3.505814809362579e-05, "loss": 0.0597, "step": 11140 }, { "epoch": 26.01, "grad_norm": 10.253442764282227, "learning_rate": 3.503974679817459e-05, "loss": 0.1426, "step": 11150 }, { "epoch": 26.01, "grad_norm": 0.1588079035282135, "learning_rate": 3.50213455027234e-05, "loss": 0.1498, "step": 11160 }, { "epoch": 26.01, "grad_norm": 0.1124909520149231, "learning_rate": 3.500294420727219e-05, "loss": 0.1932, "step": 11170 }, { "epoch": 26.01, "grad_norm": 0.07209834456443787, "learning_rate": 3.498454291182099e-05, "loss": 0.1186, "step": 11180 }, { "epoch": 26.01, "grad_norm": 4.765960216522217, "learning_rate": 3.49661416163698e-05, "loss": 0.129, "step": 11190 }, { "epoch": 26.01, "grad_norm": 7.938549041748047, "learning_rate": 3.494774032091859e-05, "loss": 0.1319, "step": 11200 }, { "epoch": 26.01, "grad_norm": 10.07351016998291, "learning_rate": 3.492933902546739e-05, "loss": 0.1798, "step": 11210 }, { "epoch": 26.01, "grad_norm": 16.873706817626953, "learning_rate": 3.49109377300162e-05, "loss": 0.2251, "step": 11220 }, { "epoch": 26.01, "grad_norm": 16.359207153320312, "learning_rate": 3.489253643456499e-05, "loss": 0.1039, "step": 11230 }, { "epoch": 26.01, "grad_norm": 0.026186149567365646, "learning_rate": 3.487413513911379e-05, "loss": 0.0766, "step": 11240 }, { "epoch": 26.01, "grad_norm": 0.05351871997117996, "learning_rate": 3.48557338436626e-05, "loss": 0.1743, "step": 11250 }, { "epoch": 26.01, "grad_norm": 0.06096191704273224, "learning_rate": 3.483733254821139e-05, "loss": 0.0711, "step": 11260 }, { "epoch": 26.01, "grad_norm": 0.14688991010189056, "learning_rate": 3.481893125276019e-05, "loss": 0.1217, "step": 11270 }, { "epoch": 26.01, "grad_norm": 0.16987045109272003, "learning_rate": 3.4800529957309e-05, "loss": 0.0558, "step": 11280 }, { "epoch": 26.01, "grad_norm": 0.23579993844032288, "learning_rate": 3.47821286618578e-05, "loss": 0.0538, "step": 11290 }, { "epoch": 26.01, "grad_norm": 0.3610358238220215, "learning_rate": 3.476372736640659e-05, "loss": 0.1061, "step": 11300 }, { "epoch": 26.01, "grad_norm": 0.9386014938354492, "learning_rate": 3.47453260709554e-05, "loss": 0.0251, "step": 11310 }, { "epoch": 26.01, "grad_norm": 26.224563598632812, "learning_rate": 3.47269247755042e-05, "loss": 0.0344, "step": 11320 }, { "epoch": 26.01, "grad_norm": 0.008067339658737183, "learning_rate": 3.470852348005299e-05, "loss": 0.0606, "step": 11330 }, { "epoch": 26.01, "grad_norm": 0.00769740529358387, "learning_rate": 3.46901221846018e-05, "loss": 0.0074, "step": 11340 }, { "epoch": 26.01, "eval_accuracy": 0.704954954954955, "eval_loss": 1.5601754188537598, "eval_runtime": 38.8018, "eval_samples_per_second": 22.886, "eval_steps_per_second": 1.907, "step": 11340 }, { "epoch": 27.0, "grad_norm": 0.13885366916656494, "learning_rate": 3.46717208891506e-05, "loss": 0.1765, "step": 11350 }, { "epoch": 27.0, "grad_norm": 0.03417723625898361, "learning_rate": 3.4653319593699394e-05, "loss": 0.1657, "step": 11360 }, { "epoch": 27.0, "grad_norm": 0.009304393082857132, "learning_rate": 3.46349182982482e-05, "loss": 0.1211, "step": 11370 }, { "epoch": 27.0, "grad_norm": 0.16966260969638824, "learning_rate": 3.4616517002797e-05, "loss": 0.019, "step": 11380 }, { "epoch": 27.0, "grad_norm": 1.482495665550232, "learning_rate": 3.4598115707345794e-05, "loss": 0.0346, "step": 11390 }, { "epoch": 27.0, "grad_norm": 0.08011610060930252, "learning_rate": 3.45797144118946e-05, "loss": 0.0895, "step": 11400 }, { "epoch": 27.0, "grad_norm": 0.025240659713745117, "learning_rate": 3.45613131164434e-05, "loss": 0.1737, "step": 11410 }, { "epoch": 27.0, "grad_norm": 15.813284873962402, "learning_rate": 3.45429118209922e-05, "loss": 0.0534, "step": 11420 }, { "epoch": 27.0, "grad_norm": 22.745393753051758, "learning_rate": 3.4524510525541e-05, "loss": 0.1105, "step": 11430 }, { "epoch": 27.0, "grad_norm": 0.018844788894057274, "learning_rate": 3.45061092300898e-05, "loss": 0.0072, "step": 11440 }, { "epoch": 27.0, "grad_norm": 0.08511705696582794, "learning_rate": 3.44877079346386e-05, "loss": 0.1052, "step": 11450 }, { "epoch": 27.0, "grad_norm": 0.018431710079312325, "learning_rate": 3.44693066391874e-05, "loss": 0.0745, "step": 11460 }, { "epoch": 27.0, "grad_norm": 0.060771312564611435, "learning_rate": 3.44509053437362e-05, "loss": 0.026, "step": 11470 }, { "epoch": 27.0, "grad_norm": 0.01608353666961193, "learning_rate": 3.4432504048285e-05, "loss": 0.0576, "step": 11480 }, { "epoch": 27.0, "grad_norm": 0.0488639660179615, "learning_rate": 3.44141027528338e-05, "loss": 0.1017, "step": 11490 }, { "epoch": 27.01, "grad_norm": 28.09701156616211, "learning_rate": 3.43957014573826e-05, "loss": 0.0847, "step": 11500 }, { "epoch": 27.01, "grad_norm": 0.2560718357563019, "learning_rate": 3.43773001619314e-05, "loss": 0.0066, "step": 11510 }, { "epoch": 27.01, "grad_norm": 0.11808411777019501, "learning_rate": 3.43588988664802e-05, "loss": 0.1041, "step": 11520 }, { "epoch": 27.01, "grad_norm": 0.3728163540363312, "learning_rate": 3.4340497571029e-05, "loss": 0.0721, "step": 11530 }, { "epoch": 27.01, "grad_norm": 6.730001449584961, "learning_rate": 3.43220962755778e-05, "loss": 0.0634, "step": 11540 }, { "epoch": 27.01, "grad_norm": 63.36409378051758, "learning_rate": 3.43036949801266e-05, "loss": 0.2596, "step": 11550 }, { "epoch": 27.01, "grad_norm": 0.0218398105353117, "learning_rate": 3.42852936846754e-05, "loss": 0.0739, "step": 11560 }, { "epoch": 27.01, "grad_norm": 7.553366661071777, "learning_rate": 3.42668923892242e-05, "loss": 0.167, "step": 11570 }, { "epoch": 27.01, "grad_norm": 0.7259697914123535, "learning_rate": 3.4248491093773e-05, "loss": 0.1717, "step": 11580 }, { "epoch": 27.01, "grad_norm": 0.05467524379491806, "learning_rate": 3.42300897983218e-05, "loss": 0.043, "step": 11590 }, { "epoch": 27.01, "grad_norm": 0.04555194452404976, "learning_rate": 3.42116885028706e-05, "loss": 0.0153, "step": 11600 }, { "epoch": 27.01, "grad_norm": 7.373996734619141, "learning_rate": 3.41932872074194e-05, "loss": 0.1179, "step": 11610 }, { "epoch": 27.01, "grad_norm": 31.452836990356445, "learning_rate": 3.41748859119682e-05, "loss": 0.0755, "step": 11620 }, { "epoch": 27.01, "grad_norm": 9.720723152160645, "learning_rate": 3.4156484616517003e-05, "loss": 0.1386, "step": 11630 }, { "epoch": 27.01, "grad_norm": 0.06120923161506653, "learning_rate": 3.4138083321065804e-05, "loss": 0.167, "step": 11640 }, { "epoch": 27.01, "grad_norm": 0.039387207478284836, "learning_rate": 3.4119682025614604e-05, "loss": 0.0332, "step": 11650 }, { "epoch": 27.01, "grad_norm": 0.19560861587524414, "learning_rate": 3.4101280730163404e-05, "loss": 0.0708, "step": 11660 }, { "epoch": 27.01, "grad_norm": 2.266158103942871, "learning_rate": 3.4082879434712204e-05, "loss": 0.0311, "step": 11670 }, { "epoch": 27.01, "grad_norm": 0.20093105733394623, "learning_rate": 3.406447813926101e-05, "loss": 0.1291, "step": 11680 }, { "epoch": 27.01, "grad_norm": 0.83036869764328, "learning_rate": 3.4046076843809804e-05, "loss": 0.0734, "step": 11690 }, { "epoch": 27.01, "grad_norm": 5.829738140106201, "learning_rate": 3.4027675548358604e-05, "loss": 0.0868, "step": 11700 }, { "epoch": 27.01, "grad_norm": 7.233109474182129, "learning_rate": 3.400927425290741e-05, "loss": 0.1265, "step": 11710 }, { "epoch": 27.01, "grad_norm": 0.12846483290195465, "learning_rate": 3.3990872957456204e-05, "loss": 0.0854, "step": 11720 }, { "epoch": 27.01, "grad_norm": 9.74392318725586, "learning_rate": 3.3972471662005005e-05, "loss": 0.1329, "step": 11730 }, { "epoch": 27.01, "grad_norm": 0.6316181421279907, "learning_rate": 3.395407036655381e-05, "loss": 0.0495, "step": 11740 }, { "epoch": 27.01, "grad_norm": 4.9387054443359375, "learning_rate": 3.3935669071102605e-05, "loss": 0.006, "step": 11750 }, { "epoch": 27.01, "grad_norm": 0.014866938814520836, "learning_rate": 3.3917267775651405e-05, "loss": 0.1256, "step": 11760 }, { "epoch": 27.01, "eval_accuracy": 0.7173423423423423, "eval_loss": 1.3965140581130981, "eval_runtime": 38.9369, "eval_samples_per_second": 22.806, "eval_steps_per_second": 1.901, "step": 11760 }, { "epoch": 28.0, "grad_norm": 0.08045655488967896, "learning_rate": 3.389886648020021e-05, "loss": 0.0437, "step": 11770 }, { "epoch": 28.0, "grad_norm": 0.04858655855059624, "learning_rate": 3.388046518474901e-05, "loss": 0.0011, "step": 11780 }, { "epoch": 28.0, "grad_norm": 9.89545726776123, "learning_rate": 3.3862063889297805e-05, "loss": 0.0796, "step": 11790 }, { "epoch": 28.0, "grad_norm": 0.09138563275337219, "learning_rate": 3.384366259384661e-05, "loss": 0.0482, "step": 11800 }, { "epoch": 28.0, "grad_norm": 0.07711490988731384, "learning_rate": 3.382526129839541e-05, "loss": 0.0255, "step": 11810 }, { "epoch": 28.0, "grad_norm": 21.119638442993164, "learning_rate": 3.3806860002944206e-05, "loss": 0.0481, "step": 11820 }, { "epoch": 28.0, "grad_norm": 0.01722894050180912, "learning_rate": 3.378845870749301e-05, "loss": 0.0201, "step": 11830 }, { "epoch": 28.0, "grad_norm": 0.012293045409023762, "learning_rate": 3.377005741204181e-05, "loss": 0.1236, "step": 11840 }, { "epoch": 28.0, "grad_norm": 4.5801873207092285, "learning_rate": 3.3751656116590606e-05, "loss": 0.0025, "step": 11850 }, { "epoch": 28.0, "grad_norm": 0.039160292595624924, "learning_rate": 3.373325482113941e-05, "loss": 0.0014, "step": 11860 }, { "epoch": 28.0, "grad_norm": 0.021740267053246498, "learning_rate": 3.371485352568821e-05, "loss": 0.0528, "step": 11870 }, { "epoch": 28.0, "grad_norm": 0.004420125856995583, "learning_rate": 3.3696452230237006e-05, "loss": 0.1623, "step": 11880 }, { "epoch": 28.0, "grad_norm": 2.0642523765563965, "learning_rate": 3.3678050934785806e-05, "loss": 0.0021, "step": 11890 }, { "epoch": 28.0, "grad_norm": 13.541433334350586, "learning_rate": 3.365964963933461e-05, "loss": 0.1623, "step": 11900 }, { "epoch": 28.0, "grad_norm": 0.06738744676113129, "learning_rate": 3.3641248343883413e-05, "loss": 0.0012, "step": 11910 }, { "epoch": 28.01, "grad_norm": 0.05531314015388489, "learning_rate": 3.362284704843221e-05, "loss": 0.1722, "step": 11920 }, { "epoch": 28.01, "grad_norm": 12.824578285217285, "learning_rate": 3.3604445752981014e-05, "loss": 0.1559, "step": 11930 }, { "epoch": 28.01, "grad_norm": 0.024552879855036736, "learning_rate": 3.3586044457529814e-05, "loss": 0.0032, "step": 11940 }, { "epoch": 28.01, "grad_norm": 10.314522743225098, "learning_rate": 3.356764316207861e-05, "loss": 0.0975, "step": 11950 }, { "epoch": 28.01, "grad_norm": 50.956687927246094, "learning_rate": 3.3549241866627414e-05, "loss": 0.1327, "step": 11960 }, { "epoch": 28.01, "grad_norm": 10.152609825134277, "learning_rate": 3.3530840571176214e-05, "loss": 0.0645, "step": 11970 }, { "epoch": 28.01, "grad_norm": 53.54362106323242, "learning_rate": 3.351243927572501e-05, "loss": 0.0216, "step": 11980 }, { "epoch": 28.01, "grad_norm": 0.24693238735198975, "learning_rate": 3.3494037980273814e-05, "loss": 0.0092, "step": 11990 }, { "epoch": 28.01, "grad_norm": 39.524871826171875, "learning_rate": 3.3475636684822614e-05, "loss": 0.0326, "step": 12000 }, { "epoch": 28.01, "grad_norm": 2.962416172027588, "learning_rate": 3.345723538937141e-05, "loss": 0.0028, "step": 12010 }, { "epoch": 28.01, "grad_norm": 0.015500775538384914, "learning_rate": 3.3438834093920215e-05, "loss": 0.1865, "step": 12020 }, { "epoch": 28.01, "grad_norm": 50.48593521118164, "learning_rate": 3.3420432798469015e-05, "loss": 0.116, "step": 12030 }, { "epoch": 28.01, "grad_norm": 0.811002254486084, "learning_rate": 3.3402031503017815e-05, "loss": 0.0727, "step": 12040 }, { "epoch": 28.01, "grad_norm": 56.46418380737305, "learning_rate": 3.3383630207566615e-05, "loss": 0.0967, "step": 12050 }, { "epoch": 28.01, "grad_norm": 0.04430406540632248, "learning_rate": 3.3365228912115415e-05, "loss": 0.0665, "step": 12060 }, { "epoch": 28.01, "grad_norm": 0.013381538912653923, "learning_rate": 3.3346827616664215e-05, "loss": 0.0414, "step": 12070 }, { "epoch": 28.01, "grad_norm": 0.0382387675344944, "learning_rate": 3.3328426321213015e-05, "loss": 0.0819, "step": 12080 }, { "epoch": 28.01, "grad_norm": 0.07482504844665527, "learning_rate": 3.3310025025761816e-05, "loss": 0.0756, "step": 12090 }, { "epoch": 28.01, "grad_norm": 35.36876678466797, "learning_rate": 3.3291623730310616e-05, "loss": 0.1254, "step": 12100 }, { "epoch": 28.01, "grad_norm": 0.0059246160089969635, "learning_rate": 3.3273222434859416e-05, "loss": 0.1452, "step": 12110 }, { "epoch": 28.01, "grad_norm": 8.87775707244873, "learning_rate": 3.3254821139408216e-05, "loss": 0.1301, "step": 12120 }, { "epoch": 28.01, "grad_norm": 0.0224022027105093, "learning_rate": 3.3236419843957016e-05, "loss": 0.04, "step": 12130 }, { "epoch": 28.01, "grad_norm": 0.08252600580453873, "learning_rate": 3.3218018548505816e-05, "loss": 0.0537, "step": 12140 }, { "epoch": 28.01, "grad_norm": 0.013457234017550945, "learning_rate": 3.3199617253054616e-05, "loss": 0.0159, "step": 12150 }, { "epoch": 28.01, "grad_norm": 0.1613243669271469, "learning_rate": 3.3181215957603416e-05, "loss": 0.1108, "step": 12160 }, { "epoch": 28.01, "grad_norm": 0.0809352844953537, "learning_rate": 3.3162814662152216e-05, "loss": 0.001, "step": 12170 }, { "epoch": 28.01, "grad_norm": 0.011667724698781967, "learning_rate": 3.3144413366701017e-05, "loss": 0.0021, "step": 12180 }, { "epoch": 28.01, "eval_accuracy": 0.7342342342342343, "eval_loss": 1.4513802528381348, "eval_runtime": 38.794, "eval_samples_per_second": 22.89, "eval_steps_per_second": 1.908, "step": 12180 }, { "epoch": 29.0, "grad_norm": 0.02104813978075981, "learning_rate": 3.312601207124982e-05, "loss": 0.0928, "step": 12190 }, { "epoch": 29.0, "grad_norm": 0.006996306590735912, "learning_rate": 3.310761077579862e-05, "loss": 0.0851, "step": 12200 }, { "epoch": 29.0, "grad_norm": 0.006665319669991732, "learning_rate": 3.308920948034742e-05, "loss": 0.0012, "step": 12210 }, { "epoch": 29.0, "grad_norm": 17.009796142578125, "learning_rate": 3.307080818489622e-05, "loss": 0.1042, "step": 12220 }, { "epoch": 29.0, "grad_norm": 0.015308617614209652, "learning_rate": 3.305240688944502e-05, "loss": 0.0524, "step": 12230 }, { "epoch": 29.0, "grad_norm": 12.653825759887695, "learning_rate": 3.303400559399382e-05, "loss": 0.1284, "step": 12240 }, { "epoch": 29.0, "grad_norm": 44.44422149658203, "learning_rate": 3.301560429854262e-05, "loss": 0.0961, "step": 12250 }, { "epoch": 29.0, "grad_norm": 0.007318226154893637, "learning_rate": 3.299720300309142e-05, "loss": 0.0011, "step": 12260 }, { "epoch": 29.0, "grad_norm": 0.03102783113718033, "learning_rate": 3.297880170764022e-05, "loss": 0.0264, "step": 12270 }, { "epoch": 29.0, "grad_norm": 0.03847644105553627, "learning_rate": 3.296040041218902e-05, "loss": 0.1429, "step": 12280 }, { "epoch": 29.0, "grad_norm": 13.650267601013184, "learning_rate": 3.294199911673782e-05, "loss": 0.1052, "step": 12290 }, { "epoch": 29.0, "grad_norm": 0.36820557713508606, "learning_rate": 3.2923597821286625e-05, "loss": 0.0654, "step": 12300 }, { "epoch": 29.0, "grad_norm": 0.013632736168801785, "learning_rate": 3.290519652583542e-05, "loss": 0.0014, "step": 12310 }, { "epoch": 29.0, "grad_norm": 0.00829467736184597, "learning_rate": 3.288679523038422e-05, "loss": 0.0232, "step": 12320 }, { "epoch": 29.0, "grad_norm": 0.01030084490776062, "learning_rate": 3.2868393934933025e-05, "loss": 0.0835, "step": 12330 }, { "epoch": 29.01, "grad_norm": 0.01338445208966732, "learning_rate": 3.284999263948182e-05, "loss": 0.0216, "step": 12340 }, { "epoch": 29.01, "grad_norm": 0.33869656920433044, "learning_rate": 3.283159134403062e-05, "loss": 0.0056, "step": 12350 }, { "epoch": 29.01, "grad_norm": 0.016546163707971573, "learning_rate": 3.2813190048579425e-05, "loss": 0.0764, "step": 12360 }, { "epoch": 29.01, "grad_norm": 4.044371128082275, "learning_rate": 3.279478875312822e-05, "loss": 0.0997, "step": 12370 }, { "epoch": 29.01, "grad_norm": 0.11344069987535477, "learning_rate": 3.277638745767702e-05, "loss": 0.0927, "step": 12380 }, { "epoch": 29.01, "grad_norm": 0.09442989528179169, "learning_rate": 3.2757986162225826e-05, "loss": 0.0375, "step": 12390 }, { "epoch": 29.01, "grad_norm": 0.032961271703243256, "learning_rate": 3.2739584866774626e-05, "loss": 0.0655, "step": 12400 }, { "epoch": 29.01, "grad_norm": 29.774293899536133, "learning_rate": 3.272118357132342e-05, "loss": 0.0449, "step": 12410 }, { "epoch": 29.01, "grad_norm": 28.634647369384766, "learning_rate": 3.2702782275872226e-05, "loss": 0.1557, "step": 12420 }, { "epoch": 29.01, "grad_norm": 0.052351102232933044, "learning_rate": 3.2684380980421026e-05, "loss": 0.0192, "step": 12430 }, { "epoch": 29.01, "grad_norm": 0.08391708880662918, "learning_rate": 3.266597968496982e-05, "loss": 0.0334, "step": 12440 }, { "epoch": 29.01, "grad_norm": 0.0222425889223814, "learning_rate": 3.2647578389518626e-05, "loss": 0.0608, "step": 12450 }, { "epoch": 29.01, "grad_norm": 0.010909227654337883, "learning_rate": 3.2629177094067427e-05, "loss": 0.0445, "step": 12460 }, { "epoch": 29.01, "grad_norm": 3.7934606075286865, "learning_rate": 3.261077579861622e-05, "loss": 0.0854, "step": 12470 }, { "epoch": 29.01, "grad_norm": 5.625136852264404, "learning_rate": 3.259237450316503e-05, "loss": 0.0471, "step": 12480 }, { "epoch": 29.01, "grad_norm": 30.222606658935547, "learning_rate": 3.257397320771383e-05, "loss": 0.1509, "step": 12490 }, { "epoch": 29.01, "grad_norm": 0.051470424979925156, "learning_rate": 3.255557191226262e-05, "loss": 0.1511, "step": 12500 }, { "epoch": 29.01, "grad_norm": 4.000987529754639, "learning_rate": 3.253717061681143e-05, "loss": 0.0246, "step": 12510 }, { "epoch": 29.01, "grad_norm": 63.03067398071289, "learning_rate": 3.251876932136023e-05, "loss": 0.1366, "step": 12520 }, { "epoch": 29.01, "grad_norm": 0.11710133403539658, "learning_rate": 3.250036802590903e-05, "loss": 0.001, "step": 12530 }, { "epoch": 29.01, "grad_norm": 0.9972437620162964, "learning_rate": 3.248196673045783e-05, "loss": 0.08, "step": 12540 }, { "epoch": 29.01, "grad_norm": 7.652525901794434, "learning_rate": 3.246356543500663e-05, "loss": 0.1765, "step": 12550 }, { "epoch": 29.01, "grad_norm": 0.347403883934021, "learning_rate": 3.244516413955543e-05, "loss": 0.0018, "step": 12560 }, { "epoch": 29.01, "grad_norm": 0.07959982007741928, "learning_rate": 3.242676284410423e-05, "loss": 0.0426, "step": 12570 }, { "epoch": 29.01, "grad_norm": 0.03433597460389137, "learning_rate": 3.240836154865303e-05, "loss": 0.1113, "step": 12580 }, { "epoch": 29.01, "grad_norm": 0.1946258246898651, "learning_rate": 3.238996025320183e-05, "loss": 0.1299, "step": 12590 }, { "epoch": 29.01, "grad_norm": 0.2794837951660156, "learning_rate": 3.237155895775063e-05, "loss": 0.0476, "step": 12600 }, { "epoch": 29.01, "eval_accuracy": 0.7173423423423423, "eval_loss": 1.291523814201355, "eval_runtime": 38.9534, "eval_samples_per_second": 22.796, "eval_steps_per_second": 1.9, "step": 12600 }, { "epoch": 30.0, "grad_norm": 32.00685501098633, "learning_rate": 3.235315766229943e-05, "loss": 0.1156, "step": 12610 }, { "epoch": 30.0, "grad_norm": 0.07065641134977341, "learning_rate": 3.233475636684823e-05, "loss": 0.0397, "step": 12620 }, { "epoch": 30.0, "grad_norm": 3.7180702686309814, "learning_rate": 3.231635507139703e-05, "loss": 0.0469, "step": 12630 }, { "epoch": 30.0, "grad_norm": 0.03295394033193588, "learning_rate": 3.229795377594583e-05, "loss": 0.0384, "step": 12640 }, { "epoch": 30.0, "grad_norm": 0.0369311086833477, "learning_rate": 3.227955248049463e-05, "loss": 0.0088, "step": 12650 }, { "epoch": 30.0, "grad_norm": 0.24643197655677795, "learning_rate": 3.226115118504343e-05, "loss": 0.0652, "step": 12660 }, { "epoch": 30.0, "grad_norm": 0.02824876271188259, "learning_rate": 3.224274988959223e-05, "loss": 0.0169, "step": 12670 }, { "epoch": 30.0, "grad_norm": 5.246930122375488, "learning_rate": 3.222434859414103e-05, "loss": 0.0051, "step": 12680 }, { "epoch": 30.0, "grad_norm": 47.156028747558594, "learning_rate": 3.220594729868983e-05, "loss": 0.0966, "step": 12690 }, { "epoch": 30.0, "grad_norm": 0.023476136848330498, "learning_rate": 3.218754600323863e-05, "loss": 0.1579, "step": 12700 }, { "epoch": 30.0, "grad_norm": 0.12407675385475159, "learning_rate": 3.216914470778743e-05, "loss": 0.048, "step": 12710 }, { "epoch": 30.0, "grad_norm": 0.08669500052928925, "learning_rate": 3.215074341233623e-05, "loss": 0.0019, "step": 12720 }, { "epoch": 30.0, "grad_norm": 37.406314849853516, "learning_rate": 3.213234211688503e-05, "loss": 0.1033, "step": 12730 }, { "epoch": 30.0, "grad_norm": 38.6237678527832, "learning_rate": 3.211394082143383e-05, "loss": 0.11, "step": 12740 }, { "epoch": 30.0, "grad_norm": 0.009670387022197247, "learning_rate": 3.209553952598263e-05, "loss": 0.0324, "step": 12750 }, { "epoch": 30.01, "grad_norm": 0.011206231079995632, "learning_rate": 3.207713823053143e-05, "loss": 0.1248, "step": 12760 }, { "epoch": 30.01, "grad_norm": 0.17277628183364868, "learning_rate": 3.205873693508023e-05, "loss": 0.0164, "step": 12770 }, { "epoch": 30.01, "grad_norm": 75.500244140625, "learning_rate": 3.204033563962903e-05, "loss": 0.1048, "step": 12780 }, { "epoch": 30.01, "grad_norm": 0.032317329198122025, "learning_rate": 3.202193434417784e-05, "loss": 0.0273, "step": 12790 }, { "epoch": 30.01, "grad_norm": 0.012517527677118778, "learning_rate": 3.200353304872663e-05, "loss": 0.0304, "step": 12800 }, { "epoch": 30.01, "grad_norm": 0.05787045508623123, "learning_rate": 3.198513175327543e-05, "loss": 0.0156, "step": 12810 }, { "epoch": 30.01, "grad_norm": 0.5614012479782104, "learning_rate": 3.196673045782424e-05, "loss": 0.1343, "step": 12820 }, { "epoch": 30.01, "grad_norm": 0.007453802041709423, "learning_rate": 3.194832916237303e-05, "loss": 0.0983, "step": 12830 }, { "epoch": 30.01, "grad_norm": 0.10857084393501282, "learning_rate": 3.192992786692183e-05, "loss": 0.0024, "step": 12840 }, { "epoch": 30.01, "grad_norm": 0.0047862897627055645, "learning_rate": 3.191152657147064e-05, "loss": 0.1283, "step": 12850 }, { "epoch": 30.01, "grad_norm": 0.009203149937093258, "learning_rate": 3.189312527601943e-05, "loss": 0.0203, "step": 12860 }, { "epoch": 30.01, "grad_norm": 0.26198357343673706, "learning_rate": 3.187472398056823e-05, "loss": 0.1059, "step": 12870 }, { "epoch": 30.01, "grad_norm": 0.0081553990021348, "learning_rate": 3.185632268511704e-05, "loss": 0.1133, "step": 12880 }, { "epoch": 30.01, "grad_norm": 16.972984313964844, "learning_rate": 3.183792138966583e-05, "loss": 0.193, "step": 12890 }, { "epoch": 30.01, "grad_norm": 0.27453407645225525, "learning_rate": 3.181952009421463e-05, "loss": 0.0293, "step": 12900 }, { "epoch": 30.01, "grad_norm": 20.03423500061035, "learning_rate": 3.180111879876343e-05, "loss": 0.0772, "step": 12910 }, { "epoch": 30.01, "grad_norm": 0.0040716310031712055, "learning_rate": 3.178271750331224e-05, "loss": 0.0139, "step": 12920 }, { "epoch": 30.01, "grad_norm": 10.097931861877441, "learning_rate": 3.176431620786103e-05, "loss": 0.0693, "step": 12930 }, { "epoch": 30.01, "grad_norm": 0.01247051265090704, "learning_rate": 3.174591491240983e-05, "loss": 0.103, "step": 12940 }, { "epoch": 30.01, "grad_norm": 0.022959919646382332, "learning_rate": 3.172751361695864e-05, "loss": 0.0341, "step": 12950 }, { "epoch": 30.01, "grad_norm": 0.004824052099138498, "learning_rate": 3.170911232150743e-05, "loss": 0.0056, "step": 12960 }, { "epoch": 30.01, "grad_norm": 12.092653274536133, "learning_rate": 3.169071102605623e-05, "loss": 0.1353, "step": 12970 }, { "epoch": 30.01, "grad_norm": 0.925576388835907, "learning_rate": 3.167230973060504e-05, "loss": 0.0969, "step": 12980 }, { "epoch": 30.01, "grad_norm": 0.07548154145479202, "learning_rate": 3.165390843515383e-05, "loss": 0.1926, "step": 12990 }, { "epoch": 30.01, "grad_norm": 0.11369650810956955, "learning_rate": 3.163550713970263e-05, "loss": 0.1268, "step": 13000 }, { "epoch": 30.01, "grad_norm": 9.48705005645752, "learning_rate": 3.161710584425144e-05, "loss": 0.1238, "step": 13010 }, { "epoch": 30.01, "grad_norm": 0.051245737820863724, "learning_rate": 3.159870454880024e-05, "loss": 0.0065, "step": 13020 }, { "epoch": 30.01, "eval_accuracy": 0.7094594594594594, "eval_loss": 1.3396903276443481, "eval_runtime": 39.2399, "eval_samples_per_second": 22.63, "eval_steps_per_second": 1.886, "step": 13020 }, { "epoch": 31.0, "grad_norm": 0.041375719010829926, "learning_rate": 3.158030325334903e-05, "loss": 0.0015, "step": 13030 }, { "epoch": 31.0, "grad_norm": 13.484585762023926, "learning_rate": 3.156190195789784e-05, "loss": 0.0868, "step": 13040 }, { "epoch": 31.0, "grad_norm": 16.3665714263916, "learning_rate": 3.154350066244664e-05, "loss": 0.0683, "step": 13050 }, { "epoch": 31.0, "grad_norm": 18.90866470336914, "learning_rate": 3.1525099366995434e-05, "loss": 0.1037, "step": 13060 }, { "epoch": 31.0, "grad_norm": 14.579414367675781, "learning_rate": 3.150669807154424e-05, "loss": 0.1559, "step": 13070 }, { "epoch": 31.0, "grad_norm": 4.739386558532715, "learning_rate": 3.148829677609304e-05, "loss": 0.0373, "step": 13080 }, { "epoch": 31.0, "grad_norm": 0.026588771492242813, "learning_rate": 3.1469895480641834e-05, "loss": 0.0876, "step": 13090 }, { "epoch": 31.0, "grad_norm": 49.45380401611328, "learning_rate": 3.145149418519064e-05, "loss": 0.0323, "step": 13100 }, { "epoch": 31.0, "grad_norm": 0.015482367016375065, "learning_rate": 3.143309288973944e-05, "loss": 0.0893, "step": 13110 }, { "epoch": 31.0, "grad_norm": 0.029974903911352158, "learning_rate": 3.1414691594288234e-05, "loss": 0.0616, "step": 13120 }, { "epoch": 31.0, "grad_norm": 0.11971423774957657, "learning_rate": 3.139629029883704e-05, "loss": 0.026, "step": 13130 }, { "epoch": 31.0, "grad_norm": 24.187334060668945, "learning_rate": 3.137788900338584e-05, "loss": 0.0959, "step": 13140 }, { "epoch": 31.0, "grad_norm": 0.07652156054973602, "learning_rate": 3.135948770793464e-05, "loss": 0.0014, "step": 13150 }, { "epoch": 31.0, "grad_norm": 0.26165226101875305, "learning_rate": 3.134108641248344e-05, "loss": 0.0068, "step": 13160 }, { "epoch": 31.0, "grad_norm": 0.27254414558410645, "learning_rate": 3.132268511703224e-05, "loss": 0.0008, "step": 13170 }, { "epoch": 31.01, "grad_norm": 10.051579475402832, "learning_rate": 3.130428382158104e-05, "loss": 0.1076, "step": 13180 }, { "epoch": 31.01, "grad_norm": 0.21670132875442505, "learning_rate": 3.128588252612984e-05, "loss": 0.0178, "step": 13190 }, { "epoch": 31.01, "grad_norm": 7.604001522064209, "learning_rate": 3.126748123067864e-05, "loss": 0.0622, "step": 13200 }, { "epoch": 31.01, "grad_norm": 0.004798478446900845, "learning_rate": 3.124907993522744e-05, "loss": 0.04, "step": 13210 }, { "epoch": 31.01, "grad_norm": 0.05649665370583534, "learning_rate": 3.123067863977624e-05, "loss": 0.0325, "step": 13220 }, { "epoch": 31.01, "grad_norm": 0.013989850878715515, "learning_rate": 3.121227734432504e-05, "loss": 0.0669, "step": 13230 }, { "epoch": 31.01, "grad_norm": 0.2536843717098236, "learning_rate": 3.119387604887384e-05, "loss": 0.0344, "step": 13240 }, { "epoch": 31.01, "grad_norm": 18.08637809753418, "learning_rate": 3.117547475342264e-05, "loss": 0.1827, "step": 13250 }, { "epoch": 31.01, "grad_norm": 0.4526243209838867, "learning_rate": 3.115707345797144e-05, "loss": 0.0668, "step": 13260 }, { "epoch": 31.01, "grad_norm": 26.46347999572754, "learning_rate": 3.113867216252024e-05, "loss": 0.0317, "step": 13270 }, { "epoch": 31.01, "grad_norm": 10.450908660888672, "learning_rate": 3.112027086706904e-05, "loss": 0.1146, "step": 13280 }, { "epoch": 31.01, "grad_norm": 15.456535339355469, "learning_rate": 3.110186957161784e-05, "loss": 0.0594, "step": 13290 }, { "epoch": 31.01, "grad_norm": 0.1086319163441658, "learning_rate": 3.108346827616664e-05, "loss": 0.0602, "step": 13300 }, { "epoch": 31.01, "grad_norm": 14.730328559875488, "learning_rate": 3.106506698071544e-05, "loss": 0.106, "step": 13310 }, { "epoch": 31.01, "grad_norm": 37.09258270263672, "learning_rate": 3.104666568526424e-05, "loss": 0.0091, "step": 13320 }, { "epoch": 31.01, "grad_norm": 1.7202033996582031, "learning_rate": 3.1028264389813043e-05, "loss": 0.1634, "step": 13330 }, { "epoch": 31.01, "grad_norm": 11.892850875854492, "learning_rate": 3.1009863094361844e-05, "loss": 0.0566, "step": 13340 }, { "epoch": 31.01, "grad_norm": 0.0069226413033902645, "learning_rate": 3.0991461798910644e-05, "loss": 0.0477, "step": 13350 }, { "epoch": 31.01, "grad_norm": 46.1098518371582, "learning_rate": 3.0973060503459444e-05, "loss": 0.0911, "step": 13360 }, { "epoch": 31.01, "grad_norm": 0.013168955221772194, "learning_rate": 3.0954659208008244e-05, "loss": 0.0538, "step": 13370 }, { "epoch": 31.01, "grad_norm": 27.440155029296875, "learning_rate": 3.0936257912557044e-05, "loss": 0.1327, "step": 13380 }, { "epoch": 31.01, "grad_norm": 0.0305141843855381, "learning_rate": 3.0917856617105844e-05, "loss": 0.039, "step": 13390 }, { "epoch": 31.01, "grad_norm": 0.005779525265097618, "learning_rate": 3.0899455321654644e-05, "loss": 0.0019, "step": 13400 }, { "epoch": 31.01, "grad_norm": 0.060245249420404434, "learning_rate": 3.088105402620345e-05, "loss": 0.0031, "step": 13410 }, { "epoch": 31.01, "grad_norm": 0.0227971151471138, "learning_rate": 3.0862652730752244e-05, "loss": 0.0631, "step": 13420 }, { "epoch": 31.01, "grad_norm": 0.02739185467362404, "learning_rate": 3.0844251435301045e-05, "loss": 0.0543, "step": 13430 }, { "epoch": 31.01, "grad_norm": 0.001662073889747262, "learning_rate": 3.082585013984985e-05, "loss": 0.0435, "step": 13440 }, { "epoch": 31.01, "eval_accuracy": 0.6948198198198198, "eval_loss": 1.8911927938461304, "eval_runtime": 38.7968, "eval_samples_per_second": 22.888, "eval_steps_per_second": 1.907, "step": 13440 }, { "epoch": 32.0, "grad_norm": 0.0038868181873112917, "learning_rate": 3.0807448844398645e-05, "loss": 0.1149, "step": 13450 }, { "epoch": 32.0, "grad_norm": 0.19289876520633698, "learning_rate": 3.0789047548947445e-05, "loss": 0.1026, "step": 13460 }, { "epoch": 32.0, "grad_norm": 0.35097235441207886, "learning_rate": 3.077064625349625e-05, "loss": 0.0466, "step": 13470 }, { "epoch": 32.0, "grad_norm": 0.012586521916091442, "learning_rate": 3.0752244958045045e-05, "loss": 0.0783, "step": 13480 }, { "epoch": 32.0, "grad_norm": 0.3049808442592621, "learning_rate": 3.0733843662593845e-05, "loss": 0.1405, "step": 13490 }, { "epoch": 32.0, "grad_norm": 0.13303309679031372, "learning_rate": 3.071544236714265e-05, "loss": 0.0215, "step": 13500 }, { "epoch": 32.0, "grad_norm": 0.060418274253606796, "learning_rate": 3.0697041071691445e-05, "loss": 0.0711, "step": 13510 }, { "epoch": 32.0, "grad_norm": 0.021137768402695656, "learning_rate": 3.0678639776240246e-05, "loss": 0.0407, "step": 13520 }, { "epoch": 32.0, "grad_norm": 0.027525225654244423, "learning_rate": 3.066023848078905e-05, "loss": 0.0626, "step": 13530 }, { "epoch": 32.0, "grad_norm": 32.222984313964844, "learning_rate": 3.064183718533785e-05, "loss": 0.1088, "step": 13540 }, { "epoch": 32.0, "grad_norm": 6.560800075531006, "learning_rate": 3.0623435889886646e-05, "loss": 0.0797, "step": 13550 }, { "epoch": 32.0, "grad_norm": 18.688661575317383, "learning_rate": 3.060503459443545e-05, "loss": 0.0968, "step": 13560 }, { "epoch": 32.0, "grad_norm": 0.02211933769285679, "learning_rate": 3.058663329898425e-05, "loss": 0.1444, "step": 13570 }, { "epoch": 32.0, "grad_norm": 0.02764921449124813, "learning_rate": 3.0568232003533046e-05, "loss": 0.0071, "step": 13580 }, { "epoch": 32.0, "grad_norm": 0.03323595970869064, "learning_rate": 3.054983070808185e-05, "loss": 0.0033, "step": 13590 }, { "epoch": 32.01, "grad_norm": 21.21465301513672, "learning_rate": 3.053142941263065e-05, "loss": 0.0496, "step": 13600 }, { "epoch": 32.01, "grad_norm": 4.1312432289123535, "learning_rate": 3.051302811717945e-05, "loss": 0.0679, "step": 13610 }, { "epoch": 32.01, "grad_norm": 0.027629682794213295, "learning_rate": 3.049462682172825e-05, "loss": 0.0377, "step": 13620 }, { "epoch": 32.01, "grad_norm": 0.06411249935626984, "learning_rate": 3.0476225526277054e-05, "loss": 0.0027, "step": 13630 }, { "epoch": 32.01, "grad_norm": 28.367021560668945, "learning_rate": 3.0457824230825854e-05, "loss": 0.1851, "step": 13640 }, { "epoch": 32.01, "grad_norm": 0.14021526277065277, "learning_rate": 3.043942293537465e-05, "loss": 0.0068, "step": 13650 }, { "epoch": 32.01, "grad_norm": 0.00497056171298027, "learning_rate": 3.0421021639923454e-05, "loss": 0.0096, "step": 13660 }, { "epoch": 32.01, "grad_norm": 0.09903844445943832, "learning_rate": 3.0402620344472254e-05, "loss": 0.0247, "step": 13670 }, { "epoch": 32.01, "grad_norm": 0.24359776079654694, "learning_rate": 3.038421904902105e-05, "loss": 0.035, "step": 13680 }, { "epoch": 32.01, "grad_norm": 17.976119995117188, "learning_rate": 3.0365817753569854e-05, "loss": 0.1399, "step": 13690 }, { "epoch": 32.01, "grad_norm": 0.016763564199209213, "learning_rate": 3.0347416458118654e-05, "loss": 0.0012, "step": 13700 }, { "epoch": 32.01, "grad_norm": 0.03081514686346054, "learning_rate": 3.032901516266745e-05, "loss": 0.0606, "step": 13710 }, { "epoch": 32.01, "grad_norm": 0.020975248888134956, "learning_rate": 3.0310613867216255e-05, "loss": 0.1219, "step": 13720 }, { "epoch": 32.01, "grad_norm": 0.08395440131425858, "learning_rate": 3.0292212571765055e-05, "loss": 0.1052, "step": 13730 }, { "epoch": 32.01, "grad_norm": 0.04376498609781265, "learning_rate": 3.027381127631385e-05, "loss": 0.1467, "step": 13740 }, { "epoch": 32.01, "grad_norm": 0.012876118533313274, "learning_rate": 3.0255409980862655e-05, "loss": 0.0342, "step": 13750 }, { "epoch": 32.01, "grad_norm": 0.1162232756614685, "learning_rate": 3.0237008685411455e-05, "loss": 0.1335, "step": 13760 }, { "epoch": 32.01, "grad_norm": 0.05354901775717735, "learning_rate": 3.021860738996026e-05, "loss": 0.1076, "step": 13770 }, { "epoch": 32.01, "grad_norm": 0.008597791194915771, "learning_rate": 3.0200206094509055e-05, "loss": 0.0528, "step": 13780 }, { "epoch": 32.01, "grad_norm": 0.40106403827667236, "learning_rate": 3.0181804799057855e-05, "loss": 0.0025, "step": 13790 }, { "epoch": 32.01, "grad_norm": 0.02502295933663845, "learning_rate": 3.016340350360666e-05, "loss": 0.1028, "step": 13800 }, { "epoch": 32.01, "grad_norm": 0.009747982025146484, "learning_rate": 3.0145002208155452e-05, "loss": 0.0218, "step": 13810 }, { "epoch": 32.01, "grad_norm": 0.27103859186172485, "learning_rate": 3.0126600912704256e-05, "loss": 0.0131, "step": 13820 }, { "epoch": 32.01, "grad_norm": 0.07821822911500931, "learning_rate": 3.010819961725306e-05, "loss": 0.0508, "step": 13830 }, { "epoch": 32.01, "grad_norm": 7.904542446136475, "learning_rate": 3.0089798321801853e-05, "loss": 0.0756, "step": 13840 }, { "epoch": 32.01, "grad_norm": 0.14019513130187988, "learning_rate": 3.0071397026350656e-05, "loss": 0.1796, "step": 13850 }, { "epoch": 32.01, "grad_norm": 0.00757236173376441, "learning_rate": 3.005299573089946e-05, "loss": 0.0268, "step": 13860 }, { "epoch": 32.01, "eval_accuracy": 0.7286036036036037, "eval_loss": 1.5766730308532715, "eval_runtime": 39.4733, "eval_samples_per_second": 22.496, "eval_steps_per_second": 1.875, "step": 13860 }, { "epoch": 33.0, "grad_norm": 0.5436367392539978, "learning_rate": 3.0034594435448253e-05, "loss": 0.052, "step": 13870 }, { "epoch": 33.0, "grad_norm": 0.02723160944879055, "learning_rate": 3.0016193139997057e-05, "loss": 0.0698, "step": 13880 }, { "epoch": 33.0, "grad_norm": 0.005910804029554129, "learning_rate": 2.999779184454586e-05, "loss": 0.0185, "step": 13890 }, { "epoch": 33.0, "grad_norm": 5.659780025482178, "learning_rate": 2.997939054909466e-05, "loss": 0.1311, "step": 13900 }, { "epoch": 33.0, "grad_norm": 9.268265724182129, "learning_rate": 2.9960989253643457e-05, "loss": 0.089, "step": 13910 }, { "epoch": 33.0, "grad_norm": 0.08459486067295074, "learning_rate": 2.9942587958192257e-05, "loss": 0.0104, "step": 13920 }, { "epoch": 33.0, "grad_norm": 0.17442312836647034, "learning_rate": 2.992418666274106e-05, "loss": 0.0581, "step": 13930 }, { "epoch": 33.0, "grad_norm": 0.04593927040696144, "learning_rate": 2.9905785367289857e-05, "loss": 0.0009, "step": 13940 }, { "epoch": 33.0, "grad_norm": 0.023078270256519318, "learning_rate": 2.9887384071838657e-05, "loss": 0.0013, "step": 13950 }, { "epoch": 33.0, "grad_norm": 0.07796052098274231, "learning_rate": 2.986898277638746e-05, "loss": 0.0006, "step": 13960 }, { "epoch": 33.0, "grad_norm": 0.8386691212654114, "learning_rate": 2.9850581480936258e-05, "loss": 0.0464, "step": 13970 }, { "epoch": 33.0, "grad_norm": 0.004704775754362345, "learning_rate": 2.9832180185485058e-05, "loss": 0.0561, "step": 13980 }, { "epoch": 33.0, "grad_norm": 16.554256439208984, "learning_rate": 2.981377889003386e-05, "loss": 0.0981, "step": 13990 }, { "epoch": 33.0, "grad_norm": 0.007485406938940287, "learning_rate": 2.9795377594582658e-05, "loss": 0.0493, "step": 14000 }, { "epoch": 33.0, "grad_norm": 0.03169386461377144, "learning_rate": 2.9776976299131458e-05, "loss": 0.0035, "step": 14010 }, { "epoch": 33.01, "grad_norm": 0.021181074902415276, "learning_rate": 2.975857500368026e-05, "loss": 0.0241, "step": 14020 }, { "epoch": 33.01, "grad_norm": 0.13125194609165192, "learning_rate": 2.974017370822906e-05, "loss": 0.0269, "step": 14030 }, { "epoch": 33.01, "grad_norm": 60.82809829711914, "learning_rate": 2.972177241277786e-05, "loss": 0.1368, "step": 14040 }, { "epoch": 33.01, "grad_norm": 1.089142084121704, "learning_rate": 2.9703371117326662e-05, "loss": 0.0073, "step": 14050 }, { "epoch": 33.01, "grad_norm": 13.938457489013672, "learning_rate": 2.9684969821875462e-05, "loss": 0.0849, "step": 14060 }, { "epoch": 33.01, "grad_norm": 0.70406574010849, "learning_rate": 2.966656852642426e-05, "loss": 0.1026, "step": 14070 }, { "epoch": 33.01, "grad_norm": 0.01301854383200407, "learning_rate": 2.9648167230973062e-05, "loss": 0.0848, "step": 14080 }, { "epoch": 33.01, "grad_norm": 0.006891491822898388, "learning_rate": 2.9629765935521862e-05, "loss": 0.0399, "step": 14090 }, { "epoch": 33.01, "grad_norm": 0.010800345800817013, "learning_rate": 2.961136464007066e-05, "loss": 0.0005, "step": 14100 }, { "epoch": 33.01, "grad_norm": 0.006483331322669983, "learning_rate": 2.9592963344619463e-05, "loss": 0.1803, "step": 14110 }, { "epoch": 33.01, "grad_norm": 0.0052506220526993275, "learning_rate": 2.9574562049168263e-05, "loss": 0.0681, "step": 14120 }, { "epoch": 33.01, "grad_norm": 6.714112758636475, "learning_rate": 2.955616075371706e-05, "loss": 0.1035, "step": 14130 }, { "epoch": 33.01, "grad_norm": 6.365170955657959, "learning_rate": 2.9537759458265863e-05, "loss": 0.1068, "step": 14140 }, { "epoch": 33.01, "grad_norm": 0.1516963392496109, "learning_rate": 2.9519358162814663e-05, "loss": 0.0267, "step": 14150 }, { "epoch": 33.01, "grad_norm": 0.44562458992004395, "learning_rate": 2.9500956867363467e-05, "loss": 0.0887, "step": 14160 }, { "epoch": 33.01, "grad_norm": 0.04399920254945755, "learning_rate": 2.9482555571912263e-05, "loss": 0.0365, "step": 14170 }, { "epoch": 33.01, "grad_norm": 0.004122023470699787, "learning_rate": 2.9464154276461063e-05, "loss": 0.0552, "step": 14180 }, { "epoch": 33.01, "grad_norm": 38.76673126220703, "learning_rate": 2.9445752981009867e-05, "loss": 0.0167, "step": 14190 }, { "epoch": 33.01, "grad_norm": 16.6954288482666, "learning_rate": 2.9427351685558664e-05, "loss": 0.1445, "step": 14200 }, { "epoch": 33.01, "grad_norm": 0.03708453103899956, "learning_rate": 2.9408950390107464e-05, "loss": 0.0057, "step": 14210 }, { "epoch": 33.01, "grad_norm": 0.042344264686107635, "learning_rate": 2.9390549094656267e-05, "loss": 0.0747, "step": 14220 }, { "epoch": 33.01, "grad_norm": 31.488584518432617, "learning_rate": 2.9372147799205064e-05, "loss": 0.1425, "step": 14230 }, { "epoch": 33.01, "grad_norm": 81.25188446044922, "learning_rate": 2.9353746503753864e-05, "loss": 0.0618, "step": 14240 }, { "epoch": 33.01, "grad_norm": 0.0070849936455488205, "learning_rate": 2.9335345208302668e-05, "loss": 0.07, "step": 14250 }, { "epoch": 33.01, "grad_norm": 0.09240752458572388, "learning_rate": 2.9316943912851468e-05, "loss": 0.0872, "step": 14260 }, { "epoch": 33.01, "grad_norm": 0.10065799206495285, "learning_rate": 2.9298542617400264e-05, "loss": 0.0012, "step": 14270 }, { "epoch": 33.01, "grad_norm": 0.11566521972417831, "learning_rate": 2.9280141321949068e-05, "loss": 0.0487, "step": 14280 }, { "epoch": 33.01, "eval_accuracy": 0.6948198198198198, "eval_loss": 1.64386785030365, "eval_runtime": 40.8874, "eval_samples_per_second": 21.718, "eval_steps_per_second": 1.81, "step": 14280 }, { "epoch": 34.0, "grad_norm": 0.012225592508912086, "learning_rate": 2.9261740026497868e-05, "loss": 0.002, "step": 14290 }, { "epoch": 34.0, "grad_norm": 0.0045622168108820915, "learning_rate": 2.9243338731046665e-05, "loss": 0.0388, "step": 14300 }, { "epoch": 34.0, "grad_norm": 5.046571731567383, "learning_rate": 2.9224937435595468e-05, "loss": 0.0706, "step": 14310 }, { "epoch": 34.0, "grad_norm": 9.082304000854492, "learning_rate": 2.920653614014427e-05, "loss": 0.018, "step": 14320 }, { "epoch": 34.0, "grad_norm": 18.434141159057617, "learning_rate": 2.9188134844693065e-05, "loss": 0.0919, "step": 14330 }, { "epoch": 34.0, "grad_norm": 21.874008178710938, "learning_rate": 2.916973354924187e-05, "loss": 0.0768, "step": 14340 }, { "epoch": 34.0, "grad_norm": 0.020802170038223267, "learning_rate": 2.915133225379067e-05, "loss": 0.0008, "step": 14350 }, { "epoch": 34.0, "grad_norm": 0.00489756790921092, "learning_rate": 2.9132930958339465e-05, "loss": 0.0451, "step": 14360 }, { "epoch": 34.0, "grad_norm": 0.21378767490386963, "learning_rate": 2.911452966288827e-05, "loss": 0.0487, "step": 14370 }, { "epoch": 34.0, "grad_norm": 0.009158837608993053, "learning_rate": 2.909612836743707e-05, "loss": 0.0993, "step": 14380 }, { "epoch": 34.0, "grad_norm": 0.006322511006146669, "learning_rate": 2.9077727071985873e-05, "loss": 0.0008, "step": 14390 }, { "epoch": 34.0, "grad_norm": 0.017560819163918495, "learning_rate": 2.905932577653467e-05, "loss": 0.0038, "step": 14400 }, { "epoch": 34.0, "grad_norm": 0.01258725207298994, "learning_rate": 2.904092448108347e-05, "loss": 0.0009, "step": 14410 }, { "epoch": 34.0, "grad_norm": 0.04801618680357933, "learning_rate": 2.9022523185632273e-05, "loss": 0.0177, "step": 14420 }, { "epoch": 34.0, "grad_norm": 0.12019483745098114, "learning_rate": 2.900412189018107e-05, "loss": 0.0011, "step": 14430 }, { "epoch": 34.01, "grad_norm": 0.029142582789063454, "learning_rate": 2.898572059472987e-05, "loss": 0.0014, "step": 14440 }, { "epoch": 34.01, "grad_norm": 0.02893805503845215, "learning_rate": 2.8967319299278673e-05, "loss": 0.0474, "step": 14450 }, { "epoch": 34.01, "grad_norm": 0.022320715710520744, "learning_rate": 2.894891800382747e-05, "loss": 0.0351, "step": 14460 }, { "epoch": 34.01, "grad_norm": 0.006042890250682831, "learning_rate": 2.893051670837627e-05, "loss": 0.0643, "step": 14470 }, { "epoch": 34.01, "grad_norm": 0.5463034510612488, "learning_rate": 2.8912115412925074e-05, "loss": 0.1444, "step": 14480 }, { "epoch": 34.01, "grad_norm": 0.030179716646671295, "learning_rate": 2.889371411747387e-05, "loss": 0.0242, "step": 14490 }, { "epoch": 34.01, "grad_norm": 0.007446791976690292, "learning_rate": 2.887531282202267e-05, "loss": 0.0257, "step": 14500 }, { "epoch": 34.01, "grad_norm": 1.6489183902740479, "learning_rate": 2.8856911526571474e-05, "loss": 0.0958, "step": 14510 }, { "epoch": 34.01, "grad_norm": 0.004721594974398613, "learning_rate": 2.8838510231120274e-05, "loss": 0.0324, "step": 14520 }, { "epoch": 34.01, "grad_norm": 0.061093464493751526, "learning_rate": 2.882010893566907e-05, "loss": 0.0235, "step": 14530 }, { "epoch": 34.01, "grad_norm": 0.0076067266054451466, "learning_rate": 2.8801707640217874e-05, "loss": 0.072, "step": 14540 }, { "epoch": 34.01, "grad_norm": 0.032138094305992126, "learning_rate": 2.8783306344766674e-05, "loss": 0.0008, "step": 14550 }, { "epoch": 34.01, "grad_norm": 0.07343176752328873, "learning_rate": 2.876490504931547e-05, "loss": 0.0391, "step": 14560 }, { "epoch": 34.01, "grad_norm": 0.003957556094974279, "learning_rate": 2.8746503753864275e-05, "loss": 0.0057, "step": 14570 }, { "epoch": 34.01, "grad_norm": 4.1355366706848145, "learning_rate": 2.8728102458413075e-05, "loss": 0.0859, "step": 14580 }, { "epoch": 34.01, "grad_norm": 0.026744043454527855, "learning_rate": 2.870970116296187e-05, "loss": 0.0972, "step": 14590 }, { "epoch": 34.01, "grad_norm": 0.02687433548271656, "learning_rate": 2.8691299867510675e-05, "loss": 0.0012, "step": 14600 }, { "epoch": 34.01, "grad_norm": 1.3930257558822632, "learning_rate": 2.8672898572059475e-05, "loss": 0.1244, "step": 14610 }, { "epoch": 34.01, "grad_norm": 0.3148985505104065, "learning_rate": 2.8654497276608272e-05, "loss": 0.1685, "step": 14620 }, { "epoch": 34.01, "grad_norm": 0.009437570348381996, "learning_rate": 2.8636095981157075e-05, "loss": 0.0461, "step": 14630 }, { "epoch": 34.01, "grad_norm": 36.728065490722656, "learning_rate": 2.8617694685705875e-05, "loss": 0.0151, "step": 14640 }, { "epoch": 34.01, "grad_norm": 0.02354242280125618, "learning_rate": 2.859929339025468e-05, "loss": 0.0082, "step": 14650 }, { "epoch": 34.01, "grad_norm": 0.450647234916687, "learning_rate": 2.8580892094803476e-05, "loss": 0.0035, "step": 14660 }, { "epoch": 34.01, "grad_norm": 0.03615148365497589, "learning_rate": 2.8562490799352276e-05, "loss": 0.0476, "step": 14670 }, { "epoch": 34.01, "grad_norm": 0.051935531198978424, "learning_rate": 2.854408950390108e-05, "loss": 0.001, "step": 14680 }, { "epoch": 34.01, "grad_norm": 0.05409989133477211, "learning_rate": 2.8525688208449876e-05, "loss": 0.0746, "step": 14690 }, { "epoch": 34.01, "grad_norm": 0.02356979064643383, "learning_rate": 2.8507286912998676e-05, "loss": 0.0448, "step": 14700 }, { "epoch": 34.01, "eval_accuracy": 0.7353603603603603, "eval_loss": 1.5989632606506348, "eval_runtime": 40.4825, "eval_samples_per_second": 21.935, "eval_steps_per_second": 1.828, "step": 14700 }, { "epoch": 35.0, "grad_norm": 0.008124900050461292, "learning_rate": 2.848888561754748e-05, "loss": 0.049, "step": 14710 }, { "epoch": 35.0, "grad_norm": 0.017850523814558983, "learning_rate": 2.8470484322096276e-05, "loss": 0.0283, "step": 14720 }, { "epoch": 35.0, "grad_norm": 0.013829112984240055, "learning_rate": 2.8452083026645077e-05, "loss": 0.0005, "step": 14730 }, { "epoch": 35.0, "grad_norm": 0.21093538403511047, "learning_rate": 2.843368173119388e-05, "loss": 0.0046, "step": 14740 }, { "epoch": 35.0, "grad_norm": 30.419967651367188, "learning_rate": 2.8415280435742673e-05, "loss": 0.0616, "step": 14750 }, { "epoch": 35.0, "grad_norm": 5.482494354248047, "learning_rate": 2.8396879140291477e-05, "loss": 0.196, "step": 14760 }, { "epoch": 35.0, "grad_norm": 2.091125726699829, "learning_rate": 2.837847784484028e-05, "loss": 0.001, "step": 14770 }, { "epoch": 35.0, "grad_norm": 0.0029685271438211203, "learning_rate": 2.836007654938908e-05, "loss": 0.0625, "step": 14780 }, { "epoch": 35.0, "grad_norm": 0.013809144496917725, "learning_rate": 2.8341675253937877e-05, "loss": 0.0004, "step": 14790 }, { "epoch": 35.0, "grad_norm": 0.007634790614247322, "learning_rate": 2.832327395848668e-05, "loss": 0.0817, "step": 14800 }, { "epoch": 35.0, "grad_norm": 4.974092960357666, "learning_rate": 2.830487266303548e-05, "loss": 0.0542, "step": 14810 }, { "epoch": 35.0, "grad_norm": 0.005965742748230696, "learning_rate": 2.8286471367584278e-05, "loss": 0.0003, "step": 14820 }, { "epoch": 35.0, "grad_norm": 0.0047448077239096165, "learning_rate": 2.8268070072133078e-05, "loss": 0.0247, "step": 14830 }, { "epoch": 35.0, "grad_norm": 0.0023388988338410854, "learning_rate": 2.824966877668188e-05, "loss": 0.0048, "step": 14840 }, { "epoch": 35.0, "grad_norm": 0.0013745896285399795, "learning_rate": 2.8231267481230678e-05, "loss": 0.0077, "step": 14850 }, { "epoch": 35.01, "grad_norm": 6.498810768127441, "learning_rate": 2.8212866185779478e-05, "loss": 0.0423, "step": 14860 }, { "epoch": 35.01, "grad_norm": 0.02234739437699318, "learning_rate": 2.819446489032828e-05, "loss": 0.0649, "step": 14870 }, { "epoch": 35.01, "grad_norm": 1.1391817331314087, "learning_rate": 2.8176063594877085e-05, "loss": 0.0073, "step": 14880 }, { "epoch": 35.01, "grad_norm": 0.019465837627649307, "learning_rate": 2.815766229942588e-05, "loss": 0.0364, "step": 14890 }, { "epoch": 35.01, "grad_norm": 0.005256633274257183, "learning_rate": 2.8139261003974682e-05, "loss": 0.1411, "step": 14900 }, { "epoch": 35.01, "grad_norm": 34.344825744628906, "learning_rate": 2.8120859708523485e-05, "loss": 0.007, "step": 14910 }, { "epoch": 35.01, "grad_norm": 0.020804625004529953, "learning_rate": 2.810245841307228e-05, "loss": 0.0598, "step": 14920 }, { "epoch": 35.01, "grad_norm": 0.02393057756125927, "learning_rate": 2.8084057117621082e-05, "loss": 0.1846, "step": 14930 }, { "epoch": 35.01, "grad_norm": 27.570369720458984, "learning_rate": 2.8065655822169882e-05, "loss": 0.1569, "step": 14940 }, { "epoch": 35.01, "grad_norm": 31.2330322265625, "learning_rate": 2.804725452671868e-05, "loss": 0.0561, "step": 14950 }, { "epoch": 35.01, "grad_norm": 0.04531894251704216, "learning_rate": 2.8028853231267483e-05, "loss": 0.1036, "step": 14960 }, { "epoch": 35.01, "grad_norm": 0.06247282400727272, "learning_rate": 2.8010451935816283e-05, "loss": 0.123, "step": 14970 }, { "epoch": 35.01, "grad_norm": 0.027552228420972824, "learning_rate": 2.799205064036508e-05, "loss": 0.0291, "step": 14980 }, { "epoch": 35.01, "grad_norm": 0.035376742482185364, "learning_rate": 2.7973649344913883e-05, "loss": 0.0458, "step": 14990 }, { "epoch": 35.01, "grad_norm": 0.054985884577035904, "learning_rate": 2.7955248049462683e-05, "loss": 0.0864, "step": 15000 }, { "epoch": 35.01, "grad_norm": 0.0469009093940258, "learning_rate": 2.7936846754011487e-05, "loss": 0.0442, "step": 15010 }, { "epoch": 35.01, "grad_norm": 0.2176922708749771, "learning_rate": 2.7918445458560283e-05, "loss": 0.0718, "step": 15020 }, { "epoch": 35.01, "grad_norm": 6.875114917755127, "learning_rate": 2.7900044163109083e-05, "loss": 0.0042, "step": 15030 }, { "epoch": 35.01, "grad_norm": 59.076351165771484, "learning_rate": 2.7881642867657887e-05, "loss": 0.0111, "step": 15040 }, { "epoch": 35.01, "grad_norm": 0.07590507715940475, "learning_rate": 2.7863241572206684e-05, "loss": 0.0776, "step": 15050 }, { "epoch": 35.01, "grad_norm": 39.765769958496094, "learning_rate": 2.7844840276755484e-05, "loss": 0.0745, "step": 15060 }, { "epoch": 35.01, "grad_norm": 8.698269844055176, "learning_rate": 2.7826438981304287e-05, "loss": 0.1075, "step": 15070 }, { "epoch": 35.01, "grad_norm": 0.08205238729715347, "learning_rate": 2.7808037685853084e-05, "loss": 0.0927, "step": 15080 }, { "epoch": 35.01, "grad_norm": 0.0548090860247612, "learning_rate": 2.7789636390401884e-05, "loss": 0.0759, "step": 15090 }, { "epoch": 35.01, "grad_norm": 0.2036578506231308, "learning_rate": 2.7771235094950688e-05, "loss": 0.0013, "step": 15100 }, { "epoch": 35.01, "grad_norm": 0.007210288662463427, "learning_rate": 2.7752833799499484e-05, "loss": 0.0409, "step": 15110 }, { "epoch": 35.01, "grad_norm": 0.8128947615623474, "learning_rate": 2.7734432504048284e-05, "loss": 0.0166, "step": 15120 }, { "epoch": 35.01, "eval_accuracy": 0.7466216216216216, "eval_loss": 1.3866125345230103, "eval_runtime": 40.3674, "eval_samples_per_second": 21.998, "eval_steps_per_second": 1.833, "step": 15120 }, { "epoch": 36.0, "grad_norm": 5.199777126312256, "learning_rate": 2.7716031208597088e-05, "loss": 0.0392, "step": 15130 }, { "epoch": 36.0, "grad_norm": 0.014085162431001663, "learning_rate": 2.7697629913145888e-05, "loss": 0.0007, "step": 15140 }, { "epoch": 36.0, "grad_norm": 0.006830631755292416, "learning_rate": 2.7679228617694685e-05, "loss": 0.0252, "step": 15150 }, { "epoch": 36.0, "grad_norm": 0.01971651241183281, "learning_rate": 2.7660827322243488e-05, "loss": 0.0514, "step": 15160 }, { "epoch": 36.0, "grad_norm": 0.06991241872310638, "learning_rate": 2.764242602679229e-05, "loss": 0.0034, "step": 15170 }, { "epoch": 36.0, "grad_norm": 6.701659202575684, "learning_rate": 2.7624024731341085e-05, "loss": 0.0145, "step": 15180 }, { "epoch": 36.0, "grad_norm": 0.02505069226026535, "learning_rate": 2.760562343588989e-05, "loss": 0.004, "step": 15190 }, { "epoch": 36.0, "grad_norm": 0.0011081969132646918, "learning_rate": 2.758722214043869e-05, "loss": 0.0478, "step": 15200 }, { "epoch": 36.0, "grad_norm": 0.22539083659648895, "learning_rate": 2.7568820844987485e-05, "loss": 0.001, "step": 15210 }, { "epoch": 36.0, "grad_norm": 45.33222198486328, "learning_rate": 2.755041954953629e-05, "loss": 0.0577, "step": 15220 }, { "epoch": 36.0, "grad_norm": 22.034912109375, "learning_rate": 2.753201825408509e-05, "loss": 0.0064, "step": 15230 }, { "epoch": 36.0, "grad_norm": 0.002172585343942046, "learning_rate": 2.7513616958633886e-05, "loss": 0.0289, "step": 15240 }, { "epoch": 36.0, "grad_norm": 0.3061286211013794, "learning_rate": 2.749521566318269e-05, "loss": 0.0004, "step": 15250 }, { "epoch": 36.0, "grad_norm": 0.004251683130860329, "learning_rate": 2.747681436773149e-05, "loss": 0.146, "step": 15260 }, { "epoch": 36.0, "grad_norm": 0.0011500741820782423, "learning_rate": 2.7458413072280293e-05, "loss": 0.0003, "step": 15270 }, { "epoch": 36.01, "grad_norm": 0.009844346903264523, "learning_rate": 2.744001177682909e-05, "loss": 0.0534, "step": 15280 }, { "epoch": 36.01, "grad_norm": 0.0010252447100356221, "learning_rate": 2.742161048137789e-05, "loss": 0.178, "step": 15290 }, { "epoch": 36.01, "grad_norm": 0.0019560528453439474, "learning_rate": 2.7403209185926693e-05, "loss": 0.0295, "step": 15300 }, { "epoch": 36.01, "grad_norm": 42.15266036987305, "learning_rate": 2.738480789047549e-05, "loss": 0.0355, "step": 15310 }, { "epoch": 36.01, "grad_norm": 0.0029344751965254545, "learning_rate": 2.736640659502429e-05, "loss": 0.0258, "step": 15320 }, { "epoch": 36.01, "grad_norm": 0.007348980288952589, "learning_rate": 2.7348005299573094e-05, "loss": 0.0184, "step": 15330 }, { "epoch": 36.01, "grad_norm": 0.003109491430222988, "learning_rate": 2.732960400412189e-05, "loss": 0.0027, "step": 15340 }, { "epoch": 36.01, "grad_norm": 2.0378973484039307, "learning_rate": 2.731120270867069e-05, "loss": 0.0007, "step": 15350 }, { "epoch": 36.01, "grad_norm": 0.07484769821166992, "learning_rate": 2.7292801413219494e-05, "loss": 0.0012, "step": 15360 }, { "epoch": 36.01, "grad_norm": 0.5420640110969543, "learning_rate": 2.727440011776829e-05, "loss": 0.044, "step": 15370 }, { "epoch": 36.01, "grad_norm": 0.0010783092584460974, "learning_rate": 2.725599882231709e-05, "loss": 0.0591, "step": 15380 }, { "epoch": 36.01, "grad_norm": 0.005851478781551123, "learning_rate": 2.7237597526865894e-05, "loss": 0.0002, "step": 15390 }, { "epoch": 36.01, "grad_norm": 24.51445960998535, "learning_rate": 2.7219196231414694e-05, "loss": 0.0758, "step": 15400 }, { "epoch": 36.01, "grad_norm": 59.16766357421875, "learning_rate": 2.720079493596349e-05, "loss": 0.0366, "step": 15410 }, { "epoch": 36.01, "grad_norm": 0.007826605811715126, "learning_rate": 2.7182393640512295e-05, "loss": 0.0034, "step": 15420 }, { "epoch": 36.01, "grad_norm": 0.11950548738241196, "learning_rate": 2.7163992345061095e-05, "loss": 0.1212, "step": 15430 }, { "epoch": 36.01, "grad_norm": 0.668392539024353, "learning_rate": 2.714559104960989e-05, "loss": 0.0063, "step": 15440 }, { "epoch": 36.01, "grad_norm": 49.121849060058594, "learning_rate": 2.7127189754158695e-05, "loss": 0.0223, "step": 15450 }, { "epoch": 36.01, "grad_norm": 0.2821270227432251, "learning_rate": 2.7108788458707495e-05, "loss": 0.0005, "step": 15460 }, { "epoch": 36.01, "grad_norm": 0.003207216504961252, "learning_rate": 2.7090387163256292e-05, "loss": 0.1271, "step": 15470 }, { "epoch": 36.01, "grad_norm": 0.23750409483909607, "learning_rate": 2.7071985867805095e-05, "loss": 0.0904, "step": 15480 }, { "epoch": 36.01, "grad_norm": 10.629240036010742, "learning_rate": 2.7053584572353895e-05, "loss": 0.0721, "step": 15490 }, { "epoch": 36.01, "grad_norm": 0.029718786478042603, "learning_rate": 2.7035183276902692e-05, "loss": 0.0314, "step": 15500 }, { "epoch": 36.01, "grad_norm": 0.00921417772769928, "learning_rate": 2.7016781981451496e-05, "loss": 0.0583, "step": 15510 }, { "epoch": 36.01, "grad_norm": 0.024288874119520187, "learning_rate": 2.6998380686000296e-05, "loss": 0.0412, "step": 15520 }, { "epoch": 36.01, "grad_norm": 0.01890076883137226, "learning_rate": 2.69799793905491e-05, "loss": 0.0013, "step": 15530 }, { "epoch": 36.01, "grad_norm": 0.002824255032464862, "learning_rate": 2.6961578095097896e-05, "loss": 0.1029, "step": 15540 }, { "epoch": 36.01, "eval_accuracy": 0.7105855855855856, "eval_loss": 1.7426668405532837, "eval_runtime": 39.0783, "eval_samples_per_second": 22.724, "eval_steps_per_second": 1.894, "step": 15540 }, { "epoch": 37.0, "grad_norm": 0.017560964450240135, "learning_rate": 2.6943176799646696e-05, "loss": 0.0951, "step": 15550 }, { "epoch": 37.0, "grad_norm": 0.04531846195459366, "learning_rate": 2.69247755041955e-05, "loss": 0.0723, "step": 15560 }, { "epoch": 37.0, "grad_norm": 1.8820828199386597, "learning_rate": 2.6906374208744296e-05, "loss": 0.0335, "step": 15570 }, { "epoch": 37.0, "grad_norm": 0.0036666542291641235, "learning_rate": 2.6887972913293096e-05, "loss": 0.0375, "step": 15580 }, { "epoch": 37.0, "grad_norm": 0.002468695631250739, "learning_rate": 2.68695716178419e-05, "loss": 0.001, "step": 15590 }, { "epoch": 37.0, "grad_norm": 0.207871675491333, "learning_rate": 2.6851170322390697e-05, "loss": 0.1084, "step": 15600 }, { "epoch": 37.0, "grad_norm": 0.026301635429263115, "learning_rate": 2.6832769026939497e-05, "loss": 0.0007, "step": 15610 }, { "epoch": 37.0, "grad_norm": 0.005130626726895571, "learning_rate": 2.68143677314883e-05, "loss": 0.1154, "step": 15620 }, { "epoch": 37.0, "grad_norm": 0.1096784695982933, "learning_rate": 2.67959664360371e-05, "loss": 0.112, "step": 15630 }, { "epoch": 37.0, "grad_norm": 0.00986840482801199, "learning_rate": 2.6777565140585897e-05, "loss": 0.0784, "step": 15640 }, { "epoch": 37.0, "grad_norm": 0.677042543888092, "learning_rate": 2.67591638451347e-05, "loss": 0.0483, "step": 15650 }, { "epoch": 37.0, "grad_norm": 0.00216664164327085, "learning_rate": 2.67407625496835e-05, "loss": 0.0014, "step": 15660 }, { "epoch": 37.0, "grad_norm": 0.06850877404212952, "learning_rate": 2.6722361254232298e-05, "loss": 0.0023, "step": 15670 }, { "epoch": 37.0, "grad_norm": 0.0250447578728199, "learning_rate": 2.67039599587811e-05, "loss": 0.0511, "step": 15680 }, { "epoch": 37.0, "grad_norm": 0.02025407738983631, "learning_rate": 2.66855586633299e-05, "loss": 0.0628, "step": 15690 }, { "epoch": 37.01, "grad_norm": 0.00478848721832037, "learning_rate": 2.6667157367878698e-05, "loss": 0.0897, "step": 15700 }, { "epoch": 37.01, "grad_norm": 0.13232554495334625, "learning_rate": 2.66487560724275e-05, "loss": 0.0195, "step": 15710 }, { "epoch": 37.01, "grad_norm": 0.03150768205523491, "learning_rate": 2.66303547769763e-05, "loss": 0.0366, "step": 15720 }, { "epoch": 37.01, "grad_norm": 0.05836179479956627, "learning_rate": 2.6611953481525098e-05, "loss": 0.0235, "step": 15730 }, { "epoch": 37.01, "grad_norm": 0.003542052349075675, "learning_rate": 2.6593552186073902e-05, "loss": 0.1149, "step": 15740 }, { "epoch": 37.01, "grad_norm": 0.0033408894669264555, "learning_rate": 2.6575150890622702e-05, "loss": 0.0015, "step": 15750 }, { "epoch": 37.01, "grad_norm": 0.00453279260545969, "learning_rate": 2.6556749595171505e-05, "loss": 0.001, "step": 15760 }, { "epoch": 37.01, "grad_norm": 0.22416509687900543, "learning_rate": 2.65383482997203e-05, "loss": 0.0005, "step": 15770 }, { "epoch": 37.01, "grad_norm": 0.07529207319021225, "learning_rate": 2.6519947004269102e-05, "loss": 0.0583, "step": 15780 }, { "epoch": 37.01, "grad_norm": 90.49588012695312, "learning_rate": 2.6501545708817906e-05, "loss": 0.0168, "step": 15790 }, { "epoch": 37.01, "grad_norm": 0.5191478133201599, "learning_rate": 2.64831444133667e-05, "loss": 0.0957, "step": 15800 }, { "epoch": 37.01, "grad_norm": 0.003834774950519204, "learning_rate": 2.6464743117915503e-05, "loss": 0.0245, "step": 15810 }, { "epoch": 37.01, "grad_norm": 0.020836833864450455, "learning_rate": 2.6446341822464306e-05, "loss": 0.0311, "step": 15820 }, { "epoch": 37.01, "grad_norm": 31.80251693725586, "learning_rate": 2.64279405270131e-05, "loss": 0.1184, "step": 15830 }, { "epoch": 37.01, "grad_norm": 0.004135098308324814, "learning_rate": 2.6409539231561903e-05, "loss": 0.146, "step": 15840 }, { "epoch": 37.01, "grad_norm": 0.04643867164850235, "learning_rate": 2.6391137936110706e-05, "loss": 0.0443, "step": 15850 }, { "epoch": 37.01, "grad_norm": 1.761141300201416, "learning_rate": 2.63727366406595e-05, "loss": 0.1349, "step": 15860 }, { "epoch": 37.01, "grad_norm": 0.007804957218468189, "learning_rate": 2.6354335345208303e-05, "loss": 0.0093, "step": 15870 }, { "epoch": 37.01, "grad_norm": 0.010941618122160435, "learning_rate": 2.6335934049757103e-05, "loss": 0.0335, "step": 15880 }, { "epoch": 37.01, "grad_norm": 0.005491418763995171, "learning_rate": 2.6317532754305907e-05, "loss": 0.0461, "step": 15890 }, { "epoch": 37.01, "grad_norm": 0.017632165923714638, "learning_rate": 2.6299131458854704e-05, "loss": 0.0007, "step": 15900 }, { "epoch": 37.01, "grad_norm": 0.04650917276740074, "learning_rate": 2.6280730163403504e-05, "loss": 0.1037, "step": 15910 }, { "epoch": 37.01, "grad_norm": 17.576862335205078, "learning_rate": 2.6262328867952307e-05, "loss": 0.0725, "step": 15920 }, { "epoch": 37.01, "grad_norm": 0.09621303528547287, "learning_rate": 2.6243927572501104e-05, "loss": 0.0178, "step": 15930 }, { "epoch": 37.01, "grad_norm": 10.42127799987793, "learning_rate": 2.6225526277049904e-05, "loss": 0.2358, "step": 15940 }, { "epoch": 37.01, "grad_norm": 0.18157176673412323, "learning_rate": 2.6207124981598708e-05, "loss": 0.0535, "step": 15950 }, { "epoch": 37.01, "grad_norm": 0.05556317791342735, "learning_rate": 2.6188723686147504e-05, "loss": 0.0678, "step": 15960 }, { "epoch": 37.01, "eval_accuracy": 0.7364864864864865, "eval_loss": 1.419447422027588, "eval_runtime": 39.1526, "eval_samples_per_second": 22.68, "eval_steps_per_second": 1.89, "step": 15960 }, { "epoch": 38.0, "grad_norm": 3.7612640857696533, "learning_rate": 2.6170322390696304e-05, "loss": 0.0745, "step": 15970 }, { "epoch": 38.0, "grad_norm": 0.08191471546888351, "learning_rate": 2.6151921095245108e-05, "loss": 0.0784, "step": 15980 }, { "epoch": 38.0, "grad_norm": 0.015262553468346596, "learning_rate": 2.6133519799793905e-05, "loss": 0.0007, "step": 15990 }, { "epoch": 38.0, "grad_norm": 0.006925337016582489, "learning_rate": 2.6115118504342705e-05, "loss": 0.0034, "step": 16000 }, { "epoch": 38.0, "grad_norm": 0.012639672495424747, "learning_rate": 2.6096717208891508e-05, "loss": 0.0025, "step": 16010 }, { "epoch": 38.0, "grad_norm": 2.9875874519348145, "learning_rate": 2.607831591344031e-05, "loss": 0.0021, "step": 16020 }, { "epoch": 38.0, "grad_norm": 0.013625150546431541, "learning_rate": 2.6059914617989105e-05, "loss": 0.0067, "step": 16030 }, { "epoch": 38.0, "grad_norm": 5.456194877624512, "learning_rate": 2.604151332253791e-05, "loss": 0.0573, "step": 16040 }, { "epoch": 38.0, "grad_norm": 25.647403717041016, "learning_rate": 2.602311202708671e-05, "loss": 0.0657, "step": 16050 }, { "epoch": 38.0, "grad_norm": 0.0463079996407032, "learning_rate": 2.6004710731635505e-05, "loss": 0.0213, "step": 16060 }, { "epoch": 38.0, "grad_norm": 0.02664658986032009, "learning_rate": 2.598630943618431e-05, "loss": 0.0016, "step": 16070 }, { "epoch": 38.0, "grad_norm": 1.7586325407028198, "learning_rate": 2.596790814073311e-05, "loss": 0.0569, "step": 16080 }, { "epoch": 38.0, "grad_norm": 0.015104389749467373, "learning_rate": 2.5949506845281906e-05, "loss": 0.1198, "step": 16090 }, { "epoch": 38.0, "grad_norm": 0.009908678941428661, "learning_rate": 2.593110554983071e-05, "loss": 0.1008, "step": 16100 }, { "epoch": 38.0, "grad_norm": 23.33299446105957, "learning_rate": 2.591270425437951e-05, "loss": 0.0304, "step": 16110 }, { "epoch": 38.01, "grad_norm": 0.06382476538419724, "learning_rate": 2.5894302958928306e-05, "loss": 0.1119, "step": 16120 }, { "epoch": 38.01, "grad_norm": 0.05750289559364319, "learning_rate": 2.587590166347711e-05, "loss": 0.0004, "step": 16130 }, { "epoch": 38.01, "grad_norm": 0.06110772490501404, "learning_rate": 2.585750036802591e-05, "loss": 0.0007, "step": 16140 }, { "epoch": 38.01, "grad_norm": 0.0023573378566652536, "learning_rate": 2.5839099072574713e-05, "loss": 0.031, "step": 16150 }, { "epoch": 38.01, "grad_norm": 0.015254752710461617, "learning_rate": 2.582069777712351e-05, "loss": 0.0157, "step": 16160 }, { "epoch": 38.01, "grad_norm": 0.05136784166097641, "learning_rate": 2.580229648167231e-05, "loss": 0.0594, "step": 16170 }, { "epoch": 38.01, "grad_norm": 0.004741484299302101, "learning_rate": 2.5783895186221114e-05, "loss": 0.0053, "step": 16180 }, { "epoch": 38.01, "grad_norm": 0.027769001200795174, "learning_rate": 2.576549389076991e-05, "loss": 0.0471, "step": 16190 }, { "epoch": 38.01, "grad_norm": 0.0037545578088611364, "learning_rate": 2.574709259531871e-05, "loss": 0.0274, "step": 16200 }, { "epoch": 38.01, "grad_norm": 2.7802109718322754, "learning_rate": 2.5728691299867514e-05, "loss": 0.0447, "step": 16210 }, { "epoch": 38.01, "grad_norm": 0.006928629241883755, "learning_rate": 2.571029000441631e-05, "loss": 0.0005, "step": 16220 }, { "epoch": 38.01, "grad_norm": 0.029513835906982422, "learning_rate": 2.569188870896511e-05, "loss": 0.0168, "step": 16230 }, { "epoch": 38.01, "grad_norm": 0.002607174916192889, "learning_rate": 2.5673487413513914e-05, "loss": 0.0006, "step": 16240 }, { "epoch": 38.01, "grad_norm": 0.034030377864837646, "learning_rate": 2.5655086118062714e-05, "loss": 0.0126, "step": 16250 }, { "epoch": 38.01, "grad_norm": 2.631910562515259, "learning_rate": 2.563668482261151e-05, "loss": 0.019, "step": 16260 }, { "epoch": 38.01, "grad_norm": 0.061333347111940384, "learning_rate": 2.5618283527160315e-05, "loss": 0.0005, "step": 16270 }, { "epoch": 38.01, "grad_norm": 0.33661890029907227, "learning_rate": 2.5599882231709115e-05, "loss": 0.0435, "step": 16280 }, { "epoch": 38.01, "grad_norm": 0.032611507922410965, "learning_rate": 2.558148093625791e-05, "loss": 0.0224, "step": 16290 }, { "epoch": 38.01, "grad_norm": 0.0037321383133530617, "learning_rate": 2.5563079640806715e-05, "loss": 0.0165, "step": 16300 }, { "epoch": 38.01, "grad_norm": 124.54664611816406, "learning_rate": 2.5544678345355515e-05, "loss": 0.0152, "step": 16310 }, { "epoch": 38.01, "grad_norm": 0.03767447546124458, "learning_rate": 2.5526277049904312e-05, "loss": 0.0946, "step": 16320 }, { "epoch": 38.01, "grad_norm": 0.0054799229837954044, "learning_rate": 2.5507875754453115e-05, "loss": 0.1248, "step": 16330 }, { "epoch": 38.01, "grad_norm": 0.002836138242855668, "learning_rate": 2.5489474459001915e-05, "loss": 0.0002, "step": 16340 }, { "epoch": 38.01, "grad_norm": 4.855489730834961, "learning_rate": 2.5471073163550712e-05, "loss": 0.063, "step": 16350 }, { "epoch": 38.01, "grad_norm": 0.013726359233260155, "learning_rate": 2.5452671868099516e-05, "loss": 0.027, "step": 16360 }, { "epoch": 38.01, "grad_norm": 0.7091811299324036, "learning_rate": 2.5434270572648316e-05, "loss": 0.1472, "step": 16370 }, { "epoch": 38.01, "grad_norm": 0.004487201105803251, "learning_rate": 2.541586927719712e-05, "loss": 0.0007, "step": 16380 }, { "epoch": 38.01, "eval_accuracy": 0.7072072072072072, "eval_loss": 1.9136948585510254, "eval_runtime": 38.9937, "eval_samples_per_second": 22.773, "eval_steps_per_second": 1.898, "step": 16380 }, { "epoch": 39.0, "grad_norm": 9.85066032409668, "learning_rate": 2.5397467981745916e-05, "loss": 0.0349, "step": 16390 }, { "epoch": 39.0, "grad_norm": 0.1230677142739296, "learning_rate": 2.5379066686294716e-05, "loss": 0.0003, "step": 16400 }, { "epoch": 39.0, "grad_norm": 0.010495364665985107, "learning_rate": 2.536066539084352e-05, "loss": 0.0453, "step": 16410 }, { "epoch": 39.0, "grad_norm": 0.0022531235590577126, "learning_rate": 2.5342264095392316e-05, "loss": 0.0454, "step": 16420 }, { "epoch": 39.0, "grad_norm": 46.67967987060547, "learning_rate": 2.5323862799941116e-05, "loss": 0.1218, "step": 16430 }, { "epoch": 39.0, "grad_norm": 0.027552777901291847, "learning_rate": 2.530546150448992e-05, "loss": 0.0181, "step": 16440 }, { "epoch": 39.0, "grad_norm": 0.18322528898715973, "learning_rate": 2.5287060209038717e-05, "loss": 0.0005, "step": 16450 }, { "epoch": 39.0, "grad_norm": 1.810534119606018, "learning_rate": 2.5268658913587517e-05, "loss": 0.0027, "step": 16460 }, { "epoch": 39.0, "grad_norm": 0.05370220169425011, "learning_rate": 2.525025761813632e-05, "loss": 0.0217, "step": 16470 }, { "epoch": 39.0, "grad_norm": 0.008379046805202961, "learning_rate": 2.5231856322685117e-05, "loss": 0.0405, "step": 16480 }, { "epoch": 39.0, "grad_norm": 0.004804358817636967, "learning_rate": 2.5213455027233917e-05, "loss": 0.0448, "step": 16490 }, { "epoch": 39.0, "grad_norm": 0.009872007183730602, "learning_rate": 2.519505373178272e-05, "loss": 0.0058, "step": 16500 }, { "epoch": 39.0, "grad_norm": 0.0027543141040951014, "learning_rate": 2.517665243633152e-05, "loss": 0.0384, "step": 16510 }, { "epoch": 39.0, "grad_norm": 0.0008711799746379256, "learning_rate": 2.5158251140880317e-05, "loss": 0.0007, "step": 16520 }, { "epoch": 39.0, "grad_norm": 3.9537038803100586, "learning_rate": 2.513984984542912e-05, "loss": 0.0887, "step": 16530 }, { "epoch": 39.01, "grad_norm": 0.0022325078025460243, "learning_rate": 2.512144854997792e-05, "loss": 0.0005, "step": 16540 }, { "epoch": 39.01, "grad_norm": 0.014836194925010204, "learning_rate": 2.5103047254526718e-05, "loss": 0.0028, "step": 16550 }, { "epoch": 39.01, "grad_norm": 0.3858121335506439, "learning_rate": 2.508464595907552e-05, "loss": 0.0892, "step": 16560 }, { "epoch": 39.01, "grad_norm": 0.01734367199242115, "learning_rate": 2.506624466362432e-05, "loss": 0.0148, "step": 16570 }, { "epoch": 39.01, "grad_norm": 0.00928126834332943, "learning_rate": 2.5047843368173118e-05, "loss": 0.0977, "step": 16580 }, { "epoch": 39.01, "grad_norm": 0.005387528333812952, "learning_rate": 2.502944207272192e-05, "loss": 0.0325, "step": 16590 }, { "epoch": 39.01, "grad_norm": 11.179068565368652, "learning_rate": 2.5011040777270722e-05, "loss": 0.0623, "step": 16600 }, { "epoch": 39.01, "grad_norm": 0.06299767643213272, "learning_rate": 2.4992639481819522e-05, "loss": 0.0917, "step": 16610 }, { "epoch": 39.01, "grad_norm": 0.013811836019158363, "learning_rate": 2.4974238186368322e-05, "loss": 0.0981, "step": 16620 }, { "epoch": 39.01, "grad_norm": 0.037350479513406754, "learning_rate": 2.4955836890917122e-05, "loss": 0.0119, "step": 16630 }, { "epoch": 39.01, "grad_norm": 0.34366223216056824, "learning_rate": 2.4937435595465922e-05, "loss": 0.0972, "step": 16640 }, { "epoch": 39.01, "grad_norm": 0.2959058880805969, "learning_rate": 2.4919034300014722e-05, "loss": 0.0052, "step": 16650 }, { "epoch": 39.01, "grad_norm": 0.041184067726135254, "learning_rate": 2.4900633004563522e-05, "loss": 0.0447, "step": 16660 }, { "epoch": 39.01, "grad_norm": 0.00951891764998436, "learning_rate": 2.4882231709112323e-05, "loss": 0.007, "step": 16670 }, { "epoch": 39.01, "grad_norm": 0.020858224481344223, "learning_rate": 2.4863830413661123e-05, "loss": 0.0467, "step": 16680 }, { "epoch": 39.01, "grad_norm": 0.05100645124912262, "learning_rate": 2.4845429118209923e-05, "loss": 0.0731, "step": 16690 }, { "epoch": 39.01, "grad_norm": 0.005188668146729469, "learning_rate": 2.4827027822758723e-05, "loss": 0.0587, "step": 16700 }, { "epoch": 39.01, "grad_norm": 0.008620868436992168, "learning_rate": 2.4808626527307523e-05, "loss": 0.0281, "step": 16710 }, { "epoch": 39.01, "grad_norm": 0.014651943929493427, "learning_rate": 2.4790225231856323e-05, "loss": 0.0893, "step": 16720 }, { "epoch": 39.01, "grad_norm": 0.18780621886253357, "learning_rate": 2.4771823936405127e-05, "loss": 0.0099, "step": 16730 }, { "epoch": 39.01, "grad_norm": 0.015461204573512077, "learning_rate": 2.4753422640953923e-05, "loss": 0.0778, "step": 16740 }, { "epoch": 39.01, "grad_norm": 0.245300754904747, "learning_rate": 2.4735021345502724e-05, "loss": 0.0006, "step": 16750 }, { "epoch": 39.01, "grad_norm": 0.094014972448349, "learning_rate": 2.4716620050051527e-05, "loss": 0.0009, "step": 16760 }, { "epoch": 39.01, "grad_norm": 0.003981316927820444, "learning_rate": 2.4698218754600324e-05, "loss": 0.0896, "step": 16770 }, { "epoch": 39.01, "grad_norm": 3.2442426681518555, "learning_rate": 2.4679817459149127e-05, "loss": 0.0009, "step": 16780 }, { "epoch": 39.01, "grad_norm": 0.0036344637628644705, "learning_rate": 2.4661416163697927e-05, "loss": 0.0018, "step": 16790 }, { "epoch": 39.01, "grad_norm": 0.002372809685766697, "learning_rate": 2.4643014868246724e-05, "loss": 0.0602, "step": 16800 }, { "epoch": 39.01, "eval_accuracy": 0.7308558558558559, "eval_loss": 1.617972731590271, "eval_runtime": 38.5744, "eval_samples_per_second": 23.02, "eval_steps_per_second": 1.918, "step": 16800 }, { "epoch": 40.0, "grad_norm": 0.004174708854407072, "learning_rate": 2.4624613572795528e-05, "loss": 0.0004, "step": 16810 }, { "epoch": 40.0, "grad_norm": 0.009840855374932289, "learning_rate": 2.4606212277344324e-05, "loss": 0.0135, "step": 16820 }, { "epoch": 40.0, "grad_norm": 0.013369137421250343, "learning_rate": 2.4587810981893124e-05, "loss": 0.0318, "step": 16830 }, { "epoch": 40.0, "grad_norm": 0.0507052019238472, "learning_rate": 2.4569409686441928e-05, "loss": 0.0004, "step": 16840 }, { "epoch": 40.0, "grad_norm": 0.04035002738237381, "learning_rate": 2.4551008390990725e-05, "loss": 0.0084, "step": 16850 }, { "epoch": 40.0, "grad_norm": 0.11788014322519302, "learning_rate": 2.4532607095539528e-05, "loss": 0.0253, "step": 16860 }, { "epoch": 40.0, "grad_norm": 0.003910732455551624, "learning_rate": 2.451420580008833e-05, "loss": 0.0002, "step": 16870 }, { "epoch": 40.0, "grad_norm": 0.7661402225494385, "learning_rate": 2.4495804504637125e-05, "loss": 0.0299, "step": 16880 }, { "epoch": 40.0, "grad_norm": 0.026837226003408432, "learning_rate": 2.447740320918593e-05, "loss": 0.0003, "step": 16890 }, { "epoch": 40.0, "grad_norm": 13.870619773864746, "learning_rate": 2.445900191373473e-05, "loss": 0.0911, "step": 16900 }, { "epoch": 40.0, "grad_norm": 0.41090813279151917, "learning_rate": 2.444060061828353e-05, "loss": 0.0368, "step": 16910 }, { "epoch": 40.0, "grad_norm": 0.0015127554070204496, "learning_rate": 2.442219932283233e-05, "loss": 0.0339, "step": 16920 }, { "epoch": 40.0, "grad_norm": 0.05826074630022049, "learning_rate": 2.440379802738113e-05, "loss": 0.0242, "step": 16930 }, { "epoch": 40.0, "grad_norm": 0.01771511137485504, "learning_rate": 2.438539673192993e-05, "loss": 0.0498, "step": 16940 }, { "epoch": 40.0, "grad_norm": 0.03588107228279114, "learning_rate": 2.436699543647873e-05, "loss": 0.0668, "step": 16950 }, { "epoch": 40.01, "grad_norm": 0.004284579772502184, "learning_rate": 2.434859414102753e-05, "loss": 0.0873, "step": 16960 }, { "epoch": 40.01, "grad_norm": 13.748348236083984, "learning_rate": 2.433019284557633e-05, "loss": 0.0038, "step": 16970 }, { "epoch": 40.01, "grad_norm": 0.025629336014389992, "learning_rate": 2.431179155012513e-05, "loss": 0.001, "step": 16980 }, { "epoch": 40.01, "grad_norm": 0.0758618637919426, "learning_rate": 2.429339025467393e-05, "loss": 0.0006, "step": 16990 }, { "epoch": 40.01, "grad_norm": 0.004732039291411638, "learning_rate": 2.427498895922273e-05, "loss": 0.054, "step": 17000 }, { "epoch": 40.01, "grad_norm": 0.05720449239015579, "learning_rate": 2.425658766377153e-05, "loss": 0.0007, "step": 17010 }, { "epoch": 40.01, "grad_norm": 0.14128656685352325, "learning_rate": 2.423818636832033e-05, "loss": 0.0004, "step": 17020 }, { "epoch": 40.01, "grad_norm": 0.0036420163232833147, "learning_rate": 2.421978507286913e-05, "loss": 0.034, "step": 17030 }, { "epoch": 40.01, "grad_norm": 0.09513210505247116, "learning_rate": 2.4201383777417934e-05, "loss": 0.0915, "step": 17040 }, { "epoch": 40.01, "grad_norm": 0.025200609117746353, "learning_rate": 2.418298248196673e-05, "loss": 0.1074, "step": 17050 }, { "epoch": 40.01, "grad_norm": 65.16971588134766, "learning_rate": 2.416458118651553e-05, "loss": 0.0952, "step": 17060 }, { "epoch": 40.01, "grad_norm": 0.010291090235114098, "learning_rate": 2.4146179891064334e-05, "loss": 0.0003, "step": 17070 }, { "epoch": 40.01, "grad_norm": 0.7548648715019226, "learning_rate": 2.412777859561313e-05, "loss": 0.0522, "step": 17080 }, { "epoch": 40.01, "grad_norm": 0.024852802976965904, "learning_rate": 2.4109377300161934e-05, "loss": 0.0224, "step": 17090 }, { "epoch": 40.01, "grad_norm": 10.804015159606934, "learning_rate": 2.4090976004710734e-05, "loss": 0.1196, "step": 17100 }, { "epoch": 40.01, "grad_norm": 19.513050079345703, "learning_rate": 2.407257470925953e-05, "loss": 0.0707, "step": 17110 }, { "epoch": 40.01, "grad_norm": 0.014289339073002338, "learning_rate": 2.4054173413808335e-05, "loss": 0.0865, "step": 17120 }, { "epoch": 40.01, "grad_norm": 37.88726806640625, "learning_rate": 2.4035772118357135e-05, "loss": 0.0773, "step": 17130 }, { "epoch": 40.01, "grad_norm": 0.6281788349151611, "learning_rate": 2.401737082290593e-05, "loss": 0.0007, "step": 17140 }, { "epoch": 40.01, "grad_norm": 0.09738222509622574, "learning_rate": 2.3998969527454735e-05, "loss": 0.0008, "step": 17150 }, { "epoch": 40.01, "grad_norm": 0.0028530398849397898, "learning_rate": 2.3980568232003535e-05, "loss": 0.0301, "step": 17160 }, { "epoch": 40.01, "grad_norm": 11.567769050598145, "learning_rate": 2.3962166936552335e-05, "loss": 0.0078, "step": 17170 }, { "epoch": 40.01, "grad_norm": 0.010698237456381321, "learning_rate": 2.3943765641101135e-05, "loss": 0.0003, "step": 17180 }, { "epoch": 40.01, "grad_norm": 0.0026003606617450714, "learning_rate": 2.3925364345649935e-05, "loss": 0.0827, "step": 17190 }, { "epoch": 40.01, "grad_norm": 0.004792836960405111, "learning_rate": 2.3906963050198736e-05, "loss": 0.0963, "step": 17200 }, { "epoch": 40.01, "grad_norm": 0.004160434473305941, "learning_rate": 2.3888561754747536e-05, "loss": 0.0004, "step": 17210 }, { "epoch": 40.01, "grad_norm": 0.0015712743625044823, "learning_rate": 2.3870160459296336e-05, "loss": 0.0977, "step": 17220 }, { "epoch": 40.01, "eval_accuracy": 0.7353603603603603, "eval_loss": 1.5709514617919922, "eval_runtime": 188.7609, "eval_samples_per_second": 4.704, "eval_steps_per_second": 0.392, "step": 17220 }, { "epoch": 41.0, "grad_norm": 6.6910247802734375, "learning_rate": 2.3851759163845136e-05, "loss": 0.0501, "step": 17230 }, { "epoch": 41.0, "grad_norm": 0.0936957523226738, "learning_rate": 2.3833357868393936e-05, "loss": 0.0745, "step": 17240 }, { "epoch": 41.0, "grad_norm": 44.87736892700195, "learning_rate": 2.3814956572942736e-05, "loss": 0.0959, "step": 17250 }, { "epoch": 41.0, "grad_norm": 0.6046668291091919, "learning_rate": 2.3796555277491536e-05, "loss": 0.0053, "step": 17260 }, { "epoch": 41.0, "grad_norm": 0.00754895992577076, "learning_rate": 2.3778153982040336e-05, "loss": 0.0051, "step": 17270 }, { "epoch": 41.0, "grad_norm": 0.04283663257956505, "learning_rate": 2.3759752686589136e-05, "loss": 0.1071, "step": 17280 }, { "epoch": 41.0, "grad_norm": 0.02367532253265381, "learning_rate": 2.3741351391137937e-05, "loss": 0.0285, "step": 17290 }, { "epoch": 41.0, "grad_norm": 0.08702404052019119, "learning_rate": 2.372295009568674e-05, "loss": 0.0055, "step": 17300 }, { "epoch": 41.0, "grad_norm": 0.0050528873689472675, "learning_rate": 2.3704548800235537e-05, "loss": 0.0553, "step": 17310 }, { "epoch": 41.0, "grad_norm": 0.03083922155201435, "learning_rate": 2.3686147504784337e-05, "loss": 0.0396, "step": 17320 }, { "epoch": 41.0, "grad_norm": 0.008288837969303131, "learning_rate": 2.3667746209333137e-05, "loss": 0.012, "step": 17330 }, { "epoch": 41.0, "grad_norm": 1.1453498601913452, "learning_rate": 2.3649344913881937e-05, "loss": 0.0006, "step": 17340 }, { "epoch": 41.0, "grad_norm": 0.010078281164169312, "learning_rate": 2.363094361843074e-05, "loss": 0.0316, "step": 17350 }, { "epoch": 41.0, "grad_norm": 18.37259864807129, "learning_rate": 2.3612542322979537e-05, "loss": 0.0638, "step": 17360 }, { "epoch": 41.0, "grad_norm": 0.00207115919329226, "learning_rate": 2.3594141027528337e-05, "loss": 0.0002, "step": 17370 }, { "epoch": 41.01, "grad_norm": 0.03447471559047699, "learning_rate": 2.357573973207714e-05, "loss": 0.047, "step": 17380 }, { "epoch": 41.01, "grad_norm": 0.02097044140100479, "learning_rate": 2.3557338436625938e-05, "loss": 0.0003, "step": 17390 }, { "epoch": 41.01, "grad_norm": 0.001427238341420889, "learning_rate": 2.353893714117474e-05, "loss": 0.0122, "step": 17400 }, { "epoch": 41.01, "grad_norm": 0.0013924982631579041, "learning_rate": 2.352053584572354e-05, "loss": 0.0134, "step": 17410 }, { "epoch": 41.01, "grad_norm": 0.22463291883468628, "learning_rate": 2.3502134550272338e-05, "loss": 0.0424, "step": 17420 }, { "epoch": 41.01, "grad_norm": 0.1324460357427597, "learning_rate": 2.348373325482114e-05, "loss": 0.0004, "step": 17430 }, { "epoch": 41.01, "grad_norm": 0.015974344685673714, "learning_rate": 2.346533195936994e-05, "loss": 0.022, "step": 17440 }, { "epoch": 41.01, "grad_norm": 0.0521487221121788, "learning_rate": 2.344693066391874e-05, "loss": 0.0183, "step": 17450 }, { "epoch": 41.01, "grad_norm": 1.8123937845230103, "learning_rate": 2.3428529368467542e-05, "loss": 0.0009, "step": 17460 }, { "epoch": 41.01, "grad_norm": 0.06810711324214935, "learning_rate": 2.3410128073016342e-05, "loss": 0.0933, "step": 17470 }, { "epoch": 41.01, "grad_norm": 0.24335838854312897, "learning_rate": 2.3391726777565142e-05, "loss": 0.0208, "step": 17480 }, { "epoch": 41.01, "grad_norm": 0.029186120256781578, "learning_rate": 2.3373325482113942e-05, "loss": 0.0566, "step": 17490 }, { "epoch": 41.01, "grad_norm": 7.900701999664307, "learning_rate": 2.3354924186662742e-05, "loss": 0.0158, "step": 17500 }, { "epoch": 41.01, "grad_norm": 0.007547201123088598, "learning_rate": 2.3336522891211542e-05, "loss": 0.1573, "step": 17510 }, { "epoch": 41.01, "grad_norm": 0.07081840187311172, "learning_rate": 2.3318121595760343e-05, "loss": 0.011, "step": 17520 }, { "epoch": 41.01, "grad_norm": 3.3901169300079346, "learning_rate": 2.3299720300309143e-05, "loss": 0.0747, "step": 17530 }, { "epoch": 41.01, "grad_norm": 15.250762939453125, "learning_rate": 2.3281319004857943e-05, "loss": 0.0037, "step": 17540 }, { "epoch": 41.01, "grad_norm": 0.00895916298031807, "learning_rate": 2.3262917709406743e-05, "loss": 0.0028, "step": 17550 }, { "epoch": 41.01, "grad_norm": 0.008652539923787117, "learning_rate": 2.3244516413955543e-05, "loss": 0.0022, "step": 17560 }, { "epoch": 41.01, "grad_norm": 0.004491760861128569, "learning_rate": 2.3226115118504343e-05, "loss": 0.051, "step": 17570 }, { "epoch": 41.01, "grad_norm": 0.0023870787117630243, "learning_rate": 2.3207713823053143e-05, "loss": 0.1203, "step": 17580 }, { "epoch": 41.01, "grad_norm": 0.007122043985873461, "learning_rate": 2.3189312527601943e-05, "loss": 0.0005, "step": 17590 }, { "epoch": 41.01, "grad_norm": 0.003632687497884035, "learning_rate": 2.3170911232150744e-05, "loss": 0.0662, "step": 17600 }, { "epoch": 41.01, "grad_norm": 5.689450263977051, "learning_rate": 2.3152509936699547e-05, "loss": 0.0521, "step": 17610 }, { "epoch": 41.01, "grad_norm": 12.24816608428955, "learning_rate": 2.3134108641248344e-05, "loss": 0.0515, "step": 17620 }, { "epoch": 41.01, "grad_norm": 10.62009334564209, "learning_rate": 2.3115707345797144e-05, "loss": 0.0531, "step": 17630 }, { "epoch": 41.01, "grad_norm": 0.02156691811978817, "learning_rate": 2.3097306050345947e-05, "loss": 0.0606, "step": 17640 }, { "epoch": 41.01, "eval_accuracy": 0.7342342342342343, "eval_loss": 1.3908066749572754, "eval_runtime": 193.3149, "eval_samples_per_second": 4.594, "eval_steps_per_second": 0.383, "step": 17640 }, { "epoch": 42.0, "grad_norm": 0.056538455188274384, "learning_rate": 2.3078904754894744e-05, "loss": 0.0007, "step": 17650 }, { "epoch": 42.0, "grad_norm": 0.012973904609680176, "learning_rate": 2.3060503459443548e-05, "loss": 0.0648, "step": 17660 }, { "epoch": 42.0, "grad_norm": 0.43850526213645935, "learning_rate": 2.3042102163992348e-05, "loss": 0.0018, "step": 17670 }, { "epoch": 42.0, "grad_norm": 12.316707611083984, "learning_rate": 2.3023700868541144e-05, "loss": 0.0728, "step": 17680 }, { "epoch": 42.0, "grad_norm": 0.034915756434202194, "learning_rate": 2.3005299573089948e-05, "loss": 0.0126, "step": 17690 }, { "epoch": 42.0, "grad_norm": 14.699057579040527, "learning_rate": 2.2986898277638748e-05, "loss": 0.0525, "step": 17700 }, { "epoch": 42.0, "grad_norm": 0.06873279809951782, "learning_rate": 2.2968496982187548e-05, "loss": 0.0026, "step": 17710 }, { "epoch": 42.0, "grad_norm": 0.032149434089660645, "learning_rate": 2.2950095686736348e-05, "loss": 0.0307, "step": 17720 }, { "epoch": 42.0, "grad_norm": 0.002019342966377735, "learning_rate": 2.293169439128515e-05, "loss": 0.0006, "step": 17730 }, { "epoch": 42.0, "grad_norm": 0.028612077236175537, "learning_rate": 2.291329309583395e-05, "loss": 0.0774, "step": 17740 }, { "epoch": 42.0, "grad_norm": 0.005914296023547649, "learning_rate": 2.289489180038275e-05, "loss": 0.0439, "step": 17750 }, { "epoch": 42.0, "grad_norm": 0.006604051683098078, "learning_rate": 2.2876490504931545e-05, "loss": 0.0078, "step": 17760 }, { "epoch": 42.0, "grad_norm": 0.3514362573623657, "learning_rate": 2.285808920948035e-05, "loss": 0.0004, "step": 17770 }, { "epoch": 42.0, "grad_norm": 0.6133325099945068, "learning_rate": 2.283968791402915e-05, "loss": 0.002, "step": 17780 }, { "epoch": 42.0, "grad_norm": 34.64471435546875, "learning_rate": 2.282128661857795e-05, "loss": 0.1041, "step": 17790 }, { "epoch": 42.01, "grad_norm": 0.014041568152606487, "learning_rate": 2.280288532312675e-05, "loss": 0.0008, "step": 17800 }, { "epoch": 42.01, "grad_norm": 0.03229415416717529, "learning_rate": 2.278448402767555e-05, "loss": 0.036, "step": 17810 }, { "epoch": 42.01, "grad_norm": 0.004131761845201254, "learning_rate": 2.276608273222435e-05, "loss": 0.0136, "step": 17820 }, { "epoch": 42.01, "grad_norm": 0.006716595031321049, "learning_rate": 2.274768143677315e-05, "loss": 0.0645, "step": 17830 }, { "epoch": 42.01, "grad_norm": 10.16481876373291, "learning_rate": 2.2729280141321953e-05, "loss": 0.0539, "step": 17840 }, { "epoch": 42.01, "grad_norm": 0.02316008321940899, "learning_rate": 2.271087884587075e-05, "loss": 0.0512, "step": 17850 }, { "epoch": 42.01, "grad_norm": 0.31423234939575195, "learning_rate": 2.269247755041955e-05, "loss": 0.0007, "step": 17860 }, { "epoch": 42.01, "grad_norm": 0.010419754311442375, "learning_rate": 2.267407625496835e-05, "loss": 0.0003, "step": 17870 }, { "epoch": 42.01, "grad_norm": 0.003867323510348797, "learning_rate": 2.265567495951715e-05, "loss": 0.001, "step": 17880 }, { "epoch": 42.01, "grad_norm": 0.007264145649969578, "learning_rate": 2.263727366406595e-05, "loss": 0.0104, "step": 17890 }, { "epoch": 42.01, "grad_norm": 0.06619902700185776, "learning_rate": 2.261887236861475e-05, "loss": 0.0032, "step": 17900 }, { "epoch": 42.01, "grad_norm": 0.0021775520872324705, "learning_rate": 2.260047107316355e-05, "loss": 0.001, "step": 17910 }, { "epoch": 42.01, "grad_norm": 0.032563529908657074, "learning_rate": 2.2582069777712354e-05, "loss": 0.0634, "step": 17920 }, { "epoch": 42.01, "grad_norm": 0.010027103126049042, "learning_rate": 2.256366848226115e-05, "loss": 0.0072, "step": 17930 }, { "epoch": 42.01, "grad_norm": 0.003468131646513939, "learning_rate": 2.254526718680995e-05, "loss": 0.0004, "step": 17940 }, { "epoch": 42.01, "grad_norm": 0.0016712818760424852, "learning_rate": 2.2526865891358754e-05, "loss": 0.0016, "step": 17950 }, { "epoch": 42.01, "grad_norm": 0.004305675625801086, "learning_rate": 2.250846459590755e-05, "loss": 0.0005, "step": 17960 }, { "epoch": 42.01, "grad_norm": 0.0015782952541485429, "learning_rate": 2.2490063300456355e-05, "loss": 0.0002, "step": 17970 }, { "epoch": 42.01, "grad_norm": 0.024773990735411644, "learning_rate": 2.2471662005005155e-05, "loss": 0.0228, "step": 17980 }, { "epoch": 42.01, "grad_norm": 0.0037998452316969633, "learning_rate": 2.245326070955395e-05, "loss": 0.0525, "step": 17990 }, { "epoch": 42.01, "grad_norm": 0.003529587760567665, "learning_rate": 2.2434859414102755e-05, "loss": 0.0001, "step": 18000 }, { "epoch": 42.01, "grad_norm": 0.008061232976615429, "learning_rate": 2.2416458118651555e-05, "loss": 0.0019, "step": 18010 }, { "epoch": 42.01, "grad_norm": 0.007496473845094442, "learning_rate": 2.2398056823200355e-05, "loss": 0.0001, "step": 18020 }, { "epoch": 42.01, "grad_norm": 0.01914142817258835, "learning_rate": 2.2379655527749155e-05, "loss": 0.0379, "step": 18030 }, { "epoch": 42.01, "grad_norm": 0.08634476363658905, "learning_rate": 2.2361254232297955e-05, "loss": 0.1079, "step": 18040 }, { "epoch": 42.01, "grad_norm": 0.006580962333828211, "learning_rate": 2.2342852936846755e-05, "loss": 0.0002, "step": 18050 }, { "epoch": 42.01, "grad_norm": 0.005276253912597895, "learning_rate": 2.2324451641395556e-05, "loss": 0.1046, "step": 18060 }, { "epoch": 42.01, "eval_accuracy": 0.7252252252252253, "eval_loss": 1.7845572233200073, "eval_runtime": 190.8962, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.388, "step": 18060 }, { "epoch": 43.0, "grad_norm": 0.0017791197169572115, "learning_rate": 2.2306050345944356e-05, "loss": 0.0002, "step": 18070 }, { "epoch": 43.0, "grad_norm": 0.03276696428656578, "learning_rate": 2.2287649050493156e-05, "loss": 0.0003, "step": 18080 }, { "epoch": 43.0, "grad_norm": 0.01086380984634161, "learning_rate": 2.2269247755041956e-05, "loss": 0.0005, "step": 18090 }, { "epoch": 43.0, "grad_norm": 0.006562290713191032, "learning_rate": 2.2250846459590756e-05, "loss": 0.0252, "step": 18100 }, { "epoch": 43.0, "grad_norm": 0.0010953620076179504, "learning_rate": 2.2232445164139556e-05, "loss": 0.0096, "step": 18110 }, { "epoch": 43.0, "grad_norm": 0.0018935714615508914, "learning_rate": 2.2214043868688356e-05, "loss": 0.0011, "step": 18120 }, { "epoch": 43.0, "grad_norm": 0.005813705734908581, "learning_rate": 2.2195642573237156e-05, "loss": 0.1357, "step": 18130 }, { "epoch": 43.0, "grad_norm": 0.004509144462645054, "learning_rate": 2.2177241277785957e-05, "loss": 0.0003, "step": 18140 }, { "epoch": 43.0, "grad_norm": 0.08074437826871872, "learning_rate": 2.215883998233476e-05, "loss": 0.0007, "step": 18150 }, { "epoch": 43.0, "grad_norm": 0.002948348643258214, "learning_rate": 2.2140438686883557e-05, "loss": 0.0001, "step": 18160 }, { "epoch": 43.0, "grad_norm": 0.006361374631524086, "learning_rate": 2.2122037391432357e-05, "loss": 0.0031, "step": 18170 }, { "epoch": 43.0, "grad_norm": 9.16525936126709, "learning_rate": 2.210363609598116e-05, "loss": 0.0998, "step": 18180 }, { "epoch": 43.0, "grad_norm": 19.75349998474121, "learning_rate": 2.2085234800529957e-05, "loss": 0.0581, "step": 18190 }, { "epoch": 43.0, "grad_norm": 0.005757891573011875, "learning_rate": 2.2066833505078757e-05, "loss": 0.0717, "step": 18200 }, { "epoch": 43.0, "grad_norm": 0.002932116389274597, "learning_rate": 2.204843220962756e-05, "loss": 0.1433, "step": 18210 }, { "epoch": 43.01, "grad_norm": 0.030524814501404762, "learning_rate": 2.2030030914176357e-05, "loss": 0.0004, "step": 18220 }, { "epoch": 43.01, "grad_norm": 0.13268819451332092, "learning_rate": 2.201162961872516e-05, "loss": 0.0009, "step": 18230 }, { "epoch": 43.01, "grad_norm": 21.9083309173584, "learning_rate": 2.199322832327396e-05, "loss": 0.0293, "step": 18240 }, { "epoch": 43.01, "grad_norm": 0.02300228551030159, "learning_rate": 2.1974827027822758e-05, "loss": 0.0245, "step": 18250 }, { "epoch": 43.01, "grad_norm": 0.0050278110429644585, "learning_rate": 2.195642573237156e-05, "loss": 0.0001, "step": 18260 }, { "epoch": 43.01, "grad_norm": 0.04535319283604622, "learning_rate": 2.1938024436920358e-05, "loss": 0.0991, "step": 18270 }, { "epoch": 43.01, "grad_norm": 0.025395981967449188, "learning_rate": 2.191962314146916e-05, "loss": 0.0361, "step": 18280 }, { "epoch": 43.01, "grad_norm": 0.09500641375780106, "learning_rate": 2.190122184601796e-05, "loss": 0.0028, "step": 18290 }, { "epoch": 43.01, "grad_norm": 0.02161112241446972, "learning_rate": 2.188282055056676e-05, "loss": 0.0089, "step": 18300 }, { "epoch": 43.01, "grad_norm": 0.03070366010069847, "learning_rate": 2.1864419255115562e-05, "loss": 0.1413, "step": 18310 }, { "epoch": 43.01, "grad_norm": 0.0025216133799403906, "learning_rate": 2.1846017959664362e-05, "loss": 0.0003, "step": 18320 }, { "epoch": 43.01, "grad_norm": 0.544791042804718, "learning_rate": 2.1827616664213162e-05, "loss": 0.0926, "step": 18330 }, { "epoch": 43.01, "grad_norm": 0.08401042222976685, "learning_rate": 2.1809215368761962e-05, "loss": 0.0004, "step": 18340 }, { "epoch": 43.01, "grad_norm": 0.05331547558307648, "learning_rate": 2.1790814073310762e-05, "loss": 0.057, "step": 18350 }, { "epoch": 43.01, "grad_norm": 0.07863025367259979, "learning_rate": 2.1772412777859562e-05, "loss": 0.0571, "step": 18360 }, { "epoch": 43.01, "grad_norm": 0.03373830392956734, "learning_rate": 2.1754011482408363e-05, "loss": 0.0863, "step": 18370 }, { "epoch": 43.01, "grad_norm": 0.0050715371035039425, "learning_rate": 2.1735610186957163e-05, "loss": 0.0542, "step": 18380 }, { "epoch": 43.01, "grad_norm": 0.019844966009259224, "learning_rate": 2.1717208891505963e-05, "loss": 0.033, "step": 18390 }, { "epoch": 43.01, "grad_norm": 12.274739265441895, "learning_rate": 2.1698807596054763e-05, "loss": 0.0837, "step": 18400 }, { "epoch": 43.01, "grad_norm": 0.0111334677785635, "learning_rate": 2.1680406300603563e-05, "loss": 0.0326, "step": 18410 }, { "epoch": 43.01, "grad_norm": 0.011171177960932255, "learning_rate": 2.1662005005152363e-05, "loss": 0.0048, "step": 18420 }, { "epoch": 43.01, "grad_norm": 2.7167227268218994, "learning_rate": 2.1643603709701163e-05, "loss": 0.0808, "step": 18430 }, { "epoch": 43.01, "grad_norm": 0.005512547213584185, "learning_rate": 2.1625202414249963e-05, "loss": 0.0954, "step": 18440 }, { "epoch": 43.01, "grad_norm": 0.008709631860256195, "learning_rate": 2.1606801118798763e-05, "loss": 0.0004, "step": 18450 }, { "epoch": 43.01, "grad_norm": 24.202104568481445, "learning_rate": 2.1588399823347567e-05, "loss": 0.059, "step": 18460 }, { "epoch": 43.01, "grad_norm": 0.03742791339755058, "learning_rate": 2.1569998527896364e-05, "loss": 0.0772, "step": 18470 }, { "epoch": 43.01, "grad_norm": 0.00550016388297081, "learning_rate": 2.1551597232445164e-05, "loss": 0.0004, "step": 18480 }, { "epoch": 43.01, "eval_accuracy": 0.7240990990990991, "eval_loss": 1.6395900249481201, "eval_runtime": 109.1588, "eval_samples_per_second": 8.135, "eval_steps_per_second": 0.678, "step": 18480 }, { "epoch": 44.0, "grad_norm": 0.14148157835006714, "learning_rate": 2.1533195936993967e-05, "loss": 0.0578, "step": 18490 }, { "epoch": 44.0, "grad_norm": 30.328266143798828, "learning_rate": 2.1514794641542764e-05, "loss": 0.1007, "step": 18500 }, { "epoch": 44.0, "grad_norm": 0.004108451772481203, "learning_rate": 2.1496393346091564e-05, "loss": 0.0004, "step": 18510 }, { "epoch": 44.0, "grad_norm": 0.011362356133759022, "learning_rate": 2.1477992050640368e-05, "loss": 0.0333, "step": 18520 }, { "epoch": 44.0, "grad_norm": 0.02009459026157856, "learning_rate": 2.1459590755189164e-05, "loss": 0.0486, "step": 18530 }, { "epoch": 44.0, "grad_norm": 0.031281691044569016, "learning_rate": 2.1441189459737968e-05, "loss": 0.0411, "step": 18540 }, { "epoch": 44.0, "grad_norm": 0.006339728366583586, "learning_rate": 2.1422788164286768e-05, "loss": 0.0003, "step": 18550 }, { "epoch": 44.0, "grad_norm": 0.00324187777005136, "learning_rate": 2.1404386868835565e-05, "loss": 0.0004, "step": 18560 }, { "epoch": 44.0, "grad_norm": 47.98183822631836, "learning_rate": 2.1385985573384368e-05, "loss": 0.1047, "step": 18570 }, { "epoch": 44.0, "grad_norm": 6.298222064971924, "learning_rate": 2.136758427793317e-05, "loss": 0.0573, "step": 18580 }, { "epoch": 44.0, "grad_norm": 10.068635940551758, "learning_rate": 2.134918298248197e-05, "loss": 0.0494, "step": 18590 }, { "epoch": 44.0, "grad_norm": 0.14003126323223114, "learning_rate": 2.133078168703077e-05, "loss": 0.0006, "step": 18600 }, { "epoch": 44.0, "grad_norm": 0.020095407962799072, "learning_rate": 2.131238039157957e-05, "loss": 0.0159, "step": 18610 }, { "epoch": 44.0, "grad_norm": 0.0022556742187589407, "learning_rate": 2.129397909612837e-05, "loss": 0.0012, "step": 18620 }, { "epoch": 44.0, "grad_norm": 0.02804502658545971, "learning_rate": 2.127557780067717e-05, "loss": 0.0357, "step": 18630 }, { "epoch": 44.01, "grad_norm": 0.009413721971213818, "learning_rate": 2.125717650522597e-05, "loss": 0.0004, "step": 18640 }, { "epoch": 44.01, "grad_norm": 0.008668404072523117, "learning_rate": 2.123877520977477e-05, "loss": 0.0004, "step": 18650 }, { "epoch": 44.01, "grad_norm": 0.006598074920475483, "learning_rate": 2.122037391432357e-05, "loss": 0.0004, "step": 18660 }, { "epoch": 44.01, "grad_norm": 0.005593809299170971, "learning_rate": 2.120197261887237e-05, "loss": 0.038, "step": 18670 }, { "epoch": 44.01, "grad_norm": 0.013844887726008892, "learning_rate": 2.118357132342117e-05, "loss": 0.0556, "step": 18680 }, { "epoch": 44.01, "grad_norm": 0.032117415219545364, "learning_rate": 2.116517002796997e-05, "loss": 0.0303, "step": 18690 }, { "epoch": 44.01, "grad_norm": 0.04788897559046745, "learning_rate": 2.114676873251877e-05, "loss": 0.0002, "step": 18700 }, { "epoch": 44.01, "grad_norm": 0.0030235203448683023, "learning_rate": 2.112836743706757e-05, "loss": 0.0002, "step": 18710 }, { "epoch": 44.01, "grad_norm": 0.00274168630130589, "learning_rate": 2.1109966141616373e-05, "loss": 0.0342, "step": 18720 }, { "epoch": 44.01, "grad_norm": 3.2250986099243164, "learning_rate": 2.109156484616517e-05, "loss": 0.0497, "step": 18730 }, { "epoch": 44.01, "grad_norm": 0.0157657228410244, "learning_rate": 2.107316355071397e-05, "loss": 0.1054, "step": 18740 }, { "epoch": 44.01, "grad_norm": 0.0087531553581357, "learning_rate": 2.1054762255262774e-05, "loss": 0.0018, "step": 18750 }, { "epoch": 44.01, "grad_norm": 0.17030996084213257, "learning_rate": 2.103636095981157e-05, "loss": 0.0336, "step": 18760 }, { "epoch": 44.01, "grad_norm": 0.07173646986484528, "learning_rate": 2.1017959664360374e-05, "loss": 0.0003, "step": 18770 }, { "epoch": 44.01, "grad_norm": 0.025418242439627647, "learning_rate": 2.0999558368909174e-05, "loss": 0.0097, "step": 18780 }, { "epoch": 44.01, "grad_norm": 15.252046585083008, "learning_rate": 2.098115707345797e-05, "loss": 0.0543, "step": 18790 }, { "epoch": 44.01, "grad_norm": 0.01682462729513645, "learning_rate": 2.0962755778006774e-05, "loss": 0.0023, "step": 18800 }, { "epoch": 44.01, "grad_norm": 0.1344674676656723, "learning_rate": 2.094435448255557e-05, "loss": 0.0003, "step": 18810 }, { "epoch": 44.01, "grad_norm": 0.01934729889035225, "learning_rate": 2.092595318710437e-05, "loss": 0.0003, "step": 18820 }, { "epoch": 44.01, "grad_norm": 0.0010090465657413006, "learning_rate": 2.0907551891653175e-05, "loss": 0.0477, "step": 18830 }, { "epoch": 44.01, "grad_norm": 0.005481057800352573, "learning_rate": 2.088915059620197e-05, "loss": 0.0017, "step": 18840 }, { "epoch": 44.01, "grad_norm": 0.12130451202392578, "learning_rate": 2.0870749300750775e-05, "loss": 0.1554, "step": 18850 }, { "epoch": 44.01, "grad_norm": 0.010078057646751404, "learning_rate": 2.0852348005299575e-05, "loss": 0.0126, "step": 18860 }, { "epoch": 44.01, "grad_norm": 0.0075241439044475555, "learning_rate": 2.0833946709848372e-05, "loss": 0.0003, "step": 18870 }, { "epoch": 44.01, "grad_norm": 0.03676333650946617, "learning_rate": 2.0815545414397175e-05, "loss": 0.0013, "step": 18880 }, { "epoch": 44.01, "grad_norm": 0.003307629842311144, "learning_rate": 2.0797144118945975e-05, "loss": 0.0662, "step": 18890 }, { "epoch": 44.01, "grad_norm": 0.13727083802223206, "learning_rate": 2.0778742823494775e-05, "loss": 0.0881, "step": 18900 }, { "epoch": 44.01, "eval_accuracy": 0.7195945945945946, "eval_loss": 1.6206213235855103, "eval_runtime": 113.0431, "eval_samples_per_second": 7.855, "eval_steps_per_second": 0.655, "step": 18900 }, { "epoch": 45.0, "grad_norm": 7.187080383300781, "learning_rate": 2.0760341528043576e-05, "loss": 0.0406, "step": 18910 }, { "epoch": 45.0, "grad_norm": 1.3414313793182373, "learning_rate": 2.0741940232592376e-05, "loss": 0.0366, "step": 18920 }, { "epoch": 45.0, "grad_norm": 4.305115222930908, "learning_rate": 2.0723538937141176e-05, "loss": 0.0339, "step": 18930 }, { "epoch": 45.0, "grad_norm": 0.024366533383727074, "learning_rate": 2.0705137641689976e-05, "loss": 0.0021, "step": 18940 }, { "epoch": 45.0, "grad_norm": 0.06154268607497215, "learning_rate": 2.0686736346238776e-05, "loss": 0.0437, "step": 18950 }, { "epoch": 45.0, "grad_norm": 0.012441672384738922, "learning_rate": 2.0668335050787576e-05, "loss": 0.0033, "step": 18960 }, { "epoch": 45.0, "grad_norm": 0.008273870684206486, "learning_rate": 2.0649933755336376e-05, "loss": 0.0003, "step": 18970 }, { "epoch": 45.0, "grad_norm": 0.00964616984128952, "learning_rate": 2.0631532459885176e-05, "loss": 0.0002, "step": 18980 }, { "epoch": 45.0, "grad_norm": 0.004747298080474138, "learning_rate": 2.0613131164433977e-05, "loss": 0.0002, "step": 18990 }, { "epoch": 45.0, "grad_norm": 0.30449992418289185, "learning_rate": 2.0594729868982777e-05, "loss": 0.0005, "step": 19000 }, { "epoch": 45.0, "grad_norm": 0.049932073801755905, "learning_rate": 2.0576328573531577e-05, "loss": 0.1286, "step": 19010 }, { "epoch": 45.0, "grad_norm": 0.0017162609146907926, "learning_rate": 2.0557927278080377e-05, "loss": 0.0004, "step": 19020 }, { "epoch": 45.0, "grad_norm": 0.0063346978276968, "learning_rate": 2.053952598262918e-05, "loss": 0.0256, "step": 19030 }, { "epoch": 45.0, "grad_norm": 0.18762005865573883, "learning_rate": 2.0521124687177977e-05, "loss": 0.0018, "step": 19040 }, { "epoch": 45.0, "grad_norm": 0.01104031503200531, "learning_rate": 2.0502723391726777e-05, "loss": 0.0636, "step": 19050 }, { "epoch": 45.01, "grad_norm": 0.006080237217247486, "learning_rate": 2.048432209627558e-05, "loss": 0.001, "step": 19060 }, { "epoch": 45.01, "grad_norm": 0.013174448162317276, "learning_rate": 2.0465920800824377e-05, "loss": 0.0052, "step": 19070 }, { "epoch": 45.01, "grad_norm": 0.006308967713266611, "learning_rate": 2.044751950537318e-05, "loss": 0.0027, "step": 19080 }, { "epoch": 45.01, "grad_norm": 0.018024206161499023, "learning_rate": 2.042911820992198e-05, "loss": 0.0003, "step": 19090 }, { "epoch": 45.01, "grad_norm": 0.07368378341197968, "learning_rate": 2.0410716914470778e-05, "loss": 0.0002, "step": 19100 }, { "epoch": 45.01, "grad_norm": 0.09977413713932037, "learning_rate": 2.039231561901958e-05, "loss": 0.0295, "step": 19110 }, { "epoch": 45.01, "grad_norm": 0.03205341845750809, "learning_rate": 2.037391432356838e-05, "loss": 0.0007, "step": 19120 }, { "epoch": 45.01, "grad_norm": 0.0013329902430996299, "learning_rate": 2.0355513028117178e-05, "loss": 0.0739, "step": 19130 }, { "epoch": 45.01, "grad_norm": 0.014855876564979553, "learning_rate": 2.033711173266598e-05, "loss": 0.0005, "step": 19140 }, { "epoch": 45.01, "grad_norm": 0.25314536690711975, "learning_rate": 2.0318710437214782e-05, "loss": 0.0372, "step": 19150 }, { "epoch": 45.01, "grad_norm": 0.004318530671298504, "learning_rate": 2.0300309141763582e-05, "loss": 0.002, "step": 19160 }, { "epoch": 45.01, "grad_norm": 0.0036497735418379307, "learning_rate": 2.0281907846312382e-05, "loss": 0.0032, "step": 19170 }, { "epoch": 45.01, "grad_norm": 3.6807165145874023, "learning_rate": 2.0263506550861182e-05, "loss": 0.1226, "step": 19180 }, { "epoch": 45.01, "grad_norm": 0.06369734555482864, "learning_rate": 2.0245105255409982e-05, "loss": 0.0439, "step": 19190 }, { "epoch": 45.01, "grad_norm": 0.6539937257766724, "learning_rate": 2.0226703959958782e-05, "loss": 0.0295, "step": 19200 }, { "epoch": 45.01, "grad_norm": 0.0202985480427742, "learning_rate": 2.0208302664507582e-05, "loss": 0.003, "step": 19210 }, { "epoch": 45.01, "grad_norm": 0.010312013328075409, "learning_rate": 2.0189901369056383e-05, "loss": 0.0003, "step": 19220 }, { "epoch": 45.01, "grad_norm": 1.6498684883117676, "learning_rate": 2.0171500073605183e-05, "loss": 0.0075, "step": 19230 }, { "epoch": 45.01, "grad_norm": 0.006631897762417793, "learning_rate": 2.0153098778153983e-05, "loss": 0.0618, "step": 19240 }, { "epoch": 45.01, "grad_norm": 0.02790352888405323, "learning_rate": 2.0134697482702783e-05, "loss": 0.0006, "step": 19250 }, { "epoch": 45.01, "grad_norm": 0.013174889609217644, "learning_rate": 2.0116296187251586e-05, "loss": 0.0065, "step": 19260 }, { "epoch": 45.01, "grad_norm": 0.003927392885088921, "learning_rate": 2.0097894891800383e-05, "loss": 0.022, "step": 19270 }, { "epoch": 45.01, "grad_norm": 0.004552062135189772, "learning_rate": 2.0079493596349183e-05, "loss": 0.0006, "step": 19280 }, { "epoch": 45.01, "grad_norm": 0.005525296088308096, "learning_rate": 2.0061092300897987e-05, "loss": 0.0004, "step": 19290 }, { "epoch": 45.01, "grad_norm": 0.011126107536256313, "learning_rate": 2.0042691005446783e-05, "loss": 0.0321, "step": 19300 }, { "epoch": 45.01, "grad_norm": 0.011895339004695415, "learning_rate": 2.0024289709995584e-05, "loss": 0.0009, "step": 19310 }, { "epoch": 45.01, "grad_norm": 8.476682662963867, "learning_rate": 2.0005888414544384e-05, "loss": 0.0934, "step": 19320 }, { "epoch": 45.01, "eval_accuracy": 0.7319819819819819, "eval_loss": 1.6994054317474365, "eval_runtime": 105.9213, "eval_samples_per_second": 8.384, "eval_steps_per_second": 0.699, "step": 19320 }, { "epoch": 46.0, "grad_norm": 0.019197309389710426, "learning_rate": 1.9987487119093184e-05, "loss": 0.049, "step": 19330 }, { "epoch": 46.0, "grad_norm": 0.0017492288025096059, "learning_rate": 1.9969085823641987e-05, "loss": 0.0658, "step": 19340 }, { "epoch": 46.0, "grad_norm": 0.0031795764807611704, "learning_rate": 1.9950684528190784e-05, "loss": 0.0003, "step": 19350 }, { "epoch": 46.0, "grad_norm": 0.01530479546636343, "learning_rate": 1.9932283232739584e-05, "loss": 0.0531, "step": 19360 }, { "epoch": 46.0, "grad_norm": 0.04168014973402023, "learning_rate": 1.9913881937288388e-05, "loss": 0.0503, "step": 19370 }, { "epoch": 46.0, "grad_norm": 0.08437560498714447, "learning_rate": 1.9895480641837184e-05, "loss": 0.0356, "step": 19380 }, { "epoch": 46.0, "grad_norm": 0.010686563327908516, "learning_rate": 1.9877079346385988e-05, "loss": 0.0048, "step": 19390 }, { "epoch": 46.0, "grad_norm": 0.3396773636341095, "learning_rate": 1.9858678050934788e-05, "loss": 0.0385, "step": 19400 }, { "epoch": 46.0, "grad_norm": 0.06184344366192818, "learning_rate": 1.9840276755483585e-05, "loss": 0.0581, "step": 19410 }, { "epoch": 46.0, "grad_norm": 0.01603071019053459, "learning_rate": 1.9821875460032388e-05, "loss": 0.0802, "step": 19420 }, { "epoch": 46.0, "grad_norm": 0.007481284439563751, "learning_rate": 1.980347416458119e-05, "loss": 0.0004, "step": 19430 }, { "epoch": 46.0, "grad_norm": 0.026033438742160797, "learning_rate": 1.9785072869129985e-05, "loss": 0.0027, "step": 19440 }, { "epoch": 46.0, "grad_norm": 0.5898566246032715, "learning_rate": 1.976667157367879e-05, "loss": 0.0004, "step": 19450 }, { "epoch": 46.0, "grad_norm": 0.003322466742247343, "learning_rate": 1.974827027822759e-05, "loss": 0.0003, "step": 19460 }, { "epoch": 46.0, "grad_norm": 0.014266034588217735, "learning_rate": 1.972986898277639e-05, "loss": 0.0004, "step": 19470 }, { "epoch": 46.01, "grad_norm": 0.005200342275202274, "learning_rate": 1.971146768732519e-05, "loss": 0.0002, "step": 19480 }, { "epoch": 46.01, "grad_norm": 0.009922517463564873, "learning_rate": 1.969306639187399e-05, "loss": 0.0009, "step": 19490 }, { "epoch": 46.01, "grad_norm": 0.011363714933395386, "learning_rate": 1.967466509642279e-05, "loss": 0.0295, "step": 19500 }, { "epoch": 46.01, "grad_norm": 63.66260528564453, "learning_rate": 1.965626380097159e-05, "loss": 0.0362, "step": 19510 }, { "epoch": 46.01, "grad_norm": 0.03335092216730118, "learning_rate": 1.963786250552039e-05, "loss": 0.0003, "step": 19520 }, { "epoch": 46.01, "grad_norm": 0.0011501980479806662, "learning_rate": 1.961946121006919e-05, "loss": 0.0075, "step": 19530 }, { "epoch": 46.01, "grad_norm": 0.00867602787911892, "learning_rate": 1.960105991461799e-05, "loss": 0.0001, "step": 19540 }, { "epoch": 46.01, "grad_norm": 0.0034504039213061333, "learning_rate": 1.958265861916679e-05, "loss": 0.0002, "step": 19550 }, { "epoch": 46.01, "grad_norm": 0.001728889998048544, "learning_rate": 1.956425732371559e-05, "loss": 0.0039, "step": 19560 }, { "epoch": 46.01, "grad_norm": 0.0017645972548052669, "learning_rate": 1.9545856028264393e-05, "loss": 0.0396, "step": 19570 }, { "epoch": 46.01, "grad_norm": 0.002970011904835701, "learning_rate": 1.952745473281319e-05, "loss": 0.0248, "step": 19580 }, { "epoch": 46.01, "grad_norm": 0.01166481152176857, "learning_rate": 1.950905343736199e-05, "loss": 0.0003, "step": 19590 }, { "epoch": 46.01, "grad_norm": 0.030790084972977638, "learning_rate": 1.9490652141910794e-05, "loss": 0.0669, "step": 19600 }, { "epoch": 46.01, "grad_norm": 0.0027878263499587774, "learning_rate": 1.947225084645959e-05, "loss": 0.0031, "step": 19610 }, { "epoch": 46.01, "grad_norm": 0.0010275563690811396, "learning_rate": 1.945384955100839e-05, "loss": 0.0001, "step": 19620 }, { "epoch": 46.01, "grad_norm": 0.004202474374324083, "learning_rate": 1.9435448255557194e-05, "loss": 0.0266, "step": 19630 }, { "epoch": 46.01, "grad_norm": 1.82235586643219, "learning_rate": 1.941704696010599e-05, "loss": 0.0292, "step": 19640 }, { "epoch": 46.01, "grad_norm": 0.044389884918928146, "learning_rate": 1.9398645664654794e-05, "loss": 0.0566, "step": 19650 }, { "epoch": 46.01, "grad_norm": 0.029817450791597366, "learning_rate": 1.9380244369203594e-05, "loss": 0.0018, "step": 19660 }, { "epoch": 46.01, "grad_norm": 0.0017704556230455637, "learning_rate": 1.936184307375239e-05, "loss": 0.0367, "step": 19670 }, { "epoch": 46.01, "grad_norm": 0.004369074944406748, "learning_rate": 1.9343441778301195e-05, "loss": 0.0005, "step": 19680 }, { "epoch": 46.01, "grad_norm": 0.0013988850405439734, "learning_rate": 1.9325040482849995e-05, "loss": 0.0002, "step": 19690 }, { "epoch": 46.01, "grad_norm": 0.0056244307197630405, "learning_rate": 1.9306639187398795e-05, "loss": 0.0001, "step": 19700 }, { "epoch": 46.01, "grad_norm": 0.0017847019480541348, "learning_rate": 1.9288237891947595e-05, "loss": 0.0001, "step": 19710 }, { "epoch": 46.01, "grad_norm": 0.038331713527441025, "learning_rate": 1.9269836596496392e-05, "loss": 0.0002, "step": 19720 }, { "epoch": 46.01, "grad_norm": 0.0021360579412430525, "learning_rate": 1.9251435301045195e-05, "loss": 0.0001, "step": 19730 }, { "epoch": 46.01, "grad_norm": 0.0037925804499536753, "learning_rate": 1.9233034005593995e-05, "loss": 0.0001, "step": 19740 }, { "epoch": 46.01, "eval_accuracy": 0.7162162162162162, "eval_loss": 2.0068118572235107, "eval_runtime": 115.1092, "eval_samples_per_second": 7.714, "eval_steps_per_second": 0.643, "step": 19740 }, { "epoch": 47.0, "grad_norm": 88.71112060546875, "learning_rate": 1.9214632710142792e-05, "loss": 0.0471, "step": 19750 }, { "epoch": 47.0, "grad_norm": 0.0014459657249972224, "learning_rate": 1.9196231414691596e-05, "loss": 0.0006, "step": 19760 }, { "epoch": 47.0, "grad_norm": 0.0023679425939917564, "learning_rate": 1.9177830119240396e-05, "loss": 0.0557, "step": 19770 }, { "epoch": 47.0, "grad_norm": 63.74143600463867, "learning_rate": 1.9159428823789196e-05, "loss": 0.0349, "step": 19780 }, { "epoch": 47.0, "grad_norm": 0.000937833683565259, "learning_rate": 1.9141027528337996e-05, "loss": 0.0002, "step": 19790 }, { "epoch": 47.0, "grad_norm": 0.0016346214106306434, "learning_rate": 1.9122626232886796e-05, "loss": 0.0003, "step": 19800 }, { "epoch": 47.0, "grad_norm": 0.002341426908969879, "learning_rate": 1.9104224937435596e-05, "loss": 0.0043, "step": 19810 }, { "epoch": 47.0, "grad_norm": 0.08337362110614777, "learning_rate": 1.9085823641984396e-05, "loss": 0.0002, "step": 19820 }, { "epoch": 47.0, "grad_norm": 0.04266023263335228, "learning_rate": 1.9067422346533196e-05, "loss": 0.0002, "step": 19830 }, { "epoch": 47.0, "grad_norm": 0.001009949017316103, "learning_rate": 1.9049021051081996e-05, "loss": 0.0001, "step": 19840 }, { "epoch": 47.0, "grad_norm": 0.003605714999139309, "learning_rate": 1.9030619755630797e-05, "loss": 0.0004, "step": 19850 }, { "epoch": 47.0, "grad_norm": 0.00593281164765358, "learning_rate": 1.9012218460179597e-05, "loss": 0.0338, "step": 19860 }, { "epoch": 47.0, "grad_norm": 29.594966888427734, "learning_rate": 1.8993817164728397e-05, "loss": 0.0494, "step": 19870 }, { "epoch": 47.0, "grad_norm": 0.005202536471188068, "learning_rate": 1.89754158692772e-05, "loss": 0.0797, "step": 19880 }, { "epoch": 47.0, "grad_norm": 0.0021886222530156374, "learning_rate": 1.8957014573825997e-05, "loss": 0.0355, "step": 19890 }, { "epoch": 47.01, "grad_norm": 0.0017879304941743612, "learning_rate": 1.8938613278374797e-05, "loss": 0.0598, "step": 19900 }, { "epoch": 47.01, "grad_norm": 0.037195175886154175, "learning_rate": 1.89202119829236e-05, "loss": 0.0307, "step": 19910 }, { "epoch": 47.01, "grad_norm": 0.002196249086409807, "learning_rate": 1.8901810687472397e-05, "loss": 0.0747, "step": 19920 }, { "epoch": 47.01, "grad_norm": 0.3126240670681, "learning_rate": 1.8883409392021198e-05, "loss": 0.0001, "step": 19930 }, { "epoch": 47.01, "grad_norm": 0.003922034986317158, "learning_rate": 1.886500809657e-05, "loss": 0.0002, "step": 19940 }, { "epoch": 47.01, "grad_norm": 0.00338058453053236, "learning_rate": 1.8846606801118798e-05, "loss": 0.0572, "step": 19950 }, { "epoch": 47.01, "grad_norm": 10.452116012573242, "learning_rate": 1.88282055056676e-05, "loss": 0.1096, "step": 19960 }, { "epoch": 47.01, "grad_norm": 0.010865331627428532, "learning_rate": 1.88098042102164e-05, "loss": 0.0158, "step": 19970 }, { "epoch": 47.01, "grad_norm": 2.9410135746002197, "learning_rate": 1.8791402914765198e-05, "loss": 0.0097, "step": 19980 }, { "epoch": 47.01, "grad_norm": 103.67179870605469, "learning_rate": 1.8773001619314e-05, "loss": 0.1498, "step": 19990 }, { "epoch": 47.01, "grad_norm": 0.005131885409355164, "learning_rate": 1.8754600323862802e-05, "loss": 0.0751, "step": 20000 }, { "epoch": 47.01, "grad_norm": 0.003077390603721142, "learning_rate": 1.8736199028411602e-05, "loss": 0.0021, "step": 20010 }, { "epoch": 47.01, "grad_norm": 0.014272770844399929, "learning_rate": 1.8717797732960402e-05, "loss": 0.0003, "step": 20020 }, { "epoch": 47.01, "grad_norm": 0.05707676336169243, "learning_rate": 1.8699396437509202e-05, "loss": 0.118, "step": 20030 }, { "epoch": 47.01, "grad_norm": 0.009387039579451084, "learning_rate": 1.8680995142058002e-05, "loss": 0.0003, "step": 20040 }, { "epoch": 47.01, "grad_norm": 0.0015558353625237942, "learning_rate": 1.8662593846606802e-05, "loss": 0.0006, "step": 20050 }, { "epoch": 47.01, "grad_norm": 0.0268620066344738, "learning_rate": 1.8644192551155602e-05, "loss": 0.026, "step": 20060 }, { "epoch": 47.01, "grad_norm": 0.0657452940940857, "learning_rate": 1.8625791255704403e-05, "loss": 0.0004, "step": 20070 }, { "epoch": 47.01, "grad_norm": 0.005338532850146294, "learning_rate": 1.8607389960253203e-05, "loss": 0.0014, "step": 20080 }, { "epoch": 47.01, "grad_norm": 0.005034546833485365, "learning_rate": 1.8588988664802003e-05, "loss": 0.053, "step": 20090 }, { "epoch": 47.01, "grad_norm": 0.002866822760552168, "learning_rate": 1.8570587369350803e-05, "loss": 0.0631, "step": 20100 }, { "epoch": 47.01, "grad_norm": 0.008067389018833637, "learning_rate": 1.8552186073899603e-05, "loss": 0.0005, "step": 20110 }, { "epoch": 47.01, "grad_norm": 81.31429290771484, "learning_rate": 1.8533784778448403e-05, "loss": 0.0132, "step": 20120 }, { "epoch": 47.01, "grad_norm": 22.95867347717285, "learning_rate": 1.8515383482997203e-05, "loss": 0.0539, "step": 20130 }, { "epoch": 47.01, "grad_norm": 2.4549527168273926, "learning_rate": 1.8496982187546007e-05, "loss": 0.1012, "step": 20140 }, { "epoch": 47.01, "grad_norm": 0.003576475428417325, "learning_rate": 1.8478580892094803e-05, "loss": 0.0006, "step": 20150 }, { "epoch": 47.01, "grad_norm": 0.0023642319720238447, "learning_rate": 1.8460179596643604e-05, "loss": 0.024, "step": 20160 }, { "epoch": 47.01, "eval_accuracy": 0.7376126126126126, "eval_loss": 1.534989595413208, "eval_runtime": 67.2041, "eval_samples_per_second": 13.213, "eval_steps_per_second": 1.101, "step": 20160 }, { "epoch": 48.0, "grad_norm": 0.37329983711242676, "learning_rate": 1.8441778301192407e-05, "loss": 0.0007, "step": 20170 }, { "epoch": 48.0, "grad_norm": 0.17028795182704926, "learning_rate": 1.8423377005741204e-05, "loss": 0.0557, "step": 20180 }, { "epoch": 48.0, "grad_norm": 0.010627706535160542, "learning_rate": 1.8404975710290007e-05, "loss": 0.0456, "step": 20190 }, { "epoch": 48.0, "grad_norm": 0.0034769929479807615, "learning_rate": 1.8386574414838807e-05, "loss": 0.0014, "step": 20200 }, { "epoch": 48.0, "grad_norm": 0.016666559502482414, "learning_rate": 1.8368173119387604e-05, "loss": 0.0011, "step": 20210 }, { "epoch": 48.0, "grad_norm": 25.03653907775879, "learning_rate": 1.8349771823936408e-05, "loss": 0.0389, "step": 20220 }, { "epoch": 48.0, "grad_norm": 0.004017295315861702, "learning_rate": 1.8331370528485208e-05, "loss": 0.0222, "step": 20230 }, { "epoch": 48.0, "grad_norm": 0.1146978884935379, "learning_rate": 1.8312969233034004e-05, "loss": 0.0051, "step": 20240 }, { "epoch": 48.0, "grad_norm": 0.036740757524967194, "learning_rate": 1.8294567937582808e-05, "loss": 0.051, "step": 20250 }, { "epoch": 48.0, "grad_norm": 0.011806732974946499, "learning_rate": 1.8276166642131605e-05, "loss": 0.0082, "step": 20260 }, { "epoch": 48.0, "grad_norm": 0.007759864907711744, "learning_rate": 1.8257765346680408e-05, "loss": 0.0133, "step": 20270 }, { "epoch": 48.0, "grad_norm": 0.03835444524884224, "learning_rate": 1.823936405122921e-05, "loss": 0.0007, "step": 20280 }, { "epoch": 48.0, "grad_norm": 0.0012061005691066384, "learning_rate": 1.8220962755778005e-05, "loss": 0.0247, "step": 20290 }, { "epoch": 48.0, "grad_norm": 0.005193190183490515, "learning_rate": 1.820256146032681e-05, "loss": 0.0076, "step": 20300 }, { "epoch": 48.0, "grad_norm": 0.002150206360965967, "learning_rate": 1.818416016487561e-05, "loss": 0.0001, "step": 20310 }, { "epoch": 48.01, "grad_norm": 0.0028511222917586565, "learning_rate": 1.816575886942441e-05, "loss": 0.0002, "step": 20320 }, { "epoch": 48.01, "grad_norm": 0.002456626622006297, "learning_rate": 1.814735757397321e-05, "loss": 0.0002, "step": 20330 }, { "epoch": 48.01, "grad_norm": 0.034909721463918686, "learning_rate": 1.812895627852201e-05, "loss": 0.0001, "step": 20340 }, { "epoch": 48.01, "grad_norm": 0.008975083939731121, "learning_rate": 1.811055498307081e-05, "loss": 0.0002, "step": 20350 }, { "epoch": 48.01, "grad_norm": 0.0032727932557463646, "learning_rate": 1.809215368761961e-05, "loss": 0.0001, "step": 20360 }, { "epoch": 48.01, "grad_norm": 0.005578655283898115, "learning_rate": 1.807375239216841e-05, "loss": 0.0484, "step": 20370 }, { "epoch": 48.01, "grad_norm": 0.006554843857884407, "learning_rate": 1.805535109671721e-05, "loss": 0.0001, "step": 20380 }, { "epoch": 48.01, "grad_norm": 0.0009955121204257011, "learning_rate": 1.803694980126601e-05, "loss": 0.0346, "step": 20390 }, { "epoch": 48.01, "grad_norm": 0.008674697019159794, "learning_rate": 1.801854850581481e-05, "loss": 0.0001, "step": 20400 }, { "epoch": 48.01, "grad_norm": 0.13496126234531403, "learning_rate": 1.800014721036361e-05, "loss": 0.0169, "step": 20410 }, { "epoch": 48.01, "grad_norm": 0.00233973260037601, "learning_rate": 1.798174591491241e-05, "loss": 0.0003, "step": 20420 }, { "epoch": 48.01, "grad_norm": 0.01351605448871851, "learning_rate": 1.796334461946121e-05, "loss": 0.0487, "step": 20430 }, { "epoch": 48.01, "grad_norm": 0.003713756799697876, "learning_rate": 1.794494332401001e-05, "loss": 0.0001, "step": 20440 }, { "epoch": 48.01, "grad_norm": 0.004229304380714893, "learning_rate": 1.7926542028558814e-05, "loss": 0.0558, "step": 20450 }, { "epoch": 48.01, "grad_norm": 0.0006775284418836236, "learning_rate": 1.790814073310761e-05, "loss": 0.001, "step": 20460 }, { "epoch": 48.01, "grad_norm": 0.0010827960213646293, "learning_rate": 1.788973943765641e-05, "loss": 0.0223, "step": 20470 }, { "epoch": 48.01, "grad_norm": 0.0010313157690688968, "learning_rate": 1.7871338142205214e-05, "loss": 0.1121, "step": 20480 }, { "epoch": 48.01, "grad_norm": 2.7552216053009033, "learning_rate": 1.785293684675401e-05, "loss": 0.0005, "step": 20490 }, { "epoch": 48.01, "grad_norm": 0.016195788979530334, "learning_rate": 1.7834535551302814e-05, "loss": 0.0009, "step": 20500 }, { "epoch": 48.01, "grad_norm": 0.0725574940443039, "learning_rate": 1.7816134255851614e-05, "loss": 0.0002, "step": 20510 }, { "epoch": 48.01, "grad_norm": 0.0008012351463548839, "learning_rate": 1.779773296040041e-05, "loss": 0.0289, "step": 20520 }, { "epoch": 48.01, "grad_norm": 0.0045431689359247684, "learning_rate": 1.7779331664949215e-05, "loss": 0.0001, "step": 20530 }, { "epoch": 48.01, "grad_norm": 0.0012236082693561912, "learning_rate": 1.7760930369498015e-05, "loss": 0.005, "step": 20540 }, { "epoch": 48.01, "grad_norm": 3.1969592571258545, "learning_rate": 1.774252907404681e-05, "loss": 0.0007, "step": 20550 }, { "epoch": 48.01, "grad_norm": 0.0031280736438930035, "learning_rate": 1.7724127778595615e-05, "loss": 0.0191, "step": 20560 }, { "epoch": 48.01, "grad_norm": 0.12724265456199646, "learning_rate": 1.7705726483144415e-05, "loss": 0.0004, "step": 20570 }, { "epoch": 48.01, "grad_norm": 1.0310579538345337, "learning_rate": 1.7687325187693215e-05, "loss": 0.017, "step": 20580 }, { "epoch": 48.01, "eval_accuracy": 0.7162162162162162, "eval_loss": 1.886398434638977, "eval_runtime": 40.674, "eval_samples_per_second": 21.832, "eval_steps_per_second": 1.819, "step": 20580 }, { "epoch": 49.0, "grad_norm": 0.3085496127605438, "learning_rate": 1.7668923892242015e-05, "loss": 0.0078, "step": 20590 }, { "epoch": 49.0, "grad_norm": 0.0024839958641678095, "learning_rate": 1.7650522596790815e-05, "loss": 0.0003, "step": 20600 }, { "epoch": 49.0, "grad_norm": 0.0012105575297027826, "learning_rate": 1.7632121301339616e-05, "loss": 0.0001, "step": 20610 }, { "epoch": 49.0, "grad_norm": 1.1235806941986084, "learning_rate": 1.7613720005888416e-05, "loss": 0.0165, "step": 20620 }, { "epoch": 49.0, "grad_norm": 0.0009100966854020953, "learning_rate": 1.7595318710437216e-05, "loss": 0.0625, "step": 20630 }, { "epoch": 49.0, "grad_norm": 2.2181501388549805, "learning_rate": 1.7576917414986016e-05, "loss": 0.0104, "step": 20640 }, { "epoch": 49.0, "grad_norm": 0.11276418715715408, "learning_rate": 1.7558516119534816e-05, "loss": 0.0056, "step": 20650 }, { "epoch": 49.0, "grad_norm": 0.009900757111608982, "learning_rate": 1.7540114824083616e-05, "loss": 0.0001, "step": 20660 }, { "epoch": 49.0, "grad_norm": 0.016009271144866943, "learning_rate": 1.7521713528632416e-05, "loss": 0.0001, "step": 20670 }, { "epoch": 49.0, "grad_norm": 52.463294982910156, "learning_rate": 1.7503312233181216e-05, "loss": 0.0275, "step": 20680 }, { "epoch": 49.0, "grad_norm": 42.01731491088867, "learning_rate": 1.7484910937730016e-05, "loss": 0.0424, "step": 20690 }, { "epoch": 49.0, "grad_norm": 0.002209991682320833, "learning_rate": 1.7466509642278817e-05, "loss": 0.0118, "step": 20700 }, { "epoch": 49.0, "grad_norm": 0.0023402757942676544, "learning_rate": 1.744810834682762e-05, "loss": 0.0001, "step": 20710 }, { "epoch": 49.0, "grad_norm": 0.0017410296713933349, "learning_rate": 1.7429707051376417e-05, "loss": 0.0016, "step": 20720 }, { "epoch": 49.0, "grad_norm": 2.1473774909973145, "learning_rate": 1.7411305755925217e-05, "loss": 0.0471, "step": 20730 }, { "epoch": 49.01, "grad_norm": 0.0010794149711728096, "learning_rate": 1.739290446047402e-05, "loss": 0.0644, "step": 20740 }, { "epoch": 49.01, "grad_norm": 0.0012719827936962247, "learning_rate": 1.7374503165022817e-05, "loss": 0.0001, "step": 20750 }, { "epoch": 49.01, "grad_norm": 0.0008286166121251881, "learning_rate": 1.735610186957162e-05, "loss": 0.0004, "step": 20760 }, { "epoch": 49.01, "grad_norm": 0.0054702237248420715, "learning_rate": 1.7337700574120417e-05, "loss": 0.0002, "step": 20770 }, { "epoch": 49.01, "grad_norm": 0.009072530083358288, "learning_rate": 1.7319299278669217e-05, "loss": 0.0676, "step": 20780 }, { "epoch": 49.01, "grad_norm": 0.0006656598416157067, "learning_rate": 1.730089798321802e-05, "loss": 0.0066, "step": 20790 }, { "epoch": 49.01, "grad_norm": 0.08233454823493958, "learning_rate": 1.7282496687766818e-05, "loss": 0.0001, "step": 20800 }, { "epoch": 49.01, "grad_norm": 0.0012012611841782928, "learning_rate": 1.726409539231562e-05, "loss": 0.0001, "step": 20810 }, { "epoch": 49.01, "grad_norm": 2.6688947677612305, "learning_rate": 1.724569409686442e-05, "loss": 0.105, "step": 20820 }, { "epoch": 49.01, "grad_norm": 0.002076583681628108, "learning_rate": 1.7227292801413218e-05, "loss": 0.0192, "step": 20830 }, { "epoch": 49.01, "grad_norm": 0.0016304003074765205, "learning_rate": 1.720889150596202e-05, "loss": 0.0001, "step": 20840 }, { "epoch": 49.01, "grad_norm": 0.05900062620639801, "learning_rate": 1.7190490210510822e-05, "loss": 0.0016, "step": 20850 }, { "epoch": 49.01, "grad_norm": 0.00635264627635479, "learning_rate": 1.717208891505962e-05, "loss": 0.0001, "step": 20860 }, { "epoch": 49.01, "grad_norm": 67.24282836914062, "learning_rate": 1.7153687619608422e-05, "loss": 0.0191, "step": 20870 }, { "epoch": 49.01, "grad_norm": 0.04959714412689209, "learning_rate": 1.7135286324157222e-05, "loss": 0.0266, "step": 20880 }, { "epoch": 49.01, "grad_norm": 11.01009464263916, "learning_rate": 1.7116885028706022e-05, "loss": 0.0656, "step": 20890 }, { "epoch": 49.01, "grad_norm": 0.004300578963011503, "learning_rate": 1.7098483733254822e-05, "loss": 0.0293, "step": 20900 }, { "epoch": 49.01, "grad_norm": 0.08246956020593643, "learning_rate": 1.7080082437803622e-05, "loss": 0.0195, "step": 20910 }, { "epoch": 49.01, "grad_norm": 0.005194004625082016, "learning_rate": 1.7061681142352423e-05, "loss": 0.0145, "step": 20920 }, { "epoch": 49.01, "grad_norm": 0.007515768054872751, "learning_rate": 1.7043279846901223e-05, "loss": 0.0472, "step": 20930 }, { "epoch": 49.01, "grad_norm": 0.0014940837863832712, "learning_rate": 1.7024878551450023e-05, "loss": 0.0532, "step": 20940 }, { "epoch": 49.01, "grad_norm": 0.017531683668494225, "learning_rate": 1.7006477255998823e-05, "loss": 0.0049, "step": 20950 }, { "epoch": 49.01, "grad_norm": 0.004585204645991325, "learning_rate": 1.6988075960547623e-05, "loss": 0.0143, "step": 20960 }, { "epoch": 49.01, "grad_norm": 0.002103234874084592, "learning_rate": 1.6969674665096423e-05, "loss": 0.0896, "step": 20970 }, { "epoch": 49.01, "grad_norm": 0.0011310140835121274, "learning_rate": 1.6951273369645223e-05, "loss": 0.0074, "step": 20980 }, { "epoch": 49.01, "grad_norm": 0.04580579325556755, "learning_rate": 1.6932872074194023e-05, "loss": 0.0733, "step": 20990 }, { "epoch": 49.01, "grad_norm": 0.039697013795375824, "learning_rate": 1.6914470778742823e-05, "loss": 0.121, "step": 21000 }, { "epoch": 49.01, "eval_accuracy": 0.722972972972973, "eval_loss": 1.6861770153045654, "eval_runtime": 43.5753, "eval_samples_per_second": 20.379, "eval_steps_per_second": 1.698, "step": 21000 }, { "epoch": 50.0, "grad_norm": 0.003826475003734231, "learning_rate": 1.6896069483291624e-05, "loss": 0.0202, "step": 21010 }, { "epoch": 50.0, "grad_norm": 0.016017960384488106, "learning_rate": 1.6877668187840427e-05, "loss": 0.0314, "step": 21020 }, { "epoch": 50.0, "grad_norm": 34.23000717163086, "learning_rate": 1.6859266892389224e-05, "loss": 0.0168, "step": 21030 }, { "epoch": 50.0, "grad_norm": 11.643963813781738, "learning_rate": 1.6840865596938024e-05, "loss": 0.0135, "step": 21040 }, { "epoch": 50.0, "grad_norm": 0.32508620619773865, "learning_rate": 1.6822464301486827e-05, "loss": 0.0003, "step": 21050 }, { "epoch": 50.0, "grad_norm": 2.0690901279449463, "learning_rate": 1.6804063006035624e-05, "loss": 0.0008, "step": 21060 }, { "epoch": 50.0, "grad_norm": 0.6246060132980347, "learning_rate": 1.6785661710584428e-05, "loss": 0.003, "step": 21070 }, { "epoch": 50.0, "grad_norm": 0.011646988801658154, "learning_rate": 1.6767260415133228e-05, "loss": 0.0127, "step": 21080 }, { "epoch": 50.0, "grad_norm": 0.003978679422289133, "learning_rate": 1.6748859119682024e-05, "loss": 0.0003, "step": 21090 }, { "epoch": 50.0, "grad_norm": 0.0010175154311582446, "learning_rate": 1.6730457824230828e-05, "loss": 0.0001, "step": 21100 }, { "epoch": 50.0, "grad_norm": 0.002983462531119585, "learning_rate": 1.6712056528779628e-05, "loss": 0.0001, "step": 21110 }, { "epoch": 50.0, "grad_norm": 0.0006442716694436967, "learning_rate": 1.6693655233328425e-05, "loss": 0.0001, "step": 21120 }, { "epoch": 50.0, "grad_norm": 13.063587188720703, "learning_rate": 1.667525393787723e-05, "loss": 0.0338, "step": 21130 }, { "epoch": 50.0, "grad_norm": 72.1922836303711, "learning_rate": 1.665685264242603e-05, "loss": 0.0071, "step": 21140 }, { "epoch": 50.0, "grad_norm": 0.007421460468322039, "learning_rate": 1.663845134697483e-05, "loss": 0.0001, "step": 21150 }, { "epoch": 50.01, "grad_norm": 0.020990287885069847, "learning_rate": 1.662005005152363e-05, "loss": 0.0267, "step": 21160 }, { "epoch": 50.01, "grad_norm": 0.009370788931846619, "learning_rate": 1.660164875607243e-05, "loss": 0.054, "step": 21170 }, { "epoch": 50.01, "grad_norm": 0.0025914448779076338, "learning_rate": 1.658324746062123e-05, "loss": 0.0002, "step": 21180 }, { "epoch": 50.01, "grad_norm": 0.0516793355345726, "learning_rate": 1.656484616517003e-05, "loss": 0.0003, "step": 21190 }, { "epoch": 50.01, "grad_norm": 0.0009586882079020143, "learning_rate": 1.654644486971883e-05, "loss": 0.0002, "step": 21200 }, { "epoch": 50.01, "grad_norm": 0.06358382105827332, "learning_rate": 1.652804357426763e-05, "loss": 0.0291, "step": 21210 }, { "epoch": 50.01, "grad_norm": 0.0018362919799983501, "learning_rate": 1.650964227881643e-05, "loss": 0.0149, "step": 21220 }, { "epoch": 50.01, "grad_norm": 1.6524035930633545, "learning_rate": 1.649124098336523e-05, "loss": 0.0015, "step": 21230 }, { "epoch": 50.01, "grad_norm": 0.03593685105443001, "learning_rate": 1.647283968791403e-05, "loss": 0.0425, "step": 21240 }, { "epoch": 50.01, "grad_norm": 0.004203982185572386, "learning_rate": 1.6454438392462833e-05, "loss": 0.0003, "step": 21250 }, { "epoch": 50.01, "grad_norm": 0.0018243154045194387, "learning_rate": 1.643603709701163e-05, "loss": 0.0001, "step": 21260 }, { "epoch": 50.01, "grad_norm": 0.023052990436553955, "learning_rate": 1.641763580156043e-05, "loss": 0.0927, "step": 21270 }, { "epoch": 50.01, "grad_norm": 0.0010030419798567891, "learning_rate": 1.639923450610923e-05, "loss": 0.0499, "step": 21280 }, { "epoch": 50.01, "grad_norm": 0.0008574148523621261, "learning_rate": 1.638083321065803e-05, "loss": 0.0018, "step": 21290 }, { "epoch": 50.01, "grad_norm": 0.005586415994912386, "learning_rate": 1.636243191520683e-05, "loss": 0.0209, "step": 21300 }, { "epoch": 50.01, "grad_norm": 0.0025664528366178274, "learning_rate": 1.634403061975563e-05, "loss": 0.0015, "step": 21310 }, { "epoch": 50.01, "grad_norm": 0.0018880977295339108, "learning_rate": 1.632562932430443e-05, "loss": 0.0324, "step": 21320 }, { "epoch": 50.01, "grad_norm": 0.1711210310459137, "learning_rate": 1.6307228028853234e-05, "loss": 0.0003, "step": 21330 }, { "epoch": 50.01, "grad_norm": 0.012983834370970726, "learning_rate": 1.628882673340203e-05, "loss": 0.0074, "step": 21340 }, { "epoch": 50.01, "grad_norm": 0.003905842313542962, "learning_rate": 1.627042543795083e-05, "loss": 0.0001, "step": 21350 }, { "epoch": 50.01, "grad_norm": 0.01312506664544344, "learning_rate": 1.6252024142499634e-05, "loss": 0.0517, "step": 21360 }, { "epoch": 50.01, "grad_norm": 0.0010434292489662766, "learning_rate": 1.623362284704843e-05, "loss": 0.0655, "step": 21370 }, { "epoch": 50.01, "grad_norm": 0.006396695505827665, "learning_rate": 1.6215221551597235e-05, "loss": 0.01, "step": 21380 }, { "epoch": 50.01, "grad_norm": 0.19382664561271667, "learning_rate": 1.6196820256146035e-05, "loss": 0.0044, "step": 21390 }, { "epoch": 50.01, "grad_norm": 0.010313979350030422, "learning_rate": 1.617841896069483e-05, "loss": 0.0407, "step": 21400 }, { "epoch": 50.01, "grad_norm": 0.003178203012794256, "learning_rate": 1.6160017665243635e-05, "loss": 0.0123, "step": 21410 }, { "epoch": 50.01, "grad_norm": 0.0013943740632385015, "learning_rate": 1.6141616369792435e-05, "loss": 0.0001, "step": 21420 }, { "epoch": 50.01, "eval_accuracy": 0.7364864864864865, "eval_loss": 1.8462260961532593, "eval_runtime": 40.7016, "eval_samples_per_second": 21.817, "eval_steps_per_second": 1.818, "step": 21420 }, { "epoch": 51.0, "grad_norm": 0.0011855022748932242, "learning_rate": 1.6123215074341232e-05, "loss": 0.0004, "step": 21430 }, { "epoch": 51.0, "grad_norm": 0.006975261494517326, "learning_rate": 1.6104813778890035e-05, "loss": 0.0112, "step": 21440 }, { "epoch": 51.0, "grad_norm": 0.0014384695095941424, "learning_rate": 1.6086412483438835e-05, "loss": 0.0006, "step": 21450 }, { "epoch": 51.0, "grad_norm": 0.0062417034059762955, "learning_rate": 1.6068011187987636e-05, "loss": 0.0546, "step": 21460 }, { "epoch": 51.0, "grad_norm": 0.0007521304069086909, "learning_rate": 1.6049609892536436e-05, "loss": 0.0002, "step": 21470 }, { "epoch": 51.0, "grad_norm": 0.0006712984177283943, "learning_rate": 1.6031208597085236e-05, "loss": 0.0072, "step": 21480 }, { "epoch": 51.0, "grad_norm": 0.00106193742249161, "learning_rate": 1.6012807301634036e-05, "loss": 0.0147, "step": 21490 }, { "epoch": 51.0, "grad_norm": 0.0004047717957291752, "learning_rate": 1.5994406006182836e-05, "loss": 0.0009, "step": 21500 }, { "epoch": 51.0, "grad_norm": 0.0011434787884354591, "learning_rate": 1.5976004710731636e-05, "loss": 0.0709, "step": 21510 }, { "epoch": 51.0, "grad_norm": 16.264480590820312, "learning_rate": 1.5957603415280436e-05, "loss": 0.023, "step": 21520 }, { "epoch": 51.0, "grad_norm": 0.0010457668686285615, "learning_rate": 1.5939202119829236e-05, "loss": 0.0001, "step": 21530 }, { "epoch": 51.0, "grad_norm": 0.0018191535491496325, "learning_rate": 1.5920800824378036e-05, "loss": 0.0001, "step": 21540 }, { "epoch": 51.0, "grad_norm": 0.0005493704811669886, "learning_rate": 1.5902399528926837e-05, "loss": 0.0001, "step": 21550 }, { "epoch": 51.0, "grad_norm": 0.000556766171939671, "learning_rate": 1.588399823347564e-05, "loss": 0.0061, "step": 21560 }, { "epoch": 51.0, "grad_norm": 2.683346748352051, "learning_rate": 1.5865596938024437e-05, "loss": 0.0006, "step": 21570 }, { "epoch": 51.01, "grad_norm": 0.0024154935963451862, "learning_rate": 1.5847195642573237e-05, "loss": 0.1307, "step": 21580 }, { "epoch": 51.01, "grad_norm": 0.045853517949581146, "learning_rate": 1.582879434712204e-05, "loss": 0.0001, "step": 21590 }, { "epoch": 51.01, "grad_norm": 0.059176698327064514, "learning_rate": 1.5810393051670837e-05, "loss": 0.1623, "step": 21600 }, { "epoch": 51.01, "grad_norm": 0.010752598755061626, "learning_rate": 1.5791991756219637e-05, "loss": 0.0094, "step": 21610 }, { "epoch": 51.01, "grad_norm": 0.004883680492639542, "learning_rate": 1.577359046076844e-05, "loss": 0.0579, "step": 21620 }, { "epoch": 51.01, "grad_norm": 0.004795772023499012, "learning_rate": 1.5755189165317237e-05, "loss": 0.0003, "step": 21630 }, { "epoch": 51.01, "grad_norm": 0.0015640510246157646, "learning_rate": 1.573678786986604e-05, "loss": 0.0314, "step": 21640 }, { "epoch": 51.01, "grad_norm": 0.005454888101667166, "learning_rate": 1.571838657441484e-05, "loss": 0.0003, "step": 21650 }, { "epoch": 51.01, "grad_norm": 0.0020785073284059763, "learning_rate": 1.5699985278963638e-05, "loss": 0.0319, "step": 21660 }, { "epoch": 51.01, "grad_norm": 0.0016603749245405197, "learning_rate": 1.568158398351244e-05, "loss": 0.0004, "step": 21670 }, { "epoch": 51.01, "grad_norm": 0.0012977832229807973, "learning_rate": 1.566318268806124e-05, "loss": 0.0071, "step": 21680 }, { "epoch": 51.01, "grad_norm": 0.0006946497596800327, "learning_rate": 1.564478139261004e-05, "loss": 0.0001, "step": 21690 }, { "epoch": 51.01, "grad_norm": 0.013320227153599262, "learning_rate": 1.562638009715884e-05, "loss": 0.0002, "step": 21700 }, { "epoch": 51.01, "grad_norm": 0.002938096411526203, "learning_rate": 1.560797880170764e-05, "loss": 0.0001, "step": 21710 }, { "epoch": 51.01, "grad_norm": 0.004965800791978836, "learning_rate": 1.5589577506256442e-05, "loss": 0.0, "step": 21720 }, { "epoch": 51.01, "grad_norm": 0.0008785520331002772, "learning_rate": 1.5571176210805242e-05, "loss": 0.0179, "step": 21730 }, { "epoch": 51.01, "grad_norm": 0.002170178573578596, "learning_rate": 1.555277491535404e-05, "loss": 0.0565, "step": 21740 }, { "epoch": 51.01, "grad_norm": 0.008555195294320583, "learning_rate": 1.5534373619902842e-05, "loss": 0.0, "step": 21750 }, { "epoch": 51.01, "grad_norm": 0.0019417338771745563, "learning_rate": 1.5515972324451642e-05, "loss": 0.0001, "step": 21760 }, { "epoch": 51.01, "grad_norm": 0.0005679653841070831, "learning_rate": 1.5497571029000442e-05, "loss": 0.0, "step": 21770 }, { "epoch": 51.01, "grad_norm": 0.0006499973824247718, "learning_rate": 1.5479169733549243e-05, "loss": 0.0099, "step": 21780 }, { "epoch": 51.01, "grad_norm": 0.9170955419540405, "learning_rate": 1.5460768438098043e-05, "loss": 0.0242, "step": 21790 }, { "epoch": 51.01, "grad_norm": 0.0007745533948764205, "learning_rate": 1.5442367142646843e-05, "loss": 0.0111, "step": 21800 }, { "epoch": 51.01, "grad_norm": 0.0008127467590384185, "learning_rate": 1.5423965847195643e-05, "loss": 0.0, "step": 21810 }, { "epoch": 51.01, "grad_norm": 0.00048087682807818055, "learning_rate": 1.5405564551744443e-05, "loss": 0.0006, "step": 21820 }, { "epoch": 51.01, "grad_norm": 0.0003872001834679395, "learning_rate": 1.5387163256293243e-05, "loss": 0.0, "step": 21830 }, { "epoch": 51.01, "grad_norm": 0.0020749419927597046, "learning_rate": 1.5368761960842043e-05, "loss": 0.0319, "step": 21840 }, { "epoch": 51.01, "eval_accuracy": 0.7286036036036037, "eval_loss": 1.9071617126464844, "eval_runtime": 40.8643, "eval_samples_per_second": 21.73, "eval_steps_per_second": 1.811, "step": 21840 }, { "epoch": 52.0, "grad_norm": 0.0010090394644066691, "learning_rate": 1.5350360665390843e-05, "loss": 0.0001, "step": 21850 }, { "epoch": 52.0, "grad_norm": 0.0006268495344556868, "learning_rate": 1.5331959369939644e-05, "loss": 0.0001, "step": 21860 }, { "epoch": 52.0, "grad_norm": 0.0034793647937476635, "learning_rate": 1.5313558074488447e-05, "loss": 0.0001, "step": 21870 }, { "epoch": 52.0, "grad_norm": 0.003257141914218664, "learning_rate": 1.5295156779037244e-05, "loss": 0.0026, "step": 21880 }, { "epoch": 52.0, "grad_norm": 0.0005939814727753401, "learning_rate": 1.5276755483586044e-05, "loss": 0.0513, "step": 21890 }, { "epoch": 52.0, "grad_norm": 0.0034212921746075153, "learning_rate": 1.5258354188134846e-05, "loss": 0.0003, "step": 21900 }, { "epoch": 52.0, "grad_norm": 0.00782698392868042, "learning_rate": 1.5239952892683646e-05, "loss": 0.0001, "step": 21910 }, { "epoch": 52.0, "grad_norm": 0.0007986134150996804, "learning_rate": 1.5221551597232444e-05, "loss": 0.0001, "step": 21920 }, { "epoch": 52.0, "grad_norm": 0.0010540427174419165, "learning_rate": 1.5203150301781246e-05, "loss": 0.0002, "step": 21930 }, { "epoch": 52.0, "grad_norm": 0.00037895364221185446, "learning_rate": 1.5184749006330046e-05, "loss": 0.0, "step": 21940 }, { "epoch": 52.0, "grad_norm": 0.0050917621701955795, "learning_rate": 1.5166347710878848e-05, "loss": 0.0002, "step": 21950 }, { "epoch": 52.0, "grad_norm": 0.017639679834246635, "learning_rate": 1.5147946415427646e-05, "loss": 0.0003, "step": 21960 }, { "epoch": 52.0, "grad_norm": 0.00042161368764936924, "learning_rate": 1.5129545119976446e-05, "loss": 0.0001, "step": 21970 }, { "epoch": 52.0, "grad_norm": 0.0010595995699986815, "learning_rate": 1.5111143824525248e-05, "loss": 0.0171, "step": 21980 }, { "epoch": 52.0, "grad_norm": 0.001517163822427392, "learning_rate": 1.5092742529074047e-05, "loss": 0.0002, "step": 21990 }, { "epoch": 52.01, "grad_norm": 0.00044204984442330897, "learning_rate": 1.5074341233622849e-05, "loss": 0.0649, "step": 22000 }, { "epoch": 52.01, "grad_norm": 0.022290529683232307, "learning_rate": 1.5055939938171649e-05, "loss": 0.0001, "step": 22010 }, { "epoch": 52.01, "grad_norm": 0.0006498958100564778, "learning_rate": 1.5037538642720447e-05, "loss": 0.0009, "step": 22020 }, { "epoch": 52.01, "grad_norm": 0.0012360989348962903, "learning_rate": 1.5019137347269249e-05, "loss": 0.0129, "step": 22030 }, { "epoch": 52.01, "grad_norm": 0.0009521116153337061, "learning_rate": 1.5000736051818049e-05, "loss": 0.0397, "step": 22040 }, { "epoch": 52.01, "grad_norm": 0.0004721345321740955, "learning_rate": 1.4982334756366847e-05, "loss": 0.0001, "step": 22050 }, { "epoch": 52.01, "grad_norm": 0.0019844111520797014, "learning_rate": 1.496393346091565e-05, "loss": 0.0, "step": 22060 }, { "epoch": 52.01, "grad_norm": 0.06269257515668869, "learning_rate": 1.494553216546445e-05, "loss": 0.0108, "step": 22070 }, { "epoch": 52.01, "grad_norm": 0.005493814591318369, "learning_rate": 1.4927130870013251e-05, "loss": 0.0689, "step": 22080 }, { "epoch": 52.01, "grad_norm": 0.020394032821059227, "learning_rate": 1.490872957456205e-05, "loss": 0.0001, "step": 22090 }, { "epoch": 52.01, "grad_norm": 0.0004452892462722957, "learning_rate": 1.489032827911085e-05, "loss": 0.0, "step": 22100 }, { "epoch": 52.01, "grad_norm": 0.005485246889293194, "learning_rate": 1.4871926983659651e-05, "loss": 0.0, "step": 22110 }, { "epoch": 52.01, "grad_norm": 0.000731699459720403, "learning_rate": 1.485352568820845e-05, "loss": 0.001, "step": 22120 }, { "epoch": 52.01, "grad_norm": 0.0008120546699501574, "learning_rate": 1.4835124392757252e-05, "loss": 0.0, "step": 22130 }, { "epoch": 52.01, "grad_norm": 0.0005848497967235744, "learning_rate": 1.4816723097306052e-05, "loss": 0.0003, "step": 22140 }, { "epoch": 52.01, "grad_norm": 0.0952475443482399, "learning_rate": 1.479832180185485e-05, "loss": 0.0526, "step": 22150 }, { "epoch": 52.01, "grad_norm": 0.004363304935395718, "learning_rate": 1.4779920506403652e-05, "loss": 0.0788, "step": 22160 }, { "epoch": 52.01, "grad_norm": 0.0003654547908809036, "learning_rate": 1.4761519210952452e-05, "loss": 0.0, "step": 22170 }, { "epoch": 52.01, "grad_norm": 0.001634811982512474, "learning_rate": 1.4743117915501254e-05, "loss": 0.0118, "step": 22180 }, { "epoch": 52.01, "grad_norm": 0.00047340861055999994, "learning_rate": 1.4724716620050052e-05, "loss": 0.016, "step": 22190 }, { "epoch": 52.01, "grad_norm": 0.0008979640551842749, "learning_rate": 1.470631532459885e-05, "loss": 0.0001, "step": 22200 }, { "epoch": 52.01, "grad_norm": 0.0005534207448363304, "learning_rate": 1.4687914029147654e-05, "loss": 0.0004, "step": 22210 }, { "epoch": 52.01, "grad_norm": 0.0010612070327624679, "learning_rate": 1.4669512733696453e-05, "loss": 0.001, "step": 22220 }, { "epoch": 52.01, "grad_norm": 0.0008421811508014798, "learning_rate": 1.4651111438245251e-05, "loss": 0.0149, "step": 22230 }, { "epoch": 52.01, "grad_norm": 0.0024608599487692118, "learning_rate": 1.4632710142794053e-05, "loss": 0.0302, "step": 22240 }, { "epoch": 52.01, "grad_norm": 0.0005315292510204017, "learning_rate": 1.4614308847342853e-05, "loss": 0.0282, "step": 22250 }, { "epoch": 52.01, "grad_norm": 0.0015135396970435977, "learning_rate": 1.4595907551891655e-05, "loss": 0.065, "step": 22260 }, { "epoch": 52.01, "eval_accuracy": 0.7556306306306306, "eval_loss": 1.6631345748901367, "eval_runtime": 41.1878, "eval_samples_per_second": 21.56, "eval_steps_per_second": 1.797, "step": 22260 }, { "epoch": 53.0, "grad_norm": 0.001163458451628685, "learning_rate": 1.4577506256440453e-05, "loss": 0.0001, "step": 22270 }, { "epoch": 53.0, "grad_norm": 0.0030114694964140654, "learning_rate": 1.4559104960989253e-05, "loss": 0.0023, "step": 22280 }, { "epoch": 53.0, "grad_norm": 0.007465164177119732, "learning_rate": 1.4540703665538055e-05, "loss": 0.0678, "step": 22290 }, { "epoch": 53.0, "grad_norm": 0.001718674087896943, "learning_rate": 1.4522302370086854e-05, "loss": 0.0006, "step": 22300 }, { "epoch": 53.0, "grad_norm": 0.08228703588247299, "learning_rate": 1.4503901074635655e-05, "loss": 0.0093, "step": 22310 }, { "epoch": 53.0, "grad_norm": 0.00858758483082056, "learning_rate": 1.4485499779184456e-05, "loss": 0.0002, "step": 22320 }, { "epoch": 53.0, "grad_norm": 36.46480178833008, "learning_rate": 1.4467098483733254e-05, "loss": 0.0389, "step": 22330 }, { "epoch": 53.0, "grad_norm": 0.0019482868956401944, "learning_rate": 1.4448697188282056e-05, "loss": 0.0009, "step": 22340 }, { "epoch": 53.0, "grad_norm": 0.020813103765249252, "learning_rate": 1.4430295892830856e-05, "loss": 0.0001, "step": 22350 }, { "epoch": 53.0, "grad_norm": 0.09965714067220688, "learning_rate": 1.4411894597379654e-05, "loss": 0.051, "step": 22360 }, { "epoch": 53.0, "grad_norm": 0.0007514178869314492, "learning_rate": 1.4393493301928456e-05, "loss": 0.0001, "step": 22370 }, { "epoch": 53.0, "grad_norm": 0.0009033564128912985, "learning_rate": 1.4375092006477256e-05, "loss": 0.0484, "step": 22380 }, { "epoch": 53.0, "grad_norm": 0.0006943594198673964, "learning_rate": 1.4356690711026058e-05, "loss": 0.0002, "step": 22390 }, { "epoch": 53.0, "grad_norm": 0.003571439301595092, "learning_rate": 1.4338289415574857e-05, "loss": 0.0001, "step": 22400 }, { "epoch": 53.0, "grad_norm": 0.06675244867801666, "learning_rate": 1.4319888120123657e-05, "loss": 0.0632, "step": 22410 }, { "epoch": 53.01, "grad_norm": 13.023971557617188, "learning_rate": 1.4301486824672458e-05, "loss": 0.0606, "step": 22420 }, { "epoch": 53.01, "grad_norm": 0.3222871720790863, "learning_rate": 1.4283085529221257e-05, "loss": 0.1178, "step": 22430 }, { "epoch": 53.01, "grad_norm": 0.0009562097257003188, "learning_rate": 1.4264684233770059e-05, "loss": 0.0001, "step": 22440 }, { "epoch": 53.01, "grad_norm": 0.06681627035140991, "learning_rate": 1.4246282938318859e-05, "loss": 0.0311, "step": 22450 }, { "epoch": 53.01, "grad_norm": 0.017502538859844208, "learning_rate": 1.4227881642867657e-05, "loss": 0.006, "step": 22460 }, { "epoch": 53.01, "grad_norm": 0.0009717533830553293, "learning_rate": 1.4209480347416459e-05, "loss": 0.0001, "step": 22470 }, { "epoch": 53.01, "grad_norm": 0.0013100715586915612, "learning_rate": 1.4191079051965259e-05, "loss": 0.0001, "step": 22480 }, { "epoch": 53.01, "grad_norm": 0.08804736286401749, "learning_rate": 1.4172677756514061e-05, "loss": 0.0001, "step": 22490 }, { "epoch": 53.01, "grad_norm": 0.06496791541576385, "learning_rate": 1.415427646106286e-05, "loss": 0.0004, "step": 22500 }, { "epoch": 53.01, "grad_norm": 0.008714644238352776, "learning_rate": 1.413587516561166e-05, "loss": 0.0585, "step": 22510 }, { "epoch": 53.01, "grad_norm": 0.0048969946801662445, "learning_rate": 1.4117473870160461e-05, "loss": 0.0001, "step": 22520 }, { "epoch": 53.01, "grad_norm": 0.006837640888988972, "learning_rate": 1.409907257470926e-05, "loss": 0.0003, "step": 22530 }, { "epoch": 53.01, "grad_norm": 0.004497607238590717, "learning_rate": 1.408067127925806e-05, "loss": 0.0001, "step": 22540 }, { "epoch": 53.01, "grad_norm": 0.0015444208402186632, "learning_rate": 1.4062269983806862e-05, "loss": 0.0004, "step": 22550 }, { "epoch": 53.01, "grad_norm": 0.002304267603904009, "learning_rate": 1.404386868835566e-05, "loss": 0.0095, "step": 22560 }, { "epoch": 53.01, "grad_norm": 0.0013628543820232153, "learning_rate": 1.4025467392904462e-05, "loss": 0.0001, "step": 22570 }, { "epoch": 53.01, "grad_norm": 0.0009183231741189957, "learning_rate": 1.4007066097453262e-05, "loss": 0.0001, "step": 22580 }, { "epoch": 53.01, "grad_norm": 0.0007543744286522269, "learning_rate": 1.398866480200206e-05, "loss": 0.0242, "step": 22590 }, { "epoch": 53.01, "grad_norm": 0.056899409741163254, "learning_rate": 1.3970263506550862e-05, "loss": 0.0001, "step": 22600 }, { "epoch": 53.01, "grad_norm": 0.002434780355542898, "learning_rate": 1.3951862211099662e-05, "loss": 0.0079, "step": 22610 }, { "epoch": 53.01, "grad_norm": 0.0021614390425384045, "learning_rate": 1.3933460915648464e-05, "loss": 0.0891, "step": 22620 }, { "epoch": 53.01, "grad_norm": 0.00045263092033565044, "learning_rate": 1.3915059620197263e-05, "loss": 0.0678, "step": 22630 }, { "epoch": 53.01, "grad_norm": 0.0013739608693867922, "learning_rate": 1.3896658324746063e-05, "loss": 0.0743, "step": 22640 }, { "epoch": 53.01, "grad_norm": 0.002203054493293166, "learning_rate": 1.3878257029294864e-05, "loss": 0.0001, "step": 22650 }, { "epoch": 53.01, "grad_norm": 1.5518015623092651, "learning_rate": 1.3859855733843663e-05, "loss": 0.0007, "step": 22660 }, { "epoch": 53.01, "grad_norm": 0.17362336814403534, "learning_rate": 1.3841454438392461e-05, "loss": 0.043, "step": 22670 }, { "epoch": 53.01, "grad_norm": 0.0003964619245380163, "learning_rate": 1.3823053142941265e-05, "loss": 0.0424, "step": 22680 }, { "epoch": 53.01, "eval_accuracy": 0.7398648648648649, "eval_loss": 1.9176682233810425, "eval_runtime": 40.6837, "eval_samples_per_second": 21.827, "eval_steps_per_second": 1.819, "step": 22680 }, { "epoch": 54.0, "grad_norm": 0.006115254946053028, "learning_rate": 1.3804651847490063e-05, "loss": 0.0733, "step": 22690 }, { "epoch": 54.0, "grad_norm": 0.0023755570873618126, "learning_rate": 1.3786250552038865e-05, "loss": 0.0001, "step": 22700 }, { "epoch": 54.0, "grad_norm": 0.003317100927233696, "learning_rate": 1.3767849256587663e-05, "loss": 0.001, "step": 22710 }, { "epoch": 54.0, "grad_norm": 0.0010947121772915125, "learning_rate": 1.3749447961136464e-05, "loss": 0.0031, "step": 22720 }, { "epoch": 54.0, "grad_norm": 0.0013286080211400986, "learning_rate": 1.3731046665685265e-05, "loss": 0.0001, "step": 22730 }, { "epoch": 54.0, "grad_norm": 11.449026107788086, "learning_rate": 1.3712645370234064e-05, "loss": 0.0022, "step": 22740 }, { "epoch": 54.0, "grad_norm": 0.0035011095460504293, "learning_rate": 1.3694244074782867e-05, "loss": 0.0001, "step": 22750 }, { "epoch": 54.0, "grad_norm": 0.00158277852460742, "learning_rate": 1.3675842779331666e-05, "loss": 0.0001, "step": 22760 }, { "epoch": 54.0, "grad_norm": 0.010587048716843128, "learning_rate": 1.3657441483880464e-05, "loss": 0.0001, "step": 22770 }, { "epoch": 54.0, "grad_norm": 0.00178767298348248, "learning_rate": 1.3639040188429266e-05, "loss": 0.0008, "step": 22780 }, { "epoch": 54.0, "grad_norm": 0.002173611195757985, "learning_rate": 1.3620638892978066e-05, "loss": 0.0001, "step": 22790 }, { "epoch": 54.0, "grad_norm": 0.000830679084174335, "learning_rate": 1.3602237597526868e-05, "loss": 0.0161, "step": 22800 }, { "epoch": 54.0, "grad_norm": 0.00802791677415371, "learning_rate": 1.3583836302075666e-05, "loss": 0.0515, "step": 22810 }, { "epoch": 54.0, "grad_norm": 0.006544044241309166, "learning_rate": 1.3565435006624466e-05, "loss": 0.0001, "step": 22820 }, { "epoch": 54.0, "grad_norm": 0.0021956958808004856, "learning_rate": 1.3547033711173268e-05, "loss": 0.0025, "step": 22830 }, { "epoch": 54.01, "grad_norm": 0.004467161372303963, "learning_rate": 1.3528632415722067e-05, "loss": 0.0001, "step": 22840 }, { "epoch": 54.01, "grad_norm": 0.0011982051655650139, "learning_rate": 1.3510231120270867e-05, "loss": 0.0354, "step": 22850 }, { "epoch": 54.01, "grad_norm": 0.0005390816950239241, "learning_rate": 1.3491829824819669e-05, "loss": 0.0001, "step": 22860 }, { "epoch": 54.01, "grad_norm": 0.0041159712709486485, "learning_rate": 1.3473428529368467e-05, "loss": 0.0001, "step": 22870 }, { "epoch": 54.01, "grad_norm": 0.004346021916717291, "learning_rate": 1.3455027233917269e-05, "loss": 0.0265, "step": 22880 }, { "epoch": 54.01, "grad_norm": 0.0595536045730114, "learning_rate": 1.3436625938466069e-05, "loss": 0.0039, "step": 22890 }, { "epoch": 54.01, "grad_norm": 0.005771622993052006, "learning_rate": 1.3418224643014867e-05, "loss": 0.0001, "step": 22900 }, { "epoch": 54.01, "grad_norm": 0.2933156490325928, "learning_rate": 1.339982334756367e-05, "loss": 0.0003, "step": 22910 }, { "epoch": 54.01, "grad_norm": 0.0017350377747789025, "learning_rate": 1.338142205211247e-05, "loss": 0.0005, "step": 22920 }, { "epoch": 54.01, "grad_norm": 0.0007483300869353116, "learning_rate": 1.3363020756661271e-05, "loss": 0.0001, "step": 22930 }, { "epoch": 54.01, "grad_norm": 0.0008262667688541114, "learning_rate": 1.334461946121007e-05, "loss": 0.0001, "step": 22940 }, { "epoch": 54.01, "grad_norm": 0.002304819645360112, "learning_rate": 1.332621816575887e-05, "loss": 0.0001, "step": 22950 }, { "epoch": 54.01, "grad_norm": 0.0025526960380375385, "learning_rate": 1.3307816870307671e-05, "loss": 0.0565, "step": 22960 }, { "epoch": 54.01, "grad_norm": 0.00723969517275691, "learning_rate": 1.328941557485647e-05, "loss": 0.0648, "step": 22970 }, { "epoch": 54.01, "grad_norm": 0.006033417768776417, "learning_rate": 1.327101427940527e-05, "loss": 0.0002, "step": 22980 }, { "epoch": 54.01, "grad_norm": 0.0007365961209870875, "learning_rate": 1.3252612983954072e-05, "loss": 0.0001, "step": 22990 }, { "epoch": 54.01, "grad_norm": 0.013198223896324635, "learning_rate": 1.323421168850287e-05, "loss": 0.0001, "step": 23000 }, { "epoch": 54.01, "grad_norm": 0.0013402728363871574, "learning_rate": 1.3215810393051672e-05, "loss": 0.0004, "step": 23010 }, { "epoch": 54.01, "grad_norm": 0.015665873885154724, "learning_rate": 1.3197409097600472e-05, "loss": 0.0001, "step": 23020 }, { "epoch": 54.01, "grad_norm": 0.0020574575755745173, "learning_rate": 1.317900780214927e-05, "loss": 0.0025, "step": 23030 }, { "epoch": 54.01, "grad_norm": 0.004725297912955284, "learning_rate": 1.3160606506698072e-05, "loss": 0.1044, "step": 23040 }, { "epoch": 54.01, "grad_norm": 0.0010245188605040312, "learning_rate": 1.3142205211246873e-05, "loss": 0.0001, "step": 23050 }, { "epoch": 54.01, "grad_norm": 35.485294342041016, "learning_rate": 1.3123803915795674e-05, "loss": 0.0717, "step": 23060 }, { "epoch": 54.01, "grad_norm": 0.0023749656975269318, "learning_rate": 1.3105402620344473e-05, "loss": 0.0001, "step": 23070 }, { "epoch": 54.01, "grad_norm": 0.028252527117729187, "learning_rate": 1.3087001324893273e-05, "loss": 0.0001, "step": 23080 }, { "epoch": 54.01, "grad_norm": 0.0029571724589914083, "learning_rate": 1.3068600029442075e-05, "loss": 0.0007, "step": 23090 }, { "epoch": 54.01, "grad_norm": 0.0011734621366485953, "learning_rate": 1.3050198733990873e-05, "loss": 0.0001, "step": 23100 }, { "epoch": 54.01, "eval_accuracy": 0.7364864864864865, "eval_loss": 1.8990436792373657, "eval_runtime": 39.9552, "eval_samples_per_second": 22.225, "eval_steps_per_second": 1.852, "step": 23100 }, { "epoch": 55.0, "grad_norm": 0.0017267257208004594, "learning_rate": 1.3031797438539675e-05, "loss": 0.0001, "step": 23110 }, { "epoch": 55.0, "grad_norm": 0.004545911680907011, "learning_rate": 1.3013396143088475e-05, "loss": 0.0342, "step": 23120 }, { "epoch": 55.0, "grad_norm": 0.02509649097919464, "learning_rate": 1.2994994847637273e-05, "loss": 0.0001, "step": 23130 }, { "epoch": 55.0, "grad_norm": 5.717916011810303, "learning_rate": 1.2976593552186075e-05, "loss": 0.0023, "step": 23140 }, { "epoch": 55.0, "grad_norm": 0.005418738350272179, "learning_rate": 1.2958192256734875e-05, "loss": 0.0001, "step": 23150 }, { "epoch": 55.0, "grad_norm": 0.0012522684410214424, "learning_rate": 1.2939790961283674e-05, "loss": 0.0001, "step": 23160 }, { "epoch": 55.0, "grad_norm": 0.003435043152421713, "learning_rate": 1.2921389665832476e-05, "loss": 0.015, "step": 23170 }, { "epoch": 55.0, "grad_norm": 0.0026250167284160852, "learning_rate": 1.2902988370381274e-05, "loss": 0.0016, "step": 23180 }, { "epoch": 55.0, "grad_norm": 0.00528159411624074, "learning_rate": 1.2884587074930078e-05, "loss": 0.0001, "step": 23190 }, { "epoch": 55.0, "grad_norm": 0.002424650127068162, "learning_rate": 1.2866185779478876e-05, "loss": 0.0, "step": 23200 }, { "epoch": 55.0, "grad_norm": 0.0007755476981401443, "learning_rate": 1.2847784484027674e-05, "loss": 0.0, "step": 23210 }, { "epoch": 55.0, "grad_norm": 0.0006827607867307961, "learning_rate": 1.2829383188576478e-05, "loss": 0.0001, "step": 23220 }, { "epoch": 55.0, "grad_norm": 0.0006369174807332456, "learning_rate": 1.2810981893125276e-05, "loss": 0.0001, "step": 23230 }, { "epoch": 55.0, "grad_norm": 0.0015232323203235865, "learning_rate": 1.2792580597674078e-05, "loss": 0.0001, "step": 23240 }, { "epoch": 55.0, "grad_norm": 0.0009344415739178658, "learning_rate": 1.2774179302222877e-05, "loss": 0.0, "step": 23250 }, { "epoch": 55.01, "grad_norm": 0.0012839095434173942, "learning_rate": 1.2755778006771677e-05, "loss": 0.0001, "step": 23260 }, { "epoch": 55.01, "grad_norm": 0.0006341927219182253, "learning_rate": 1.2737376711320478e-05, "loss": 0.0423, "step": 23270 }, { "epoch": 55.01, "grad_norm": 0.008475244976580143, "learning_rate": 1.2718975415869277e-05, "loss": 0.0001, "step": 23280 }, { "epoch": 55.01, "grad_norm": 0.0011694369604811072, "learning_rate": 1.2700574120418077e-05, "loss": 0.0558, "step": 23290 }, { "epoch": 55.01, "grad_norm": 1.4542289972305298, "learning_rate": 1.2682172824966879e-05, "loss": 0.0043, "step": 23300 }, { "epoch": 55.01, "grad_norm": 0.0018425858579576015, "learning_rate": 1.2663771529515677e-05, "loss": 0.0001, "step": 23310 }, { "epoch": 55.01, "grad_norm": 0.0018493493553251028, "learning_rate": 1.2645370234064479e-05, "loss": 0.0001, "step": 23320 }, { "epoch": 55.01, "grad_norm": 0.0013431626139208674, "learning_rate": 1.2626968938613279e-05, "loss": 0.0002, "step": 23330 }, { "epoch": 55.01, "grad_norm": 0.0005847708671353757, "learning_rate": 1.2608567643162078e-05, "loss": 0.0001, "step": 23340 }, { "epoch": 55.01, "grad_norm": 0.0038386487867683172, "learning_rate": 1.259016634771088e-05, "loss": 0.0, "step": 23350 }, { "epoch": 55.01, "grad_norm": 0.0013865749351680279, "learning_rate": 1.257176505225968e-05, "loss": 0.0, "step": 23360 }, { "epoch": 55.01, "grad_norm": 0.0010828847298398614, "learning_rate": 1.2553363756808481e-05, "loss": 0.0661, "step": 23370 }, { "epoch": 55.01, "grad_norm": 0.0009535017306916416, "learning_rate": 1.253496246135728e-05, "loss": 0.0006, "step": 23380 }, { "epoch": 55.01, "grad_norm": 0.0013523722300305963, "learning_rate": 1.251656116590608e-05, "loss": 0.0, "step": 23390 }, { "epoch": 55.01, "grad_norm": 0.0022847868967801332, "learning_rate": 1.249815987045488e-05, "loss": 0.0, "step": 23400 }, { "epoch": 55.01, "grad_norm": 0.0003955428546760231, "learning_rate": 1.247975857500368e-05, "loss": 0.0, "step": 23410 }, { "epoch": 55.01, "grad_norm": 0.0007086016703397036, "learning_rate": 1.2461357279552482e-05, "loss": 0.0, "step": 23420 }, { "epoch": 55.01, "grad_norm": 0.0007778684375807643, "learning_rate": 1.2442955984101282e-05, "loss": 0.0001, "step": 23430 }, { "epoch": 55.01, "grad_norm": 0.0017607809277251363, "learning_rate": 1.2424554688650082e-05, "loss": 0.0001, "step": 23440 }, { "epoch": 55.01, "grad_norm": 0.1019691601395607, "learning_rate": 1.240615339319888e-05, "loss": 0.0226, "step": 23450 }, { "epoch": 55.01, "grad_norm": 0.0013777402928099036, "learning_rate": 1.2387752097747682e-05, "loss": 0.0, "step": 23460 }, { "epoch": 55.01, "grad_norm": 0.0017346058739349246, "learning_rate": 1.2369350802296482e-05, "loss": 0.0102, "step": 23470 }, { "epoch": 55.01, "grad_norm": 0.0004230730119161308, "learning_rate": 1.2350949506845283e-05, "loss": 0.0, "step": 23480 }, { "epoch": 55.01, "grad_norm": 0.00860625971108675, "learning_rate": 1.2332548211394083e-05, "loss": 0.0013, "step": 23490 }, { "epoch": 55.01, "grad_norm": 0.0013665214646607637, "learning_rate": 1.2314146915942883e-05, "loss": 0.0077, "step": 23500 }, { "epoch": 55.01, "grad_norm": 0.0007249111076816916, "learning_rate": 1.2295745620491683e-05, "loss": 0.0002, "step": 23510 }, { "epoch": 55.01, "grad_norm": 0.00032015485339798033, "learning_rate": 1.2277344325040483e-05, "loss": 0.0, "step": 23520 }, { "epoch": 55.01, "eval_accuracy": 0.7454954954954955, "eval_loss": 2.0621790885925293, "eval_runtime": 39.7983, "eval_samples_per_second": 22.312, "eval_steps_per_second": 1.859, "step": 23520 }, { "epoch": 56.0, "grad_norm": 0.0004462806973606348, "learning_rate": 1.2258943029589285e-05, "loss": 0.0016, "step": 23530 }, { "epoch": 56.0, "grad_norm": 3.490710735321045, "learning_rate": 1.2240541734138083e-05, "loss": 0.0054, "step": 23540 }, { "epoch": 56.0, "grad_norm": 0.0019791782833635807, "learning_rate": 1.2222140438686883e-05, "loss": 0.1066, "step": 23550 }, { "epoch": 56.0, "grad_norm": 0.010116740129888058, "learning_rate": 1.2203739143235685e-05, "loss": 0.0, "step": 23560 }, { "epoch": 56.0, "grad_norm": 0.0034511485137045383, "learning_rate": 1.2185337847784485e-05, "loss": 0.0488, "step": 23570 }, { "epoch": 56.0, "grad_norm": 0.01684543490409851, "learning_rate": 1.2166936552333284e-05, "loss": 0.0001, "step": 23580 }, { "epoch": 56.0, "grad_norm": 0.0020218139979988337, "learning_rate": 1.2148535256882086e-05, "loss": 0.0001, "step": 23590 }, { "epoch": 56.0, "grad_norm": 0.0003876470436807722, "learning_rate": 1.2130133961430886e-05, "loss": 0.0001, "step": 23600 }, { "epoch": 56.0, "grad_norm": 0.0014865519478917122, "learning_rate": 1.2111732665979686e-05, "loss": 0.0009, "step": 23610 }, { "epoch": 56.0, "grad_norm": 0.0008013385813683271, "learning_rate": 1.2093331370528486e-05, "loss": 0.0, "step": 23620 }, { "epoch": 56.0, "grad_norm": 0.0025243335403501987, "learning_rate": 1.2074930075077286e-05, "loss": 0.0714, "step": 23630 }, { "epoch": 56.0, "grad_norm": 0.001431646873243153, "learning_rate": 1.2056528779626086e-05, "loss": 0.0071, "step": 23640 }, { "epoch": 56.0, "grad_norm": 0.0007202305714599788, "learning_rate": 1.2038127484174886e-05, "loss": 0.0287, "step": 23650 }, { "epoch": 56.0, "grad_norm": 0.0022084242664277554, "learning_rate": 1.2019726188723688e-05, "loss": 0.0, "step": 23660 }, { "epoch": 56.0, "grad_norm": 0.0018906533950939775, "learning_rate": 1.2001324893272486e-05, "loss": 0.0, "step": 23670 }, { "epoch": 56.01, "grad_norm": 0.012800070457160473, "learning_rate": 1.1982923597821287e-05, "loss": 0.0572, "step": 23680 }, { "epoch": 56.01, "grad_norm": 5.582653045654297, "learning_rate": 1.1964522302370088e-05, "loss": 0.0847, "step": 23690 }, { "epoch": 56.01, "grad_norm": 0.0026636242400854826, "learning_rate": 1.1946121006918888e-05, "loss": 0.0001, "step": 23700 }, { "epoch": 56.01, "grad_norm": 0.0005984007730148733, "learning_rate": 1.1927719711467687e-05, "loss": 0.0399, "step": 23710 }, { "epoch": 56.01, "grad_norm": 0.0014047266449779272, "learning_rate": 1.1909318416016487e-05, "loss": 0.0411, "step": 23720 }, { "epoch": 56.01, "grad_norm": 0.00455590570345521, "learning_rate": 1.1890917120565289e-05, "loss": 0.0003, "step": 23730 }, { "epoch": 56.01, "grad_norm": 0.004561448935419321, "learning_rate": 1.1872515825114089e-05, "loss": 0.0682, "step": 23740 }, { "epoch": 56.01, "grad_norm": 0.004140935372561216, "learning_rate": 1.1854114529662889e-05, "loss": 0.0001, "step": 23750 }, { "epoch": 56.01, "grad_norm": 0.0024083659518510103, "learning_rate": 1.183571323421169e-05, "loss": 0.0001, "step": 23760 }, { "epoch": 56.01, "grad_norm": 0.001977574313059449, "learning_rate": 1.181731193876049e-05, "loss": 0.0001, "step": 23770 }, { "epoch": 56.01, "grad_norm": 0.004812993109226227, "learning_rate": 1.179891064330929e-05, "loss": 0.0003, "step": 23780 }, { "epoch": 56.01, "grad_norm": 0.0009600510820746422, "learning_rate": 1.178050934785809e-05, "loss": 0.0001, "step": 23790 }, { "epoch": 56.01, "grad_norm": 7.466368198394775, "learning_rate": 1.176210805240689e-05, "loss": 0.049, "step": 23800 }, { "epoch": 56.01, "grad_norm": 0.21662557125091553, "learning_rate": 1.174370675695569e-05, "loss": 0.0001, "step": 23810 }, { "epoch": 56.01, "grad_norm": 0.00443542143329978, "learning_rate": 1.172530546150449e-05, "loss": 0.0096, "step": 23820 }, { "epoch": 56.01, "grad_norm": 3.03646183013916, "learning_rate": 1.1706904166053292e-05, "loss": 0.0139, "step": 23830 }, { "epoch": 56.01, "grad_norm": 0.002869045827537775, "learning_rate": 1.1688502870602092e-05, "loss": 0.0, "step": 23840 }, { "epoch": 56.01, "grad_norm": 0.0007343690376728773, "learning_rate": 1.167010157515089e-05, "loss": 0.0001, "step": 23850 }, { "epoch": 56.01, "grad_norm": 0.13527420163154602, "learning_rate": 1.1651700279699692e-05, "loss": 0.0017, "step": 23860 }, { "epoch": 56.01, "grad_norm": 0.002437218092381954, "learning_rate": 1.1633298984248492e-05, "loss": 0.0251, "step": 23870 }, { "epoch": 56.01, "grad_norm": 0.013060529716312885, "learning_rate": 1.1614897688797292e-05, "loss": 0.0001, "step": 23880 }, { "epoch": 56.01, "grad_norm": 0.0010747129563242197, "learning_rate": 1.1596496393346092e-05, "loss": 0.0001, "step": 23890 }, { "epoch": 56.01, "grad_norm": 0.018001865595579147, "learning_rate": 1.1578095097894892e-05, "loss": 0.0772, "step": 23900 }, { "epoch": 56.01, "grad_norm": 3.493405342102051, "learning_rate": 1.1559693802443693e-05, "loss": 0.1437, "step": 23910 }, { "epoch": 56.01, "grad_norm": 0.05796351283788681, "learning_rate": 1.1541292506992493e-05, "loss": 0.0068, "step": 23920 }, { "epoch": 56.01, "grad_norm": 45.330631256103516, "learning_rate": 1.1522891211541295e-05, "loss": 0.0343, "step": 23930 }, { "epoch": 56.01, "grad_norm": 0.00885185319930315, "learning_rate": 1.1504489916090093e-05, "loss": 0.0582, "step": 23940 }, { "epoch": 56.01, "eval_accuracy": 0.7443693693693694, "eval_loss": 1.4820666313171387, "eval_runtime": 39.7674, "eval_samples_per_second": 22.33, "eval_steps_per_second": 1.861, "step": 23940 }, { "epoch": 57.0, "grad_norm": 0.0019421263132244349, "learning_rate": 1.1486088620638893e-05, "loss": 0.0004, "step": 23950 }, { "epoch": 57.0, "grad_norm": 0.00982197280973196, "learning_rate": 1.1467687325187693e-05, "loss": 0.0493, "step": 23960 }, { "epoch": 57.0, "grad_norm": 0.048929549753665924, "learning_rate": 1.1449286029736495e-05, "loss": 0.0003, "step": 23970 }, { "epoch": 57.0, "grad_norm": 0.0012740027159452438, "learning_rate": 1.1430884734285293e-05, "loss": 0.0038, "step": 23980 }, { "epoch": 57.0, "grad_norm": 0.004289202857762575, "learning_rate": 1.1412483438834094e-05, "loss": 0.0031, "step": 23990 }, { "epoch": 57.0, "grad_norm": 0.0018165657529607415, "learning_rate": 1.1394082143382895e-05, "loss": 0.0002, "step": 24000 }, { "epoch": 57.0, "grad_norm": 0.021423837170004845, "learning_rate": 1.1375680847931695e-05, "loss": 0.0066, "step": 24010 }, { "epoch": 57.0, "grad_norm": 0.01438127364963293, "learning_rate": 1.1357279552480494e-05, "loss": 0.0136, "step": 24020 }, { "epoch": 57.0, "grad_norm": 0.0032948721200227737, "learning_rate": 1.1338878257029296e-05, "loss": 0.0002, "step": 24030 }, { "epoch": 57.0, "grad_norm": 22.448226928710938, "learning_rate": 1.1320476961578096e-05, "loss": 0.0026, "step": 24040 }, { "epoch": 57.0, "grad_norm": 0.0013019995531067252, "learning_rate": 1.1302075666126896e-05, "loss": 0.0001, "step": 24050 }, { "epoch": 57.0, "grad_norm": 0.009402398020029068, "learning_rate": 1.1283674370675696e-05, "loss": 0.0004, "step": 24060 }, { "epoch": 57.0, "grad_norm": 0.005693600047379732, "learning_rate": 1.1265273075224496e-05, "loss": 0.0001, "step": 24070 }, { "epoch": 57.0, "grad_norm": 0.0012536332942545414, "learning_rate": 1.1246871779773296e-05, "loss": 0.0002, "step": 24080 }, { "epoch": 57.0, "grad_norm": 0.0019443167839199305, "learning_rate": 1.1228470484322096e-05, "loss": 0.0101, "step": 24090 }, { "epoch": 57.01, "grad_norm": 0.006146451458334923, "learning_rate": 1.1210069188870898e-05, "loss": 0.0001, "step": 24100 }, { "epoch": 57.01, "grad_norm": 31.494211196899414, "learning_rate": 1.1191667893419697e-05, "loss": 0.0022, "step": 24110 }, { "epoch": 57.01, "grad_norm": 0.0124431187286973, "learning_rate": 1.1173266597968497e-05, "loss": 0.0002, "step": 24120 }, { "epoch": 57.01, "grad_norm": 0.0027170858811587095, "learning_rate": 1.1154865302517299e-05, "loss": 0.042, "step": 24130 }, { "epoch": 57.01, "grad_norm": 63.92512512207031, "learning_rate": 1.1136464007066099e-05, "loss": 0.0493, "step": 24140 }, { "epoch": 57.01, "grad_norm": 0.0034329602494835854, "learning_rate": 1.1118062711614899e-05, "loss": 0.006, "step": 24150 }, { "epoch": 57.01, "grad_norm": 0.0025639557279646397, "learning_rate": 1.1099661416163699e-05, "loss": 0.0001, "step": 24160 }, { "epoch": 57.01, "grad_norm": 0.0006689762230962515, "learning_rate": 1.1081260120712499e-05, "loss": 0.0002, "step": 24170 }, { "epoch": 57.01, "grad_norm": 0.0014391548465937376, "learning_rate": 1.1062858825261299e-05, "loss": 0.0021, "step": 24180 }, { "epoch": 57.01, "grad_norm": 0.0008516352972947061, "learning_rate": 1.10444575298101e-05, "loss": 0.0001, "step": 24190 }, { "epoch": 57.01, "grad_norm": 0.04408324137330055, "learning_rate": 1.10260562343589e-05, "loss": 0.0422, "step": 24200 }, { "epoch": 57.01, "grad_norm": 0.008414952084422112, "learning_rate": 1.10076549389077e-05, "loss": 0.0002, "step": 24210 }, { "epoch": 57.01, "grad_norm": 0.002066493732854724, "learning_rate": 1.09892536434565e-05, "loss": 0.0007, "step": 24220 }, { "epoch": 57.01, "grad_norm": 0.12472962588071823, "learning_rate": 1.09708523480053e-05, "loss": 0.0184, "step": 24230 }, { "epoch": 57.01, "grad_norm": 0.0016410744283348322, "learning_rate": 1.09524510525541e-05, "loss": 0.001, "step": 24240 }, { "epoch": 57.01, "grad_norm": 0.0014384149108082056, "learning_rate": 1.09340497571029e-05, "loss": 0.0001, "step": 24250 }, { "epoch": 57.01, "grad_norm": 0.018821023404598236, "learning_rate": 1.09156484616517e-05, "loss": 0.0551, "step": 24260 }, { "epoch": 57.01, "grad_norm": 0.0015349116874858737, "learning_rate": 1.0897247166200502e-05, "loss": 0.0116, "step": 24270 }, { "epoch": 57.01, "grad_norm": 0.009763521142303944, "learning_rate": 1.0878845870749302e-05, "loss": 0.0647, "step": 24280 }, { "epoch": 57.01, "grad_norm": 0.0007470657583326101, "learning_rate": 1.08604445752981e-05, "loss": 0.0055, "step": 24290 }, { "epoch": 57.01, "grad_norm": 0.0014731376431882381, "learning_rate": 1.0842043279846902e-05, "loss": 0.0262, "step": 24300 }, { "epoch": 57.01, "grad_norm": 0.0022547373082488775, "learning_rate": 1.0823641984395702e-05, "loss": 0.001, "step": 24310 }, { "epoch": 57.01, "grad_norm": 0.0013335180701687932, "learning_rate": 1.0805240688944502e-05, "loss": 0.0001, "step": 24320 }, { "epoch": 57.01, "grad_norm": 0.0014102818677201867, "learning_rate": 1.0786839393493303e-05, "loss": 0.0001, "step": 24330 }, { "epoch": 57.01, "grad_norm": 0.002224271185696125, "learning_rate": 1.0768438098042103e-05, "loss": 0.0001, "step": 24340 }, { "epoch": 57.01, "grad_norm": 0.0031190093141049147, "learning_rate": 1.0750036802590903e-05, "loss": 0.0001, "step": 24350 }, { "epoch": 57.01, "grad_norm": 0.0007708192570134997, "learning_rate": 1.0731635507139703e-05, "loss": 0.0001, "step": 24360 }, { "epoch": 57.01, "eval_accuracy": 0.7623873873873874, "eval_loss": 1.6253712177276611, "eval_runtime": 39.4401, "eval_samples_per_second": 22.515, "eval_steps_per_second": 1.876, "step": 24360 }, { "epoch": 58.0, "grad_norm": 0.001005275989882648, "learning_rate": 1.0713234211688505e-05, "loss": 0.0001, "step": 24370 }, { "epoch": 58.0, "grad_norm": 0.0007201577536761761, "learning_rate": 1.0694832916237303e-05, "loss": 0.0146, "step": 24380 }, { "epoch": 58.0, "grad_norm": 0.0415475107729435, "learning_rate": 1.0676431620786103e-05, "loss": 0.0001, "step": 24390 }, { "epoch": 58.0, "grad_norm": 0.000509662670083344, "learning_rate": 1.0658030325334905e-05, "loss": 0.0001, "step": 24400 }, { "epoch": 58.0, "grad_norm": 0.0016784222098067403, "learning_rate": 1.0639629029883705e-05, "loss": 0.0061, "step": 24410 }, { "epoch": 58.0, "grad_norm": 0.001808426110073924, "learning_rate": 1.0621227734432504e-05, "loss": 0.0001, "step": 24420 }, { "epoch": 58.0, "grad_norm": 0.00614283187314868, "learning_rate": 1.0602826438981304e-05, "loss": 0.0001, "step": 24430 }, { "epoch": 58.0, "grad_norm": 0.0009809379698708653, "learning_rate": 1.0584425143530105e-05, "loss": 0.0002, "step": 24440 }, { "epoch": 58.0, "grad_norm": 0.0018786874134093523, "learning_rate": 1.0566023848078906e-05, "loss": 0.0001, "step": 24450 }, { "epoch": 58.0, "grad_norm": 0.0009892601519823074, "learning_rate": 1.0547622552627706e-05, "loss": 0.0, "step": 24460 }, { "epoch": 58.0, "grad_norm": 0.0009283177787438035, "learning_rate": 1.0529221257176506e-05, "loss": 0.0002, "step": 24470 }, { "epoch": 58.0, "grad_norm": 0.0007727140327915549, "learning_rate": 1.0510819961725306e-05, "loss": 0.0, "step": 24480 }, { "epoch": 58.0, "grad_norm": 0.0018267113482579589, "learning_rate": 1.0492418666274106e-05, "loss": 0.0025, "step": 24490 }, { "epoch": 58.0, "grad_norm": 0.0010749399662017822, "learning_rate": 1.0474017370822906e-05, "loss": 0.0435, "step": 24500 }, { "epoch": 58.0, "grad_norm": 0.0004598453233484179, "learning_rate": 1.0455616075371706e-05, "loss": 0.0001, "step": 24510 }, { "epoch": 58.01, "grad_norm": 0.0019546435214579105, "learning_rate": 1.0437214779920506e-05, "loss": 0.0067, "step": 24520 }, { "epoch": 58.01, "grad_norm": 0.0007498570485040545, "learning_rate": 1.0418813484469307e-05, "loss": 0.0001, "step": 24530 }, { "epoch": 58.01, "grad_norm": 0.0015781412366777658, "learning_rate": 1.0400412189018108e-05, "loss": 0.0277, "step": 24540 }, { "epoch": 58.01, "grad_norm": 0.5236406922340393, "learning_rate": 1.0382010893566907e-05, "loss": 0.0004, "step": 24550 }, { "epoch": 58.01, "grad_norm": 0.0012669709976762533, "learning_rate": 1.0363609598115707e-05, "loss": 0.0001, "step": 24560 }, { "epoch": 58.01, "grad_norm": 0.0012568996753543615, "learning_rate": 1.0345208302664509e-05, "loss": 0.0004, "step": 24570 }, { "epoch": 58.01, "grad_norm": 0.043358806520700455, "learning_rate": 1.0326807007213309e-05, "loss": 0.0001, "step": 24580 }, { "epoch": 58.01, "grad_norm": 0.0007607506704516709, "learning_rate": 1.0308405711762109e-05, "loss": 0.0003, "step": 24590 }, { "epoch": 58.01, "grad_norm": 0.0019133040914312005, "learning_rate": 1.0290004416310909e-05, "loss": 0.0002, "step": 24600 }, { "epoch": 58.01, "grad_norm": 0.0040087285451591015, "learning_rate": 1.0271603120859709e-05, "loss": 0.0001, "step": 24610 }, { "epoch": 58.01, "grad_norm": 0.005804694723337889, "learning_rate": 1.025320182540851e-05, "loss": 0.0, "step": 24620 }, { "epoch": 58.01, "grad_norm": 0.003029964864253998, "learning_rate": 1.023480052995731e-05, "loss": 0.0473, "step": 24630 }, { "epoch": 58.01, "grad_norm": 0.0011636598501354456, "learning_rate": 1.021639923450611e-05, "loss": 0.0001, "step": 24640 }, { "epoch": 58.01, "grad_norm": 0.002884934889152646, "learning_rate": 1.019799793905491e-05, "loss": 0.0001, "step": 24650 }, { "epoch": 58.01, "grad_norm": 0.11349088698625565, "learning_rate": 1.017959664360371e-05, "loss": 0.0125, "step": 24660 }, { "epoch": 58.01, "grad_norm": 0.002500980393961072, "learning_rate": 1.0161195348152512e-05, "loss": 0.0003, "step": 24670 }, { "epoch": 58.01, "grad_norm": 0.0008682305924594402, "learning_rate": 1.0142794052701312e-05, "loss": 0.0, "step": 24680 }, { "epoch": 58.01, "grad_norm": 0.014253217726945877, "learning_rate": 1.012439275725011e-05, "loss": 0.0001, "step": 24690 }, { "epoch": 58.01, "grad_norm": 0.0009429533965885639, "learning_rate": 1.010599146179891e-05, "loss": 0.077, "step": 24700 }, { "epoch": 58.01, "grad_norm": 0.0008992166840471327, "learning_rate": 1.0087590166347712e-05, "loss": 0.007, "step": 24710 }, { "epoch": 58.01, "grad_norm": 0.0009217716287821531, "learning_rate": 1.0069188870896512e-05, "loss": 0.0001, "step": 24720 }, { "epoch": 58.01, "grad_norm": 0.003018986666575074, "learning_rate": 1.005078757544531e-05, "loss": 0.0001, "step": 24730 }, { "epoch": 58.01, "grad_norm": 0.0009471423109062016, "learning_rate": 1.0032386279994112e-05, "loss": 0.0001, "step": 24740 }, { "epoch": 58.01, "grad_norm": 0.0012168257962912321, "learning_rate": 1.0013984984542912e-05, "loss": 0.0001, "step": 24750 }, { "epoch": 58.01, "grad_norm": 0.0008903819834813476, "learning_rate": 9.995583689091713e-06, "loss": 0.0001, "step": 24760 }, { "epoch": 58.01, "grad_norm": 5.247084617614746, "learning_rate": 9.977182393640513e-06, "loss": 0.0009, "step": 24770 }, { "epoch": 58.01, "grad_norm": 0.00031116019818000495, "learning_rate": 9.958781098189313e-06, "loss": 0.0, "step": 24780 }, { "epoch": 58.01, "eval_accuracy": 0.7545045045045045, "eval_loss": 1.8023662567138672, "eval_runtime": 39.4053, "eval_samples_per_second": 22.535, "eval_steps_per_second": 1.878, "step": 24780 }, { "epoch": 59.0, "grad_norm": 0.0064481995068490505, "learning_rate": 9.940379802738113e-06, "loss": 0.0, "step": 24790 }, { "epoch": 59.0, "grad_norm": 0.0012003867886960506, "learning_rate": 9.921978507286913e-06, "loss": 0.0001, "step": 24800 }, { "epoch": 59.0, "grad_norm": 0.000993055640719831, "learning_rate": 9.903577211835715e-06, "loss": 0.0, "step": 24810 }, { "epoch": 59.0, "grad_norm": 0.0007150658057071269, "learning_rate": 9.885175916384513e-06, "loss": 0.0, "step": 24820 }, { "epoch": 59.0, "grad_norm": 0.004784159362316132, "learning_rate": 9.866774620933313e-06, "loss": 0.0, "step": 24830 }, { "epoch": 59.0, "grad_norm": 0.0005838919896632433, "learning_rate": 9.848373325482115e-06, "loss": 0.0001, "step": 24840 }, { "epoch": 59.0, "grad_norm": 0.0009973630076274276, "learning_rate": 9.829972030030915e-06, "loss": 0.0, "step": 24850 }, { "epoch": 59.0, "grad_norm": 0.000762183393817395, "learning_rate": 9.811570734579714e-06, "loss": 0.0001, "step": 24860 }, { "epoch": 59.0, "grad_norm": 0.0015864388551563025, "learning_rate": 9.793169439128516e-06, "loss": 0.0499, "step": 24870 }, { "epoch": 59.0, "grad_norm": 0.0018210052512586117, "learning_rate": 9.774768143677316e-06, "loss": 0.0, "step": 24880 }, { "epoch": 59.0, "grad_norm": 0.012355053797364235, "learning_rate": 9.756366848226116e-06, "loss": 0.0058, "step": 24890 }, { "epoch": 59.0, "grad_norm": 0.0005313998553901911, "learning_rate": 9.737965552774916e-06, "loss": 0.0, "step": 24900 }, { "epoch": 59.0, "grad_norm": 0.0005420492379926145, "learning_rate": 9.719564257323716e-06, "loss": 0.0, "step": 24910 }, { "epoch": 59.0, "grad_norm": 0.0017370874993503094, "learning_rate": 9.701162961872516e-06, "loss": 0.1459, "step": 24920 }, { "epoch": 59.0, "grad_norm": 14.99138069152832, "learning_rate": 9.682761666421316e-06, "loss": 0.0016, "step": 24930 }, { "epoch": 59.01, "grad_norm": 0.0008930095937103033, "learning_rate": 9.664360370970118e-06, "loss": 0.0001, "step": 24940 }, { "epoch": 59.01, "grad_norm": 0.0012798807583749294, "learning_rate": 9.645959075518916e-06, "loss": 0.0001, "step": 24950 }, { "epoch": 59.01, "grad_norm": 38.88914108276367, "learning_rate": 9.627557780067717e-06, "loss": 0.0334, "step": 24960 }, { "epoch": 59.01, "grad_norm": 0.0011596691329032183, "learning_rate": 9.609156484616517e-06, "loss": 0.0001, "step": 24970 }, { "epoch": 59.01, "grad_norm": 0.1758793741464615, "learning_rate": 9.590755189165319e-06, "loss": 0.0187, "step": 24980 }, { "epoch": 59.01, "grad_norm": 0.003486826317384839, "learning_rate": 9.572353893714119e-06, "loss": 0.0001, "step": 24990 }, { "epoch": 59.01, "grad_norm": 1.0641165971755981, "learning_rate": 9.553952598262917e-06, "loss": 0.0181, "step": 25000 }, { "epoch": 59.01, "grad_norm": 0.6313084959983826, "learning_rate": 9.535551302811719e-06, "loss": 0.003, "step": 25010 }, { "epoch": 59.01, "grad_norm": 0.0015551378019154072, "learning_rate": 9.517150007360519e-06, "loss": 0.0001, "step": 25020 }, { "epoch": 59.01, "grad_norm": 0.0037916789297014475, "learning_rate": 9.498748711909319e-06, "loss": 0.0002, "step": 25030 }, { "epoch": 59.01, "grad_norm": 10.630950927734375, "learning_rate": 9.48034741645812e-06, "loss": 0.1094, "step": 25040 }, { "epoch": 59.01, "grad_norm": 0.002494827611371875, "learning_rate": 9.46194612100692e-06, "loss": 0.0464, "step": 25050 }, { "epoch": 59.01, "grad_norm": 0.0022337674163281918, "learning_rate": 9.44354482555572e-06, "loss": 0.0025, "step": 25060 }, { "epoch": 59.01, "grad_norm": 0.02816508710384369, "learning_rate": 9.42514353010452e-06, "loss": 0.0605, "step": 25070 }, { "epoch": 59.01, "grad_norm": 0.0023118199314922094, "learning_rate": 9.406742234653321e-06, "loss": 0.0056, "step": 25080 }, { "epoch": 59.01, "grad_norm": 0.17282597720623016, "learning_rate": 9.38834093920212e-06, "loss": 0.0029, "step": 25090 }, { "epoch": 59.01, "grad_norm": 0.007972006686031818, "learning_rate": 9.36993964375092e-06, "loss": 0.0001, "step": 25100 }, { "epoch": 59.01, "grad_norm": 0.0013798163272440434, "learning_rate": 9.351538348299722e-06, "loss": 0.0196, "step": 25110 }, { "epoch": 59.01, "grad_norm": 0.0026264681946486235, "learning_rate": 9.333137052848522e-06, "loss": 0.0004, "step": 25120 }, { "epoch": 59.01, "grad_norm": 0.0016933433944359422, "learning_rate": 9.31473575739732e-06, "loss": 0.001, "step": 25130 }, { "epoch": 59.01, "grad_norm": 0.0603969544172287, "learning_rate": 9.296334461946122e-06, "loss": 0.0002, "step": 25140 }, { "epoch": 59.01, "grad_norm": 0.001932127634063363, "learning_rate": 9.277933166494922e-06, "loss": 0.0362, "step": 25150 }, { "epoch": 59.01, "grad_norm": 0.001338793197646737, "learning_rate": 9.259531871043722e-06, "loss": 0.0002, "step": 25160 }, { "epoch": 59.01, "grad_norm": 0.0024657603353261948, "learning_rate": 9.24113057559252e-06, "loss": 0.0001, "step": 25170 }, { "epoch": 59.01, "grad_norm": 0.001342698698863387, "learning_rate": 9.222729280141323e-06, "loss": 0.0001, "step": 25180 }, { "epoch": 59.01, "grad_norm": 0.0012493436224758625, "learning_rate": 9.204327984690123e-06, "loss": 0.0001, "step": 25190 }, { "epoch": 59.01, "grad_norm": 0.0005642689066007733, "learning_rate": 9.185926689238923e-06, "loss": 0.0486, "step": 25200 }, { "epoch": 59.01, "eval_accuracy": 0.7545045045045045, "eval_loss": 1.6803523302078247, "eval_runtime": 39.5194, "eval_samples_per_second": 22.47, "eval_steps_per_second": 1.872, "step": 25200 }, { "epoch": 60.0, "grad_norm": 0.002399663208052516, "learning_rate": 9.167525393787723e-06, "loss": 0.0001, "step": 25210 }, { "epoch": 60.0, "grad_norm": 0.12855449318885803, "learning_rate": 9.149124098336523e-06, "loss": 0.0002, "step": 25220 }, { "epoch": 60.0, "grad_norm": 0.002423466183245182, "learning_rate": 9.130722802885323e-06, "loss": 0.0048, "step": 25230 }, { "epoch": 60.0, "grad_norm": 0.003666324308142066, "learning_rate": 9.112321507434123e-06, "loss": 0.0002, "step": 25240 }, { "epoch": 60.0, "grad_norm": 0.015467680059373379, "learning_rate": 9.093920211982925e-06, "loss": 0.0002, "step": 25250 }, { "epoch": 60.0, "grad_norm": 0.002297742525115609, "learning_rate": 9.075518916531723e-06, "loss": 0.0427, "step": 25260 }, { "epoch": 60.0, "grad_norm": 0.002047237940132618, "learning_rate": 9.057117621080524e-06, "loss": 0.0001, "step": 25270 }, { "epoch": 60.0, "grad_norm": 0.004319984000176191, "learning_rate": 9.038716325629325e-06, "loss": 0.0001, "step": 25280 }, { "epoch": 60.0, "grad_norm": 0.01682116463780403, "learning_rate": 9.020315030178125e-06, "loss": 0.0001, "step": 25290 }, { "epoch": 60.0, "grad_norm": 0.07739716023206711, "learning_rate": 9.001913734726926e-06, "loss": 0.0003, "step": 25300 }, { "epoch": 60.0, "grad_norm": 0.0007034876034595072, "learning_rate": 8.983512439275726e-06, "loss": 0.0001, "step": 25310 }, { "epoch": 60.0, "grad_norm": 0.0012769020395353436, "learning_rate": 8.965111143824526e-06, "loss": 0.0001, "step": 25320 }, { "epoch": 60.0, "grad_norm": 0.0047972784377634525, "learning_rate": 8.946709848373326e-06, "loss": 0.0001, "step": 25330 }, { "epoch": 60.0, "grad_norm": 0.005869260523468256, "learning_rate": 8.928308552922126e-06, "loss": 0.0442, "step": 25340 }, { "epoch": 60.0, "grad_norm": 0.0843689814209938, "learning_rate": 8.909907257470926e-06, "loss": 0.0001, "step": 25350 }, { "epoch": 60.01, "grad_norm": 0.008844790048897266, "learning_rate": 8.891505962019726e-06, "loss": 0.0001, "step": 25360 }, { "epoch": 60.01, "grad_norm": 0.0014894960913807154, "learning_rate": 8.873104666568526e-06, "loss": 0.0001, "step": 25370 }, { "epoch": 60.01, "grad_norm": 0.002464174758642912, "learning_rate": 8.854703371117328e-06, "loss": 0.0001, "step": 25380 }, { "epoch": 60.01, "grad_norm": 0.013474004343152046, "learning_rate": 8.836302075666128e-06, "loss": 0.0001, "step": 25390 }, { "epoch": 60.01, "grad_norm": 0.0005289826076477766, "learning_rate": 8.817900780214927e-06, "loss": 0.0001, "step": 25400 }, { "epoch": 60.01, "grad_norm": 0.001807571155950427, "learning_rate": 8.799499484763729e-06, "loss": 0.0039, "step": 25410 }, { "epoch": 60.01, "grad_norm": 0.0014281687326729298, "learning_rate": 8.781098189312529e-06, "loss": 0.0001, "step": 25420 }, { "epoch": 60.01, "grad_norm": 0.001596023328602314, "learning_rate": 8.762696893861329e-06, "loss": 0.0001, "step": 25430 }, { "epoch": 60.01, "grad_norm": 0.0017493361374363303, "learning_rate": 8.744295598410127e-06, "loss": 0.0033, "step": 25440 }, { "epoch": 60.01, "grad_norm": 0.0007126057171262801, "learning_rate": 8.725894302958929e-06, "loss": 0.0041, "step": 25450 }, { "epoch": 60.01, "grad_norm": 0.00909177865833044, "learning_rate": 8.707493007507729e-06, "loss": 0.0001, "step": 25460 }, { "epoch": 60.01, "grad_norm": 10.200445175170898, "learning_rate": 8.68909171205653e-06, "loss": 0.0827, "step": 25470 }, { "epoch": 60.01, "grad_norm": 0.014321324415504932, "learning_rate": 8.67069041660533e-06, "loss": 0.0001, "step": 25480 }, { "epoch": 60.01, "grad_norm": 0.141360342502594, "learning_rate": 8.65228912115413e-06, "loss": 0.0002, "step": 25490 }, { "epoch": 60.01, "grad_norm": 0.0010122742969542742, "learning_rate": 8.63388782570293e-06, "loss": 0.0002, "step": 25500 }, { "epoch": 60.01, "grad_norm": 0.0054549286141991615, "learning_rate": 8.61548653025173e-06, "loss": 0.0196, "step": 25510 }, { "epoch": 60.01, "grad_norm": 0.002311424119397998, "learning_rate": 8.597085234800532e-06, "loss": 0.0001, "step": 25520 }, { "epoch": 60.01, "grad_norm": 0.0010803097393363714, "learning_rate": 8.57868393934933e-06, "loss": 0.0112, "step": 25530 }, { "epoch": 60.01, "grad_norm": 0.0027651439886540174, "learning_rate": 8.56028264389813e-06, "loss": 0.006, "step": 25540 }, { "epoch": 60.01, "grad_norm": 0.0009034753893502057, "learning_rate": 8.541881348446932e-06, "loss": 0.0001, "step": 25550 }, { "epoch": 60.01, "grad_norm": 0.002131812274456024, "learning_rate": 8.523480052995732e-06, "loss": 0.0, "step": 25560 }, { "epoch": 60.01, "grad_norm": 0.0011010438902303576, "learning_rate": 8.50507875754453e-06, "loss": 0.0001, "step": 25570 }, { "epoch": 60.01, "grad_norm": 0.0020116691011935472, "learning_rate": 8.486677462093332e-06, "loss": 0.036, "step": 25580 }, { "epoch": 60.01, "grad_norm": 0.008289303630590439, "learning_rate": 8.468276166642132e-06, "loss": 0.0001, "step": 25590 }, { "epoch": 60.01, "grad_norm": 0.004598530940711498, "learning_rate": 8.449874871190932e-06, "loss": 0.001, "step": 25600 }, { "epoch": 60.01, "grad_norm": 0.0016972459852695465, "learning_rate": 8.431473575739733e-06, "loss": 0.0001, "step": 25610 }, { "epoch": 60.01, "grad_norm": 0.002091147704049945, "learning_rate": 8.413072280288533e-06, "loss": 0.0001, "step": 25620 }, { "epoch": 60.01, "eval_accuracy": 0.7522522522522522, "eval_loss": 1.7991126775741577, "eval_runtime": 39.379, "eval_samples_per_second": 22.55, "eval_steps_per_second": 1.879, "step": 25620 }, { "epoch": 61.0, "grad_norm": 0.0016870342660695314, "learning_rate": 8.394670984837333e-06, "loss": 0.0, "step": 25630 }, { "epoch": 61.0, "grad_norm": 0.001305489568039775, "learning_rate": 8.376269689386133e-06, "loss": 0.0034, "step": 25640 }, { "epoch": 61.0, "grad_norm": 0.0020442858804017305, "learning_rate": 8.357868393934935e-06, "loss": 0.0001, "step": 25650 }, { "epoch": 61.0, "grad_norm": 0.0010033355792984366, "learning_rate": 8.339467098483733e-06, "loss": 0.0105, "step": 25660 }, { "epoch": 61.0, "grad_norm": 0.015114572830498219, "learning_rate": 8.321065803032533e-06, "loss": 0.0212, "step": 25670 }, { "epoch": 61.0, "grad_norm": 0.001860952703282237, "learning_rate": 8.302664507581333e-06, "loss": 0.0001, "step": 25680 }, { "epoch": 61.0, "grad_norm": 0.002813951577991247, "learning_rate": 8.284263212130135e-06, "loss": 0.0001, "step": 25690 }, { "epoch": 61.0, "grad_norm": 0.00690379599109292, "learning_rate": 8.265861916678935e-06, "loss": 0.0001, "step": 25700 }, { "epoch": 61.0, "grad_norm": 0.0035497399512678385, "learning_rate": 8.247460621227734e-06, "loss": 0.0043, "step": 25710 }, { "epoch": 61.0, "grad_norm": 0.0012824188452214003, "learning_rate": 8.229059325776536e-06, "loss": 0.0001, "step": 25720 }, { "epoch": 61.0, "grad_norm": 0.0018320622621104121, "learning_rate": 8.210658030325336e-06, "loss": 0.0004, "step": 25730 }, { "epoch": 61.0, "grad_norm": 0.0008166917832568288, "learning_rate": 8.192256734874136e-06, "loss": 0.0, "step": 25740 }, { "epoch": 61.0, "grad_norm": 0.000992298242636025, "learning_rate": 8.173855439422936e-06, "loss": 0.061, "step": 25750 }, { "epoch": 61.0, "grad_norm": 0.001933283288963139, "learning_rate": 8.155454143971736e-06, "loss": 0.0, "step": 25760 }, { "epoch": 61.0, "grad_norm": 0.002809977624565363, "learning_rate": 8.137052848520536e-06, "loss": 0.0001, "step": 25770 }, { "epoch": 61.01, "grad_norm": 0.0009199827327392995, "learning_rate": 8.118651553069336e-06, "loss": 0.0, "step": 25780 }, { "epoch": 61.01, "grad_norm": 0.0006985805230215192, "learning_rate": 8.100250257618136e-06, "loss": 0.0002, "step": 25790 }, { "epoch": 61.01, "grad_norm": 0.0004489361308515072, "learning_rate": 8.081848962166936e-06, "loss": 0.0001, "step": 25800 }, { "epoch": 61.01, "grad_norm": 0.0005292571731843054, "learning_rate": 8.063447666715737e-06, "loss": 0.0, "step": 25810 }, { "epoch": 61.01, "grad_norm": 0.00038617965765297413, "learning_rate": 8.045046371264538e-06, "loss": 0.0001, "step": 25820 }, { "epoch": 61.01, "grad_norm": 0.0032925093546509743, "learning_rate": 8.026645075813338e-06, "loss": 0.0, "step": 25830 }, { "epoch": 61.01, "grad_norm": 0.0006309591117314994, "learning_rate": 8.008243780362137e-06, "loss": 0.0105, "step": 25840 }, { "epoch": 61.01, "grad_norm": 0.000899383972864598, "learning_rate": 7.989842484910939e-06, "loss": 0.0001, "step": 25850 }, { "epoch": 61.01, "grad_norm": 0.013826681300997734, "learning_rate": 7.971441189459739e-06, "loss": 0.0159, "step": 25860 }, { "epoch": 61.01, "grad_norm": 0.0012361772824078798, "learning_rate": 7.953039894008539e-06, "loss": 0.0, "step": 25870 }, { "epoch": 61.01, "grad_norm": 0.000654926523566246, "learning_rate": 7.934638598557337e-06, "loss": 0.0001, "step": 25880 }, { "epoch": 61.01, "grad_norm": 0.00048330603749491274, "learning_rate": 7.91623730310614e-06, "loss": 0.0, "step": 25890 }, { "epoch": 61.01, "grad_norm": 0.0006944110500626266, "learning_rate": 7.89783600765494e-06, "loss": 0.0005, "step": 25900 }, { "epoch": 61.01, "grad_norm": 0.0006543623167090118, "learning_rate": 7.87943471220374e-06, "loss": 0.0004, "step": 25910 }, { "epoch": 61.01, "grad_norm": 0.03564361482858658, "learning_rate": 7.861033416752541e-06, "loss": 0.0001, "step": 25920 }, { "epoch": 61.01, "grad_norm": 0.0036910008639097214, "learning_rate": 7.84263212130134e-06, "loss": 0.0, "step": 25930 }, { "epoch": 61.01, "grad_norm": 66.63784790039062, "learning_rate": 7.82423082585014e-06, "loss": 0.0335, "step": 25940 }, { "epoch": 61.01, "grad_norm": 0.0006726859137415886, "learning_rate": 7.80582953039894e-06, "loss": 0.0001, "step": 25950 }, { "epoch": 61.01, "grad_norm": 0.0008944363798946142, "learning_rate": 7.787428234947742e-06, "loss": 0.0023, "step": 25960 }, { "epoch": 61.01, "grad_norm": 0.0020627244375646114, "learning_rate": 7.76902693949654e-06, "loss": 0.0, "step": 25970 }, { "epoch": 61.01, "grad_norm": 0.004097119905054569, "learning_rate": 7.75062564404534e-06, "loss": 0.0001, "step": 25980 }, { "epoch": 61.01, "grad_norm": 0.0006828421028330922, "learning_rate": 7.732224348594142e-06, "loss": 0.0, "step": 25990 }, { "epoch": 61.01, "grad_norm": 0.9965651631355286, "learning_rate": 7.713823053142942e-06, "loss": 0.0002, "step": 26000 }, { "epoch": 61.01, "grad_norm": 0.001254824921488762, "learning_rate": 7.69542175769174e-06, "loss": 0.0, "step": 26010 }, { "epoch": 61.01, "grad_norm": 0.0006275831838138402, "learning_rate": 7.677020462240542e-06, "loss": 0.0001, "step": 26020 }, { "epoch": 61.01, "grad_norm": 0.0017072111368179321, "learning_rate": 7.658619166789342e-06, "loss": 0.0001, "step": 26030 }, { "epoch": 61.01, "grad_norm": 0.00032576528610661626, "learning_rate": 7.640217871338143e-06, "loss": 0.0, "step": 26040 }, { "epoch": 61.01, "eval_accuracy": 0.7511261261261262, "eval_loss": 1.8280649185180664, "eval_runtime": 38.5276, "eval_samples_per_second": 23.048, "eval_steps_per_second": 1.921, "step": 26040 }, { "epoch": 62.0, "grad_norm": 0.015026925131678581, "learning_rate": 7.6218165758869436e-06, "loss": 0.0083, "step": 26050 }, { "epoch": 62.0, "grad_norm": 0.00498881796374917, "learning_rate": 7.603415280435743e-06, "loss": 0.0001, "step": 26060 }, { "epoch": 62.0, "grad_norm": 95.47059631347656, "learning_rate": 7.585013984984543e-06, "loss": 0.045, "step": 26070 }, { "epoch": 62.0, "grad_norm": 0.004085544031113386, "learning_rate": 7.566612689533344e-06, "loss": 0.0001, "step": 26080 }, { "epoch": 62.0, "grad_norm": 0.0010452407877892256, "learning_rate": 7.548211394082144e-06, "loss": 0.0001, "step": 26090 }, { "epoch": 62.0, "grad_norm": 0.0003794306539930403, "learning_rate": 7.529810098630943e-06, "loss": 0.0, "step": 26100 }, { "epoch": 62.0, "grad_norm": 0.00040680027450434864, "learning_rate": 7.511408803179744e-06, "loss": 0.0004, "step": 26110 }, { "epoch": 62.0, "grad_norm": 0.0007529466529376805, "learning_rate": 7.493007507728544e-06, "loss": 0.0, "step": 26120 }, { "epoch": 62.0, "grad_norm": 0.016848569735884666, "learning_rate": 7.474606212277345e-06, "loss": 0.0001, "step": 26130 }, { "epoch": 62.0, "grad_norm": 0.0005395954358391464, "learning_rate": 7.4562049168261454e-06, "loss": 0.0, "step": 26140 }, { "epoch": 62.0, "grad_norm": 0.009298846125602722, "learning_rate": 7.437803621374945e-06, "loss": 0.0, "step": 26150 }, { "epoch": 62.0, "grad_norm": 0.004532150458544493, "learning_rate": 7.419402325923745e-06, "loss": 0.0001, "step": 26160 }, { "epoch": 62.0, "grad_norm": 0.0007697180844843388, "learning_rate": 7.401001030472546e-06, "loss": 0.0373, "step": 26170 }, { "epoch": 62.0, "grad_norm": 0.0009432808728888631, "learning_rate": 7.382599735021346e-06, "loss": 0.0611, "step": 26180 }, { "epoch": 62.0, "grad_norm": 0.0006800147821195424, "learning_rate": 7.364198439570145e-06, "loss": 0.0765, "step": 26190 }, { "epoch": 62.01, "grad_norm": 0.0006660166545771062, "learning_rate": 7.345797144118946e-06, "loss": 0.0001, "step": 26200 }, { "epoch": 62.01, "grad_norm": 0.0007884202641434968, "learning_rate": 7.327395848667746e-06, "loss": 0.0, "step": 26210 }, { "epoch": 62.01, "grad_norm": 0.0006302872789092362, "learning_rate": 7.308994553216547e-06, "loss": 0.0, "step": 26220 }, { "epoch": 62.01, "grad_norm": 0.0007916418253444135, "learning_rate": 7.290593257765347e-06, "loss": 0.0696, "step": 26230 }, { "epoch": 62.01, "grad_norm": 0.0010438553290441632, "learning_rate": 7.272191962314147e-06, "loss": 0.0001, "step": 26240 }, { "epoch": 62.01, "grad_norm": 0.004884437192231417, "learning_rate": 7.2537906668629476e-06, "loss": 0.0207, "step": 26250 }, { "epoch": 62.01, "grad_norm": 0.001540785189718008, "learning_rate": 7.235389371411748e-06, "loss": 0.0003, "step": 26260 }, { "epoch": 62.01, "grad_norm": 0.0025039571337401867, "learning_rate": 7.216988075960549e-06, "loss": 0.0001, "step": 26270 }, { "epoch": 62.01, "grad_norm": 0.001049380050972104, "learning_rate": 7.198586780509348e-06, "loss": 0.0, "step": 26280 }, { "epoch": 62.01, "grad_norm": 0.0032229118514806032, "learning_rate": 7.180185485058148e-06, "loss": 0.0003, "step": 26290 }, { "epoch": 62.01, "grad_norm": 0.0008956646779552102, "learning_rate": 7.161784189606949e-06, "loss": 0.0001, "step": 26300 }, { "epoch": 62.01, "grad_norm": 0.014946524985134602, "learning_rate": 7.143382894155749e-06, "loss": 0.0001, "step": 26310 }, { "epoch": 62.01, "grad_norm": 0.0014685116475448012, "learning_rate": 7.124981598704548e-06, "loss": 0.0001, "step": 26320 }, { "epoch": 62.01, "grad_norm": 0.001738280989229679, "learning_rate": 7.106580303253349e-06, "loss": 0.0211, "step": 26330 }, { "epoch": 62.01, "grad_norm": 0.0006036240374669433, "learning_rate": 7.0881790078021495e-06, "loss": 0.0002, "step": 26340 }, { "epoch": 62.01, "grad_norm": 0.0010698206024244428, "learning_rate": 7.06977771235095e-06, "loss": 0.0002, "step": 26350 }, { "epoch": 62.01, "grad_norm": 0.00232374994084239, "learning_rate": 7.0513764168997505e-06, "loss": 0.0001, "step": 26360 }, { "epoch": 62.01, "grad_norm": 0.0006454029935412109, "learning_rate": 7.03297512144855e-06, "loss": 0.0, "step": 26370 }, { "epoch": 62.01, "grad_norm": 0.002342033665627241, "learning_rate": 7.014573825997351e-06, "loss": 0.0002, "step": 26380 }, { "epoch": 62.01, "grad_norm": 0.0028809804935008287, "learning_rate": 6.996172530546151e-06, "loss": 0.0, "step": 26390 }, { "epoch": 62.01, "grad_norm": 0.008059854619204998, "learning_rate": 6.977771235094952e-06, "loss": 0.0002, "step": 26400 }, { "epoch": 62.01, "grad_norm": 0.0008460727403871715, "learning_rate": 6.95936993964375e-06, "loss": 0.0002, "step": 26410 }, { "epoch": 62.01, "grad_norm": 0.02910284884274006, "learning_rate": 6.940968644192551e-06, "loss": 0.0001, "step": 26420 }, { "epoch": 62.01, "grad_norm": 0.005363269243389368, "learning_rate": 6.922567348741351e-06, "loss": 0.0041, "step": 26430 }, { "epoch": 62.01, "grad_norm": 0.0016924857627600431, "learning_rate": 6.904166053290152e-06, "loss": 0.0001, "step": 26440 }, { "epoch": 62.01, "grad_norm": 0.00135804305318743, "learning_rate": 6.885764757838952e-06, "loss": 0.0001, "step": 26450 }, { "epoch": 62.01, "grad_norm": 0.000540624838322401, "learning_rate": 6.867363462387752e-06, "loss": 0.0022, "step": 26460 }, { "epoch": 62.01, "eval_accuracy": 0.75, "eval_loss": 1.8172096014022827, "eval_runtime": 38.4534, "eval_samples_per_second": 23.093, "eval_steps_per_second": 1.924, "step": 26460 }, { "epoch": 63.0, "grad_norm": 0.0026345832739025354, "learning_rate": 6.848962166936553e-06, "loss": 0.0002, "step": 26470 }, { "epoch": 63.0, "grad_norm": 0.0017918674275279045, "learning_rate": 6.830560871485353e-06, "loss": 0.0, "step": 26480 }, { "epoch": 63.0, "grad_norm": 0.0011441055685281754, "learning_rate": 6.812159576034154e-06, "loss": 0.0822, "step": 26490 }, { "epoch": 63.0, "grad_norm": 0.0020869425497949123, "learning_rate": 6.793758280582953e-06, "loss": 0.0067, "step": 26500 }, { "epoch": 63.0, "grad_norm": 0.003741749795153737, "learning_rate": 6.775356985131753e-06, "loss": 0.0001, "step": 26510 }, { "epoch": 63.0, "grad_norm": 0.001574240275658667, "learning_rate": 6.756955689680554e-06, "loss": 0.0, "step": 26520 }, { "epoch": 63.0, "grad_norm": 0.08041319251060486, "learning_rate": 6.738554394229354e-06, "loss": 0.0001, "step": 26530 }, { "epoch": 63.0, "grad_norm": 0.001029444974847138, "learning_rate": 6.720153098778155e-06, "loss": 0.0001, "step": 26540 }, { "epoch": 63.0, "grad_norm": 0.0004752335953526199, "learning_rate": 6.701751803326954e-06, "loss": 0.0001, "step": 26550 }, { "epoch": 63.0, "grad_norm": 0.0011958446120843291, "learning_rate": 6.6833505078757545e-06, "loss": 0.0001, "step": 26560 }, { "epoch": 63.0, "grad_norm": 0.10393550246953964, "learning_rate": 6.6649492124245555e-06, "loss": 0.0001, "step": 26570 }, { "epoch": 63.0, "grad_norm": 0.002258807886391878, "learning_rate": 6.646547916973356e-06, "loss": 0.0001, "step": 26580 }, { "epoch": 63.0, "grad_norm": 0.007346214726567268, "learning_rate": 6.628146621522155e-06, "loss": 0.0001, "step": 26590 }, { "epoch": 63.0, "grad_norm": 0.0009463768219575286, "learning_rate": 6.609745326070956e-06, "loss": 0.0002, "step": 26600 }, { "epoch": 63.0, "grad_norm": 0.0016016251174733043, "learning_rate": 6.591344030619756e-06, "loss": 0.0001, "step": 26610 }, { "epoch": 63.01, "grad_norm": 0.0009076215210370719, "learning_rate": 6.572942735168557e-06, "loss": 0.0001, "step": 26620 }, { "epoch": 63.01, "grad_norm": 0.013018609955906868, "learning_rate": 6.554541439717355e-06, "loss": 0.003, "step": 26630 }, { "epoch": 63.01, "grad_norm": 0.003185126930475235, "learning_rate": 6.536140144266156e-06, "loss": 0.0, "step": 26640 }, { "epoch": 63.01, "grad_norm": 0.0011634600814431906, "learning_rate": 6.517738848814956e-06, "loss": 0.0001, "step": 26650 }, { "epoch": 63.01, "grad_norm": 0.0022628027945756912, "learning_rate": 6.499337553363757e-06, "loss": 0.0001, "step": 26660 }, { "epoch": 63.01, "grad_norm": 0.0006948548834770918, "learning_rate": 6.480936257912558e-06, "loss": 0.0001, "step": 26670 }, { "epoch": 63.01, "grad_norm": 0.006657461170107126, "learning_rate": 6.462534962461357e-06, "loss": 0.0, "step": 26680 }, { "epoch": 63.01, "grad_norm": 0.0006550468970090151, "learning_rate": 6.444133667010158e-06, "loss": 0.0, "step": 26690 }, { "epoch": 63.01, "grad_norm": 0.009990106336772442, "learning_rate": 6.425732371558958e-06, "loss": 0.0001, "step": 26700 }, { "epoch": 63.01, "grad_norm": 0.0010792514076456428, "learning_rate": 6.407331076107759e-06, "loss": 0.0032, "step": 26710 }, { "epoch": 63.01, "grad_norm": 0.007197075989097357, "learning_rate": 6.388929780656558e-06, "loss": 0.0001, "step": 26720 }, { "epoch": 63.01, "grad_norm": 0.0025517232716083527, "learning_rate": 6.370528485205358e-06, "loss": 0.0001, "step": 26730 }, { "epoch": 63.01, "grad_norm": 0.0018759402446448803, "learning_rate": 6.352127189754159e-06, "loss": 0.0001, "step": 26740 }, { "epoch": 63.01, "grad_norm": 0.0016400377498939633, "learning_rate": 6.333725894302959e-06, "loss": 0.0005, "step": 26750 }, { "epoch": 63.01, "grad_norm": 0.001803527120500803, "learning_rate": 6.31532459885176e-06, "loss": 0.0003, "step": 26760 }, { "epoch": 63.01, "grad_norm": 0.0019011934055015445, "learning_rate": 6.2969233034005595e-06, "loss": 0.0, "step": 26770 }, { "epoch": 63.01, "grad_norm": 0.0007527913548983634, "learning_rate": 6.27852200794936e-06, "loss": 0.0036, "step": 26780 }, { "epoch": 63.01, "grad_norm": 0.0012235046597197652, "learning_rate": 6.2601207124981606e-06, "loss": 0.0252, "step": 26790 }, { "epoch": 63.01, "grad_norm": 0.0019505118252709508, "learning_rate": 6.24171941704696e-06, "loss": 0.0128, "step": 26800 }, { "epoch": 63.01, "grad_norm": 0.0008968331967480481, "learning_rate": 6.223318121595761e-06, "loss": 0.0078, "step": 26810 }, { "epoch": 63.01, "grad_norm": 0.0012221608776599169, "learning_rate": 6.204916826144561e-06, "loss": 0.0001, "step": 26820 }, { "epoch": 63.01, "grad_norm": 0.0025473625864833593, "learning_rate": 6.186515530693361e-06, "loss": 0.0001, "step": 26830 }, { "epoch": 63.01, "grad_norm": 0.004625910427421331, "learning_rate": 6.168114235242161e-06, "loss": 0.0003, "step": 26840 }, { "epoch": 63.01, "grad_norm": 0.0007219272665679455, "learning_rate": 6.149712939790961e-06, "loss": 0.0001, "step": 26850 }, { "epoch": 63.01, "grad_norm": 0.0008147148182615638, "learning_rate": 6.131311644339762e-06, "loss": 0.0, "step": 26860 }, { "epoch": 63.01, "grad_norm": 0.001891309511847794, "learning_rate": 6.112910348888562e-06, "loss": 0.0001, "step": 26870 }, { "epoch": 63.01, "grad_norm": 0.001844471669755876, "learning_rate": 6.0945090534373625e-06, "loss": 0.0001, "step": 26880 }, { "epoch": 63.01, "eval_accuracy": 0.7488738738738738, "eval_loss": 1.9532489776611328, "eval_runtime": 38.719, "eval_samples_per_second": 22.934, "eval_steps_per_second": 1.911, "step": 26880 }, { "epoch": 64.0, "grad_norm": 0.0005300881457515061, "learning_rate": 6.0761077579861626e-06, "loss": 0.0, "step": 26890 }, { "epoch": 64.0, "grad_norm": 0.0009176667081192136, "learning_rate": 6.057706462534963e-06, "loss": 0.0001, "step": 26900 }, { "epoch": 64.0, "grad_norm": 0.00210366933606565, "learning_rate": 6.039305167083763e-06, "loss": 0.0, "step": 26910 }, { "epoch": 64.0, "grad_norm": 0.0005858144722878933, "learning_rate": 6.020903871632563e-06, "loss": 0.0, "step": 26920 }, { "epoch": 64.0, "grad_norm": 0.001394058228470385, "learning_rate": 6.002502576181363e-06, "loss": 0.0013, "step": 26930 }, { "epoch": 64.0, "grad_norm": 0.0003652074665296823, "learning_rate": 5.984101280730164e-06, "loss": 0.0, "step": 26940 }, { "epoch": 64.0, "grad_norm": 0.00038057751953601837, "learning_rate": 5.965699985278964e-06, "loss": 0.0002, "step": 26950 }, { "epoch": 64.0, "grad_norm": 0.0007543888641521335, "learning_rate": 5.947298689827764e-06, "loss": 0.0, "step": 26960 }, { "epoch": 64.0, "grad_norm": 0.0008908085874281824, "learning_rate": 5.928897394376564e-06, "loss": 0.0, "step": 26970 }, { "epoch": 64.0, "grad_norm": 0.0010865289950743318, "learning_rate": 5.9104960989253645e-06, "loss": 0.0088, "step": 26980 }, { "epoch": 64.0, "grad_norm": 0.0018082386814057827, "learning_rate": 5.8920948034741654e-06, "loss": 0.0, "step": 26990 }, { "epoch": 64.0, "grad_norm": 0.001776510733179748, "learning_rate": 5.873693508022965e-06, "loss": 0.0, "step": 27000 }, { "epoch": 64.0, "grad_norm": 0.005542340688407421, "learning_rate": 5.855292212571766e-06, "loss": 0.0, "step": 27010 }, { "epoch": 64.0, "grad_norm": 0.0004181989061180502, "learning_rate": 5.836890917120565e-06, "loss": 0.0, "step": 27020 }, { "epoch": 64.0, "grad_norm": 0.00034455108107067645, "learning_rate": 5.818489621669366e-06, "loss": 0.0271, "step": 27030 }, { "epoch": 64.01, "grad_norm": 0.012929372489452362, "learning_rate": 5.800088326218166e-06, "loss": 0.0, "step": 27040 }, { "epoch": 64.01, "grad_norm": 0.0004051885043736547, "learning_rate": 5.781687030766966e-06, "loss": 0.0, "step": 27050 }, { "epoch": 64.01, "grad_norm": 0.000558310654014349, "learning_rate": 5.763285735315767e-06, "loss": 0.0, "step": 27060 }, { "epoch": 64.01, "grad_norm": 0.005561283323913813, "learning_rate": 5.744884439864566e-06, "loss": 0.0, "step": 27070 }, { "epoch": 64.01, "grad_norm": 0.0005570474895648658, "learning_rate": 5.726483144413367e-06, "loss": 0.0001, "step": 27080 }, { "epoch": 64.01, "grad_norm": 0.0005421972018666565, "learning_rate": 5.7080818489621674e-06, "loss": 0.0001, "step": 27090 }, { "epoch": 64.01, "grad_norm": 0.0008089053444564342, "learning_rate": 5.6896805535109675e-06, "loss": 0.0884, "step": 27100 }, { "epoch": 64.01, "grad_norm": 0.0032031191512942314, "learning_rate": 5.671279258059768e-06, "loss": 0.0479, "step": 27110 }, { "epoch": 64.01, "grad_norm": 0.0016441026236861944, "learning_rate": 5.652877962608568e-06, "loss": 0.0094, "step": 27120 }, { "epoch": 64.01, "grad_norm": 0.03962777182459831, "learning_rate": 5.634476667157368e-06, "loss": 0.0, "step": 27130 }, { "epoch": 64.01, "grad_norm": 0.0017225542105734348, "learning_rate": 5.616075371706169e-06, "loss": 0.0, "step": 27140 }, { "epoch": 64.01, "grad_norm": 0.002667994936928153, "learning_rate": 5.597674076254968e-06, "loss": 0.0002, "step": 27150 }, { "epoch": 64.01, "grad_norm": 0.00042104258318431675, "learning_rate": 5.579272780803769e-06, "loss": 0.0, "step": 27160 }, { "epoch": 64.01, "grad_norm": 0.00831848755478859, "learning_rate": 5.560871485352569e-06, "loss": 0.0, "step": 27170 }, { "epoch": 64.01, "grad_norm": 0.01557249017059803, "learning_rate": 5.542470189901369e-06, "loss": 0.0001, "step": 27180 }, { "epoch": 64.01, "grad_norm": 0.000592841359321028, "learning_rate": 5.5240688944501694e-06, "loss": 0.0126, "step": 27190 }, { "epoch": 64.01, "grad_norm": 0.0004729072388727218, "learning_rate": 5.5056675989989695e-06, "loss": 0.0014, "step": 27200 }, { "epoch": 64.01, "grad_norm": 0.0007256526732817292, "learning_rate": 5.4872663035477705e-06, "loss": 0.0211, "step": 27210 }, { "epoch": 64.01, "grad_norm": 0.0019068483961746097, "learning_rate": 5.46886500809657e-06, "loss": 0.0, "step": 27220 }, { "epoch": 64.01, "grad_norm": 0.000938325421884656, "learning_rate": 5.450463712645371e-06, "loss": 0.0, "step": 27230 }, { "epoch": 64.01, "grad_norm": 0.0009550242102704942, "learning_rate": 5.432062417194171e-06, "loss": 0.0, "step": 27240 }, { "epoch": 64.01, "grad_norm": 0.00045210865209810436, "learning_rate": 5.413661121742971e-06, "loss": 0.0001, "step": 27250 }, { "epoch": 64.01, "grad_norm": 0.0003768078749999404, "learning_rate": 5.395259826291772e-06, "loss": 0.0001, "step": 27260 }, { "epoch": 64.01, "grad_norm": 0.0008792284643277526, "learning_rate": 5.376858530840571e-06, "loss": 0.0, "step": 27270 }, { "epoch": 64.01, "grad_norm": 0.00046390038914978504, "learning_rate": 5.358457235389372e-06, "loss": 0.0001, "step": 27280 }, { "epoch": 64.01, "grad_norm": 0.0005584588507190347, "learning_rate": 5.3400559399381714e-06, "loss": 0.0083, "step": 27290 }, { "epoch": 64.01, "grad_norm": 0.0006229018326848745, "learning_rate": 5.321654644486972e-06, "loss": 0.0, "step": 27300 }, { "epoch": 64.01, "eval_accuracy": 0.7477477477477478, "eval_loss": 1.9208874702453613, "eval_runtime": 38.3099, "eval_samples_per_second": 23.179, "eval_steps_per_second": 1.932, "step": 27300 }, { "epoch": 65.0, "grad_norm": 0.0037541654892265797, "learning_rate": 5.3032533490357725e-06, "loss": 0.0, "step": 27310 }, { "epoch": 65.0, "grad_norm": 0.1503923088312149, "learning_rate": 5.284852053584573e-06, "loss": 0.0001, "step": 27320 }, { "epoch": 65.0, "grad_norm": 0.0005371617735363543, "learning_rate": 5.266450758133373e-06, "loss": 0.0, "step": 27330 }, { "epoch": 65.0, "grad_norm": 0.0006791293271817267, "learning_rate": 5.248049462682173e-06, "loss": 0.0051, "step": 27340 }, { "epoch": 65.0, "grad_norm": 0.00040025872294791043, "learning_rate": 5.229648167230973e-06, "loss": 0.0304, "step": 27350 }, { "epoch": 65.0, "grad_norm": 0.0007911776774562895, "learning_rate": 5.211246871779774e-06, "loss": 0.0, "step": 27360 }, { "epoch": 65.0, "grad_norm": 0.0005377155030146241, "learning_rate": 5.192845576328574e-06, "loss": 0.0, "step": 27370 }, { "epoch": 65.0, "grad_norm": 0.001017910661175847, "learning_rate": 5.174444280877374e-06, "loss": 0.0, "step": 27380 }, { "epoch": 65.0, "grad_norm": 0.0012564313365146518, "learning_rate": 5.156042985426174e-06, "loss": 0.0001, "step": 27390 }, { "epoch": 65.0, "grad_norm": 0.0011663263430818915, "learning_rate": 5.137641689974974e-06, "loss": 0.0001, "step": 27400 }, { "epoch": 65.0, "grad_norm": 0.001976665109395981, "learning_rate": 5.1192403945237745e-06, "loss": 0.0, "step": 27410 }, { "epoch": 65.0, "grad_norm": 0.0006329611060209572, "learning_rate": 5.100839099072575e-06, "loss": 0.0, "step": 27420 }, { "epoch": 65.0, "grad_norm": 0.0004533329338300973, "learning_rate": 5.082437803621376e-06, "loss": 0.0004, "step": 27430 }, { "epoch": 65.0, "grad_norm": 0.0014712655683979392, "learning_rate": 5.064036508170175e-06, "loss": 0.0, "step": 27440 }, { "epoch": 65.0, "grad_norm": 0.0005582648445852101, "learning_rate": 5.045635212718976e-06, "loss": 0.0, "step": 27450 }, { "epoch": 65.01, "grad_norm": 0.0004935372853651643, "learning_rate": 5.027233917267776e-06, "loss": 0.0005, "step": 27460 }, { "epoch": 65.01, "grad_norm": 0.0016245003789663315, "learning_rate": 5.008832621816576e-06, "loss": 0.0, "step": 27470 }, { "epoch": 65.01, "grad_norm": 0.0008186784689314663, "learning_rate": 4.990431326365377e-06, "loss": 0.0001, "step": 27480 }, { "epoch": 65.01, "grad_norm": 0.0021012003999203444, "learning_rate": 4.972030030914176e-06, "loss": 0.0002, "step": 27490 }, { "epoch": 65.01, "grad_norm": 0.0007192987250164151, "learning_rate": 4.953628735462977e-06, "loss": 0.0001, "step": 27500 }, { "epoch": 65.01, "grad_norm": 0.0005266540683805943, "learning_rate": 4.9352274400117765e-06, "loss": 0.0, "step": 27510 }, { "epoch": 65.01, "grad_norm": 0.0004016093735117465, "learning_rate": 4.9168261445605775e-06, "loss": 0.0194, "step": 27520 }, { "epoch": 65.01, "grad_norm": 0.0014533177018165588, "learning_rate": 4.898424849109378e-06, "loss": 0.0001, "step": 27530 }, { "epoch": 65.01, "grad_norm": 0.0005293239373713732, "learning_rate": 4.880023553658178e-06, "loss": 0.0, "step": 27540 }, { "epoch": 65.01, "grad_norm": 0.0020739659667015076, "learning_rate": 4.861622258206978e-06, "loss": 0.0, "step": 27550 }, { "epoch": 65.01, "grad_norm": 0.1151590347290039, "learning_rate": 4.843220962755778e-06, "loss": 0.0006, "step": 27560 }, { "epoch": 65.01, "grad_norm": 1.1214478015899658, "learning_rate": 4.824819667304578e-06, "loss": 0.0064, "step": 27570 }, { "epoch": 65.01, "grad_norm": 0.0003342399431858212, "learning_rate": 4.806418371853379e-06, "loss": 0.0001, "step": 27580 }, { "epoch": 65.01, "grad_norm": 0.0003858323907479644, "learning_rate": 4.788017076402179e-06, "loss": 0.0, "step": 27590 }, { "epoch": 65.01, "grad_norm": 0.0004176953516434878, "learning_rate": 4.769615780950979e-06, "loss": 0.0, "step": 27600 }, { "epoch": 65.01, "grad_norm": 0.0005611016531474888, "learning_rate": 4.751214485499779e-06, "loss": 0.0, "step": 27610 }, { "epoch": 65.01, "grad_norm": 0.0020231925882399082, "learning_rate": 4.7328131900485795e-06, "loss": 0.0, "step": 27620 }, { "epoch": 65.01, "grad_norm": 0.0022195447236299515, "learning_rate": 4.7144118945973804e-06, "loss": 0.0001, "step": 27630 }, { "epoch": 65.01, "grad_norm": 0.0003322585253044963, "learning_rate": 4.69601059914618e-06, "loss": 0.0, "step": 27640 }, { "epoch": 65.01, "grad_norm": 0.0010606865398585796, "learning_rate": 4.677609303694981e-06, "loss": 0.0, "step": 27650 }, { "epoch": 65.01, "grad_norm": 0.006328342016786337, "learning_rate": 4.65920800824378e-06, "loss": 0.0, "step": 27660 }, { "epoch": 65.01, "grad_norm": 0.0013095044996589422, "learning_rate": 4.640806712792581e-06, "loss": 0.0, "step": 27670 }, { "epoch": 65.01, "grad_norm": 0.0007367559592239559, "learning_rate": 4.622405417341381e-06, "loss": 0.0, "step": 27680 }, { "epoch": 65.01, "grad_norm": 0.0003999292675871402, "learning_rate": 4.604004121890181e-06, "loss": 0.0138, "step": 27690 }, { "epoch": 65.01, "grad_norm": 0.0006201888318173587, "learning_rate": 4.585602826438982e-06, "loss": 0.0106, "step": 27700 }, { "epoch": 65.01, "grad_norm": 0.0005041907425038517, "learning_rate": 4.567201530987781e-06, "loss": 0.0142, "step": 27710 }, { "epoch": 65.01, "grad_norm": 0.002044772496446967, "learning_rate": 4.548800235536582e-06, "loss": 0.0, "step": 27720 }, { "epoch": 65.01, "eval_accuracy": 0.7578828828828829, "eval_loss": 1.9100127220153809, "eval_runtime": 38.7134, "eval_samples_per_second": 22.938, "eval_steps_per_second": 1.911, "step": 27720 }, { "epoch": 66.0, "grad_norm": 0.001818476477637887, "learning_rate": 4.5303989400853824e-06, "loss": 0.003, "step": 27730 }, { "epoch": 66.0, "grad_norm": 0.0004958516801707447, "learning_rate": 4.5119976446341826e-06, "loss": 0.0076, "step": 27740 }, { "epoch": 66.0, "grad_norm": 0.0003787777677644044, "learning_rate": 4.493596349182983e-06, "loss": 0.0, "step": 27750 }, { "epoch": 66.0, "grad_norm": 0.00042003163252957165, "learning_rate": 4.475195053731783e-06, "loss": 0.0, "step": 27760 }, { "epoch": 66.0, "grad_norm": 0.000411301531130448, "learning_rate": 4.456793758280583e-06, "loss": 0.0, "step": 27770 }, { "epoch": 66.0, "grad_norm": 0.0021449129562824965, "learning_rate": 4.438392462829383e-06, "loss": 0.0435, "step": 27780 }, { "epoch": 66.0, "grad_norm": 0.0018116917926818132, "learning_rate": 4.419991167378184e-06, "loss": 0.0008, "step": 27790 }, { "epoch": 66.0, "grad_norm": 0.0003143279755022377, "learning_rate": 4.401589871926984e-06, "loss": 0.0, "step": 27800 }, { "epoch": 66.0, "grad_norm": 0.0009564289357513189, "learning_rate": 4.383188576475784e-06, "loss": 0.0, "step": 27810 }, { "epoch": 66.0, "grad_norm": 0.000356840348104015, "learning_rate": 4.364787281024584e-06, "loss": 0.0, "step": 27820 }, { "epoch": 66.0, "grad_norm": 0.0005603270838037133, "learning_rate": 4.3463859855733844e-06, "loss": 0.0, "step": 27830 }, { "epoch": 66.0, "grad_norm": 0.0020791892893612385, "learning_rate": 4.3279846901221846e-06, "loss": 0.0, "step": 27840 }, { "epoch": 66.0, "grad_norm": 0.0005288394168019295, "learning_rate": 4.3095833946709855e-06, "loss": 0.0, "step": 27850 }, { "epoch": 66.0, "grad_norm": 0.0009745580609887838, "learning_rate": 4.291182099219785e-06, "loss": 0.0, "step": 27860 }, { "epoch": 66.0, "grad_norm": 0.0019139602081850171, "learning_rate": 4.272780803768586e-06, "loss": 0.0063, "step": 27870 }, { "epoch": 66.01, "grad_norm": 0.0005171472439542413, "learning_rate": 4.254379508317385e-06, "loss": 0.0, "step": 27880 }, { "epoch": 66.01, "grad_norm": 0.0031747317407280207, "learning_rate": 4.235978212866186e-06, "loss": 0.0, "step": 27890 }, { "epoch": 66.01, "grad_norm": 0.0003597049508243799, "learning_rate": 4.217576917414986e-06, "loss": 0.0, "step": 27900 }, { "epoch": 66.01, "grad_norm": 0.0008909418829716742, "learning_rate": 4.199175621963786e-06, "loss": 0.0, "step": 27910 }, { "epoch": 66.01, "grad_norm": 0.0005209531518630683, "learning_rate": 4.180774326512587e-06, "loss": 0.0153, "step": 27920 }, { "epoch": 66.01, "grad_norm": 0.00047349196393042803, "learning_rate": 4.1623730310613864e-06, "loss": 0.0433, "step": 27930 }, { "epoch": 66.01, "grad_norm": 0.00046791709610261023, "learning_rate": 4.143971735610187e-06, "loss": 0.0001, "step": 27940 }, { "epoch": 66.01, "grad_norm": 0.0037538569886237383, "learning_rate": 4.1255704401589875e-06, "loss": 0.0, "step": 27950 }, { "epoch": 66.01, "grad_norm": 0.003720578271895647, "learning_rate": 4.107169144707788e-06, "loss": 0.0, "step": 27960 }, { "epoch": 66.01, "grad_norm": 0.006445688661187887, "learning_rate": 4.088767849256588e-06, "loss": 0.0004, "step": 27970 }, { "epoch": 66.01, "grad_norm": 0.000625148881226778, "learning_rate": 4.070366553805388e-06, "loss": 0.0002, "step": 27980 }, { "epoch": 66.01, "grad_norm": 0.0010217278031632304, "learning_rate": 4.051965258354189e-06, "loss": 0.0, "step": 27990 }, { "epoch": 66.01, "grad_norm": 0.0008879475644789636, "learning_rate": 4.033563962902988e-06, "loss": 0.0, "step": 28000 }, { "epoch": 66.01, "grad_norm": 0.3618137836456299, "learning_rate": 4.015162667451789e-06, "loss": 0.0001, "step": 28010 }, { "epoch": 66.01, "grad_norm": 0.00038238533306866884, "learning_rate": 3.996761372000589e-06, "loss": 0.0001, "step": 28020 }, { "epoch": 66.01, "grad_norm": 0.0006025524926371872, "learning_rate": 3.978360076549389e-06, "loss": 0.0, "step": 28030 }, { "epoch": 66.01, "grad_norm": 0.0008740944904275239, "learning_rate": 3.959958781098189e-06, "loss": 0.0, "step": 28040 }, { "epoch": 66.01, "grad_norm": 0.0003794162184931338, "learning_rate": 3.9415574856469895e-06, "loss": 0.0, "step": 28050 }, { "epoch": 66.01, "grad_norm": 0.0004938667407259345, "learning_rate": 3.92315619019579e-06, "loss": 0.0, "step": 28060 }, { "epoch": 66.01, "grad_norm": 0.0031972068827599287, "learning_rate": 3.904754894744591e-06, "loss": 0.0, "step": 28070 }, { "epoch": 66.01, "grad_norm": 0.0005764389061369002, "learning_rate": 3.88635359929339e-06, "loss": 0.0, "step": 28080 }, { "epoch": 66.01, "grad_norm": 0.0011365560349076986, "learning_rate": 3.867952303842191e-06, "loss": 0.0, "step": 28090 }, { "epoch": 66.01, "grad_norm": 0.0017140272539108992, "learning_rate": 3.849551008390991e-06, "loss": 0.0, "step": 28100 }, { "epoch": 66.01, "grad_norm": 0.0005445133429020643, "learning_rate": 3.831149712939791e-06, "loss": 0.0022, "step": 28110 }, { "epoch": 66.01, "grad_norm": 0.0008768728584982455, "learning_rate": 3.8127484174885916e-06, "loss": 0.0, "step": 28120 }, { "epoch": 66.01, "grad_norm": 0.0003417479747440666, "learning_rate": 3.7943471220373913e-06, "loss": 0.0, "step": 28130 }, { "epoch": 66.01, "grad_norm": 0.0004897731123492122, "learning_rate": 3.775945826586192e-06, "loss": 0.0, "step": 28140 }, { "epoch": 66.01, "eval_accuracy": 0.7533783783783784, "eval_loss": 1.957198143005371, "eval_runtime": 39.1151, "eval_samples_per_second": 22.702, "eval_steps_per_second": 1.892, "step": 28140 }, { "epoch": 67.0, "grad_norm": 0.0009105826611630619, "learning_rate": 3.757544531134992e-06, "loss": 0.0, "step": 28150 }, { "epoch": 67.0, "grad_norm": 0.0004885323578491807, "learning_rate": 3.7391432356837925e-06, "loss": 0.0, "step": 28160 }, { "epoch": 67.0, "grad_norm": 0.0008118312689475715, "learning_rate": 3.720741940232592e-06, "loss": 0.0, "step": 28170 }, { "epoch": 67.0, "grad_norm": 0.0014635213883593678, "learning_rate": 3.7023406447813927e-06, "loss": 0.0001, "step": 28180 }, { "epoch": 67.0, "grad_norm": 0.0004924469976685941, "learning_rate": 3.683939349330193e-06, "loss": 0.0, "step": 28190 }, { "epoch": 67.0, "grad_norm": 0.011600484140217304, "learning_rate": 3.6655380538789934e-06, "loss": 0.0001, "step": 28200 }, { "epoch": 67.0, "grad_norm": 0.000475892738904804, "learning_rate": 3.647136758427794e-06, "loss": 0.0046, "step": 28210 }, { "epoch": 67.0, "grad_norm": 0.00038393758586607873, "learning_rate": 3.6287354629765936e-06, "loss": 0.0, "step": 28220 }, { "epoch": 67.0, "grad_norm": 0.00033145310590043664, "learning_rate": 3.610334167525394e-06, "loss": 0.0001, "step": 28230 }, { "epoch": 67.0, "grad_norm": 0.0016265606973320246, "learning_rate": 3.5919328720741943e-06, "loss": 0.0, "step": 28240 }, { "epoch": 67.0, "grad_norm": 0.013253612443804741, "learning_rate": 3.573531576622995e-06, "loss": 0.0, "step": 28250 }, { "epoch": 67.0, "grad_norm": 0.0012261946685612202, "learning_rate": 3.5551302811717945e-06, "loss": 0.0, "step": 28260 }, { "epoch": 67.0, "grad_norm": 0.00045756183681078255, "learning_rate": 3.536728985720595e-06, "loss": 0.0113, "step": 28270 }, { "epoch": 67.0, "grad_norm": 0.0007601910037919879, "learning_rate": 3.5183276902693947e-06, "loss": 0.0, "step": 28280 }, { "epoch": 67.0, "grad_norm": 0.0007499887724407017, "learning_rate": 3.4999263948181953e-06, "loss": 0.0, "step": 28290 }, { "epoch": 67.01, "grad_norm": 0.0011956521775573492, "learning_rate": 3.481525099366996e-06, "loss": 0.0, "step": 28300 }, { "epoch": 67.01, "grad_norm": 0.0003698724030982703, "learning_rate": 3.463123803915796e-06, "loss": 0.0, "step": 28310 }, { "epoch": 67.01, "grad_norm": 0.0003628400154411793, "learning_rate": 3.4447225084645964e-06, "loss": 0.0385, "step": 28320 }, { "epoch": 67.01, "grad_norm": 0.0010094452882185578, "learning_rate": 3.426321213013396e-06, "loss": 0.0001, "step": 28330 }, { "epoch": 67.01, "grad_norm": 0.0012927157804369926, "learning_rate": 3.4079199175621967e-06, "loss": 0.0, "step": 28340 }, { "epoch": 67.01, "grad_norm": 0.000493381405249238, "learning_rate": 3.389518622110997e-06, "loss": 0.0, "step": 28350 }, { "epoch": 67.01, "grad_norm": 0.0003783382708206773, "learning_rate": 3.3711173266597973e-06, "loss": 0.0, "step": 28360 }, { "epoch": 67.01, "grad_norm": 0.0012582663912326097, "learning_rate": 3.352716031208597e-06, "loss": 0.0, "step": 28370 }, { "epoch": 67.01, "grad_norm": 0.00036473103682510555, "learning_rate": 3.3343147357573976e-06, "loss": 0.0003, "step": 28380 }, { "epoch": 67.01, "grad_norm": 0.0003472709213383496, "learning_rate": 3.3159134403061973e-06, "loss": 0.0, "step": 28390 }, { "epoch": 67.01, "grad_norm": 0.004546219948679209, "learning_rate": 3.297512144854998e-06, "loss": 0.0, "step": 28400 }, { "epoch": 67.01, "grad_norm": 0.0006614047451876104, "learning_rate": 3.2791108494037983e-06, "loss": 0.0134, "step": 28410 }, { "epoch": 67.01, "grad_norm": 0.00038801078335382044, "learning_rate": 3.2607095539525985e-06, "loss": 0.0067, "step": 28420 }, { "epoch": 67.01, "grad_norm": 0.00122673693113029, "learning_rate": 3.242308258501399e-06, "loss": 0.0, "step": 28430 }, { "epoch": 67.01, "grad_norm": 0.0005847996799275279, "learning_rate": 3.2239069630501987e-06, "loss": 0.0001, "step": 28440 }, { "epoch": 67.01, "grad_norm": 0.0003658024943433702, "learning_rate": 3.2055056675989992e-06, "loss": 0.0112, "step": 28450 }, { "epoch": 67.01, "grad_norm": 0.011086560785770416, "learning_rate": 3.1871043721477993e-06, "loss": 0.0, "step": 28460 }, { "epoch": 67.01, "grad_norm": 0.000345290289260447, "learning_rate": 3.1687030766966e-06, "loss": 0.0192, "step": 28470 }, { "epoch": 67.01, "grad_norm": 0.00038937729550525546, "learning_rate": 3.1503017812453996e-06, "loss": 0.0, "step": 28480 }, { "epoch": 67.01, "grad_norm": 0.00036691149580292404, "learning_rate": 3.1319004857942e-06, "loss": 0.0702, "step": 28490 }, { "epoch": 67.01, "grad_norm": 0.00034635685733519495, "learning_rate": 3.1134991903430002e-06, "loss": 0.0, "step": 28500 }, { "epoch": 67.01, "grad_norm": 0.0003403785522095859, "learning_rate": 3.0950978948918003e-06, "loss": 0.0, "step": 28510 }, { "epoch": 67.01, "grad_norm": 0.0004076850600540638, "learning_rate": 3.076696599440601e-06, "loss": 0.0, "step": 28520 }, { "epoch": 67.01, "grad_norm": 0.0008295393199659884, "learning_rate": 3.058295303989401e-06, "loss": 0.0, "step": 28530 }, { "epoch": 67.01, "grad_norm": 0.00039547396590933204, "learning_rate": 3.039894008538201e-06, "loss": 0.0, "step": 28540 }, { "epoch": 67.01, "grad_norm": 0.004409555811434984, "learning_rate": 3.0214927130870012e-06, "loss": 0.0, "step": 28550 }, { "epoch": 67.01, "grad_norm": 0.0004421043850015849, "learning_rate": 3.0030914176358018e-06, "loss": 0.0007, "step": 28560 }, { "epoch": 67.01, "eval_accuracy": 0.75, "eval_loss": 2.0379910469055176, "eval_runtime": 39.3148, "eval_samples_per_second": 22.587, "eval_steps_per_second": 1.882, "step": 28560 }, { "epoch": 68.0, "grad_norm": 0.0004478379269130528, "learning_rate": 2.984690122184602e-06, "loss": 0.0, "step": 28570 }, { "epoch": 68.0, "grad_norm": 0.040339428931474686, "learning_rate": 2.9662888267334024e-06, "loss": 0.0001, "step": 28580 }, { "epoch": 68.0, "grad_norm": 0.0004363558837212622, "learning_rate": 2.9478875312822025e-06, "loss": 0.0517, "step": 28590 }, { "epoch": 68.0, "grad_norm": 0.0005274597206152976, "learning_rate": 2.9294862358310026e-06, "loss": 0.0001, "step": 28600 }, { "epoch": 68.0, "grad_norm": 1.1247056722640991, "learning_rate": 2.9110849403798028e-06, "loss": 0.006, "step": 28610 }, { "epoch": 68.0, "grad_norm": 0.008089935407042503, "learning_rate": 2.892683644928603e-06, "loss": 0.0, "step": 28620 }, { "epoch": 68.0, "grad_norm": 0.0004192329361103475, "learning_rate": 2.8742823494774034e-06, "loss": 0.0, "step": 28630 }, { "epoch": 68.0, "grad_norm": 0.00038057431811466813, "learning_rate": 2.8558810540262035e-06, "loss": 0.0001, "step": 28640 }, { "epoch": 68.0, "grad_norm": 0.0011997149558737874, "learning_rate": 2.8374797585750036e-06, "loss": 0.0, "step": 28650 }, { "epoch": 68.0, "grad_norm": 0.00041154009522870183, "learning_rate": 2.8190784631238038e-06, "loss": 0.0, "step": 28660 }, { "epoch": 68.0, "grad_norm": 9.79233169555664, "learning_rate": 2.8006771676726043e-06, "loss": 0.0811, "step": 28670 }, { "epoch": 68.0, "grad_norm": 0.001218300429172814, "learning_rate": 2.782275872221405e-06, "loss": 0.0, "step": 28680 }, { "epoch": 68.0, "grad_norm": 0.00035766957444138825, "learning_rate": 2.763874576770205e-06, "loss": 0.0, "step": 28690 }, { "epoch": 68.0, "grad_norm": 0.0008741321507841349, "learning_rate": 2.745473281319005e-06, "loss": 0.0, "step": 28700 }, { "epoch": 68.0, "grad_norm": 0.004477696027606726, "learning_rate": 2.727071985867805e-06, "loss": 0.0, "step": 28710 }, { "epoch": 68.01, "grad_norm": 0.00045869784662500024, "learning_rate": 2.7086706904166053e-06, "loss": 0.0, "step": 28720 }, { "epoch": 68.01, "grad_norm": 0.0005403147079050541, "learning_rate": 2.690269394965406e-06, "loss": 0.0, "step": 28730 }, { "epoch": 68.01, "grad_norm": 0.0004175172944087535, "learning_rate": 2.671868099514206e-06, "loss": 0.0, "step": 28740 }, { "epoch": 68.01, "grad_norm": 0.0007928918348625302, "learning_rate": 2.653466804063006e-06, "loss": 0.0001, "step": 28750 }, { "epoch": 68.01, "grad_norm": 0.001285827485844493, "learning_rate": 2.635065508611806e-06, "loss": 0.0, "step": 28760 }, { "epoch": 68.01, "grad_norm": 0.026378748938441277, "learning_rate": 2.6166642131606067e-06, "loss": 0.0001, "step": 28770 }, { "epoch": 68.01, "grad_norm": 0.0004781365569215268, "learning_rate": 2.598262917709407e-06, "loss": 0.0, "step": 28780 }, { "epoch": 68.01, "grad_norm": 0.0003623313969001174, "learning_rate": 2.5798616222582074e-06, "loss": 0.0, "step": 28790 }, { "epoch": 68.01, "grad_norm": 0.00031867920188233256, "learning_rate": 2.5614603268070075e-06, "loss": 0.0, "step": 28800 }, { "epoch": 68.01, "grad_norm": 0.03376823663711548, "learning_rate": 2.5430590313558076e-06, "loss": 0.0, "step": 28810 }, { "epoch": 68.01, "grad_norm": 0.0004712707013823092, "learning_rate": 2.5246577359046077e-06, "loss": 0.0, "step": 28820 }, { "epoch": 68.01, "grad_norm": 0.0008019096567295492, "learning_rate": 2.506256440453408e-06, "loss": 0.0, "step": 28830 }, { "epoch": 68.01, "grad_norm": 0.0006348516908474267, "learning_rate": 2.4878551450022084e-06, "loss": 0.0, "step": 28840 }, { "epoch": 68.01, "grad_norm": 0.008974735625088215, "learning_rate": 2.4694538495510085e-06, "loss": 0.0025, "step": 28850 }, { "epoch": 68.01, "grad_norm": 0.006573653779923916, "learning_rate": 2.4510525540998086e-06, "loss": 0.0283, "step": 28860 }, { "epoch": 68.01, "grad_norm": 0.0004212147614452988, "learning_rate": 2.4326512586486087e-06, "loss": 0.0, "step": 28870 }, { "epoch": 68.01, "grad_norm": 0.0006103275809437037, "learning_rate": 2.4142499631974093e-06, "loss": 0.0, "step": 28880 }, { "epoch": 68.01, "grad_norm": 0.0003645585966296494, "learning_rate": 2.3958486677462094e-06, "loss": 0.0, "step": 28890 }, { "epoch": 68.01, "grad_norm": 0.0007065955433063209, "learning_rate": 2.37744737229501e-06, "loss": 0.0075, "step": 28900 }, { "epoch": 68.01, "grad_norm": 0.000578741601202637, "learning_rate": 2.35904607684381e-06, "loss": 0.0, "step": 28910 }, { "epoch": 68.01, "grad_norm": 7.712615966796875, "learning_rate": 2.34064478139261e-06, "loss": 0.0346, "step": 28920 }, { "epoch": 68.01, "grad_norm": 0.00035892019513994455, "learning_rate": 2.3222434859414103e-06, "loss": 0.0, "step": 28930 }, { "epoch": 68.01, "grad_norm": 0.0003427111078053713, "learning_rate": 2.3038421904902104e-06, "loss": 0.0, "step": 28940 }, { "epoch": 68.01, "grad_norm": 0.0008762883953750134, "learning_rate": 2.285440895039011e-06, "loss": 0.0, "step": 28950 }, { "epoch": 68.01, "grad_norm": 0.0004895761958323419, "learning_rate": 2.267039599587811e-06, "loss": 0.0, "step": 28960 }, { "epoch": 68.01, "grad_norm": 0.00041842888458631933, "learning_rate": 2.248638304136611e-06, "loss": 0.0019, "step": 28970 }, { "epoch": 68.01, "grad_norm": 0.0004923155647702515, "learning_rate": 2.2302370086854117e-06, "loss": 0.0627, "step": 28980 }, { "epoch": 68.01, "eval_accuracy": 0.7578828828828829, "eval_loss": 1.8911042213439941, "eval_runtime": 39.4147, "eval_samples_per_second": 22.53, "eval_steps_per_second": 1.877, "step": 28980 }, { "epoch": 69.0, "grad_norm": 0.001296228845603764, "learning_rate": 2.211835713234212e-06, "loss": 0.0, "step": 28990 }, { "epoch": 69.0, "grad_norm": 0.005461554042994976, "learning_rate": 2.193434417783012e-06, "loss": 0.0, "step": 29000 }, { "epoch": 69.0, "grad_norm": 0.0005484812427312136, "learning_rate": 2.1750331223318125e-06, "loss": 0.0, "step": 29010 }, { "epoch": 69.0, "grad_norm": 0.0005643205367960036, "learning_rate": 2.1566318268806126e-06, "loss": 0.0, "step": 29020 }, { "epoch": 69.0, "grad_norm": 0.000412571185734123, "learning_rate": 2.1382305314294127e-06, "loss": 0.0067, "step": 29030 }, { "epoch": 69.0, "grad_norm": 0.0004682897706516087, "learning_rate": 2.119829235978213e-06, "loss": 0.0, "step": 29040 }, { "epoch": 69.0, "grad_norm": 0.0003385832242202014, "learning_rate": 2.101427940527013e-06, "loss": 0.0006, "step": 29050 }, { "epoch": 69.0, "grad_norm": 0.0004343747568782419, "learning_rate": 2.0830266450758135e-06, "loss": 0.0, "step": 29060 }, { "epoch": 69.0, "grad_norm": 0.0006563673377968371, "learning_rate": 2.0646253496246136e-06, "loss": 0.0604, "step": 29070 }, { "epoch": 69.0, "grad_norm": 0.0003242646635044366, "learning_rate": 2.046224054173414e-06, "loss": 0.0, "step": 29080 }, { "epoch": 69.0, "grad_norm": 0.00507451081648469, "learning_rate": 2.0278227587222142e-06, "loss": 0.0, "step": 29090 }, { "epoch": 69.0, "grad_norm": 0.0006768747116439044, "learning_rate": 2.0094214632710143e-06, "loss": 0.0, "step": 29100 }, { "epoch": 69.0, "grad_norm": 0.0005010124295949936, "learning_rate": 1.991020167819815e-06, "loss": 0.0, "step": 29110 }, { "epoch": 69.0, "grad_norm": 0.018840555101633072, "learning_rate": 1.972618872368615e-06, "loss": 0.0001, "step": 29120 }, { "epoch": 69.0, "grad_norm": 0.0009619954507797956, "learning_rate": 1.954217576917415e-06, "loss": 0.0, "step": 29130 }, { "epoch": 69.01, "grad_norm": 0.0002928886387962848, "learning_rate": 1.9358162814662152e-06, "loss": 0.0, "step": 29140 }, { "epoch": 69.01, "grad_norm": 0.0005956662353128195, "learning_rate": 1.9174149860150153e-06, "loss": 0.0, "step": 29150 }, { "epoch": 69.01, "grad_norm": 0.0006041673477739096, "learning_rate": 1.8990136905638157e-06, "loss": 0.0, "step": 29160 }, { "epoch": 69.01, "grad_norm": 0.0002941975835710764, "learning_rate": 1.8806123951126158e-06, "loss": 0.0, "step": 29170 }, { "epoch": 69.01, "grad_norm": 0.0005585135659202933, "learning_rate": 1.8622110996614161e-06, "loss": 0.0796, "step": 29180 }, { "epoch": 69.01, "grad_norm": 0.0006412527291104198, "learning_rate": 1.8438098042102167e-06, "loss": 0.047, "step": 29190 }, { "epoch": 69.01, "grad_norm": 0.00156042177695781, "learning_rate": 1.8254085087590168e-06, "loss": 0.0, "step": 29200 }, { "epoch": 69.01, "grad_norm": 0.000436204340076074, "learning_rate": 1.807007213307817e-06, "loss": 0.0, "step": 29210 }, { "epoch": 69.01, "grad_norm": 0.0007626641308888793, "learning_rate": 1.7886059178566172e-06, "loss": 0.0134, "step": 29220 }, { "epoch": 69.01, "grad_norm": 0.0009147358941845596, "learning_rate": 1.7702046224054175e-06, "loss": 0.0001, "step": 29230 }, { "epoch": 69.01, "grad_norm": 0.000774869869928807, "learning_rate": 1.7518033269542177e-06, "loss": 0.0, "step": 29240 }, { "epoch": 69.01, "grad_norm": 0.0009357577655464411, "learning_rate": 1.7334020315030178e-06, "loss": 0.0, "step": 29250 }, { "epoch": 69.01, "grad_norm": 0.00034666634746827185, "learning_rate": 1.715000736051818e-06, "loss": 0.0, "step": 29260 }, { "epoch": 69.01, "grad_norm": 0.0022142743691802025, "learning_rate": 1.6965994406006182e-06, "loss": 0.0, "step": 29270 }, { "epoch": 69.01, "grad_norm": 0.00055282301036641, "learning_rate": 1.6781981451494185e-06, "loss": 0.0001, "step": 29280 }, { "epoch": 69.01, "grad_norm": 0.0010218261741101742, "learning_rate": 1.659796849698219e-06, "loss": 0.0, "step": 29290 }, { "epoch": 69.01, "grad_norm": 0.0004027817049063742, "learning_rate": 1.6413955542470192e-06, "loss": 0.0, "step": 29300 }, { "epoch": 69.01, "grad_norm": 0.0006661299848929048, "learning_rate": 1.6229942587958193e-06, "loss": 0.0, "step": 29310 }, { "epoch": 69.01, "grad_norm": 0.0005064262077212334, "learning_rate": 1.6045929633446196e-06, "loss": 0.0, "step": 29320 }, { "epoch": 69.01, "grad_norm": 0.5367152094841003, "learning_rate": 1.5861916678934198e-06, "loss": 0.0001, "step": 29330 }, { "epoch": 69.01, "grad_norm": 0.005905908532440662, "learning_rate": 1.56779037244222e-06, "loss": 0.0001, "step": 29340 }, { "epoch": 69.01, "grad_norm": 0.00037446129135787487, "learning_rate": 1.5493890769910202e-06, "loss": 0.0, "step": 29350 }, { "epoch": 69.01, "grad_norm": 1.1278949975967407, "learning_rate": 1.5309877815398203e-06, "loss": 0.0069, "step": 29360 }, { "epoch": 69.01, "grad_norm": 0.0008586321491748095, "learning_rate": 1.5125864860886209e-06, "loss": 0.0, "step": 29370 }, { "epoch": 69.01, "grad_norm": 0.00120458600576967, "learning_rate": 1.494185190637421e-06, "loss": 0.0, "step": 29380 }, { "epoch": 69.01, "grad_norm": 0.002725456142798066, "learning_rate": 1.475783895186221e-06, "loss": 0.0, "step": 29390 }, { "epoch": 69.01, "grad_norm": 0.00047182320849969983, "learning_rate": 1.4573825997350214e-06, "loss": 0.0002, "step": 29400 }, { "epoch": 69.01, "eval_accuracy": 0.75, "eval_loss": 1.9255236387252808, "eval_runtime": 38.4915, "eval_samples_per_second": 23.07, "eval_steps_per_second": 1.923, "step": 29400 }, { "epoch": 70.0, "grad_norm": 0.0007249056943692267, "learning_rate": 1.4389813042838215e-06, "loss": 0.0003, "step": 29410 }, { "epoch": 70.0, "grad_norm": 0.0006666265544481575, "learning_rate": 1.420580008832622e-06, "loss": 0.0, "step": 29420 }, { "epoch": 70.0, "grad_norm": 0.0003448748611845076, "learning_rate": 1.4021787133814222e-06, "loss": 0.0033, "step": 29430 }, { "epoch": 70.0, "grad_norm": 0.0004704460152424872, "learning_rate": 1.3837774179302223e-06, "loss": 0.0, "step": 29440 }, { "epoch": 70.0, "grad_norm": 0.0005372213781811297, "learning_rate": 1.3653761224790226e-06, "loss": 0.0, "step": 29450 }, { "epoch": 70.0, "grad_norm": 0.004357726778835058, "learning_rate": 1.3469748270278227e-06, "loss": 0.0001, "step": 29460 }, { "epoch": 70.0, "grad_norm": 0.0006257134955376387, "learning_rate": 1.328573531576623e-06, "loss": 0.0, "step": 29470 }, { "epoch": 70.0, "grad_norm": 0.00044351391261443496, "learning_rate": 1.3101722361254234e-06, "loss": 0.0, "step": 29480 }, { "epoch": 70.0, "grad_norm": 0.0006109604146331549, "learning_rate": 1.2917709406742235e-06, "loss": 0.0, "step": 29490 }, { "epoch": 70.0, "grad_norm": 0.0006328423623926938, "learning_rate": 1.2733696452230238e-06, "loss": 0.0, "step": 29500 }, { "epoch": 70.0, "grad_norm": 0.0005007470608688891, "learning_rate": 1.254968349771824e-06, "loss": 0.0, "step": 29510 }, { "epoch": 70.0, "grad_norm": 0.0008502065320499241, "learning_rate": 1.236567054320624e-06, "loss": 0.0, "step": 29520 }, { "epoch": 70.0, "grad_norm": 0.0011528691975399852, "learning_rate": 1.2181657588694246e-06, "loss": 0.0, "step": 29530 }, { "epoch": 70.0, "grad_norm": 0.0006126620573922992, "learning_rate": 1.1997644634182247e-06, "loss": 0.0099, "step": 29540 }, { "epoch": 70.0, "grad_norm": 0.001300094067119062, "learning_rate": 1.1813631679670248e-06, "loss": 0.0531, "step": 29550 }, { "epoch": 70.01, "grad_norm": 0.027509033679962158, "learning_rate": 1.1629618725158252e-06, "loss": 0.0, "step": 29560 }, { "epoch": 70.01, "grad_norm": 0.00041231224895454943, "learning_rate": 1.1445605770646253e-06, "loss": 0.0, "step": 29570 }, { "epoch": 70.01, "grad_norm": 0.0006713059265166521, "learning_rate": 1.1261592816134256e-06, "loss": 0.0, "step": 29580 }, { "epoch": 70.01, "grad_norm": 0.0016158471116796136, "learning_rate": 1.107757986162226e-06, "loss": 0.0, "step": 29590 }, { "epoch": 70.01, "grad_norm": 0.00035877004847861826, "learning_rate": 1.089356690711026e-06, "loss": 0.0, "step": 29600 }, { "epoch": 70.01, "grad_norm": 0.0011403877288103104, "learning_rate": 1.0709553952598264e-06, "loss": 0.0, "step": 29610 }, { "epoch": 70.01, "grad_norm": 0.000379040400730446, "learning_rate": 1.0525540998086265e-06, "loss": 0.0, "step": 29620 }, { "epoch": 70.01, "grad_norm": 0.0004945829859934747, "learning_rate": 1.0341528043574268e-06, "loss": 0.0, "step": 29630 }, { "epoch": 70.01, "grad_norm": 0.0005318563780747354, "learning_rate": 1.0157515089062271e-06, "loss": 0.0001, "step": 29640 }, { "epoch": 70.01, "grad_norm": 0.0008199013536795974, "learning_rate": 9.973502134550273e-07, "loss": 0.0, "step": 29650 }, { "epoch": 70.01, "grad_norm": 0.0007847716915421188, "learning_rate": 9.789489180038276e-07, "loss": 0.0, "step": 29660 }, { "epoch": 70.01, "grad_norm": 0.000548962561879307, "learning_rate": 9.605476225526277e-07, "loss": 0.0, "step": 29670 }, { "epoch": 70.01, "grad_norm": 0.006194377318024635, "learning_rate": 9.42146327101428e-07, "loss": 0.0, "step": 29680 }, { "epoch": 70.01, "grad_norm": 0.00041105650598183274, "learning_rate": 9.237450316502283e-07, "loss": 0.0, "step": 29690 }, { "epoch": 70.01, "grad_norm": 0.0003589960979297757, "learning_rate": 9.053437361990285e-07, "loss": 0.0, "step": 29700 }, { "epoch": 70.01, "grad_norm": 0.0005238248268142343, "learning_rate": 8.869424407478287e-07, "loss": 0.0021, "step": 29710 }, { "epoch": 70.01, "grad_norm": 0.0009069875814020634, "learning_rate": 8.685411452966289e-07, "loss": 0.0, "step": 29720 }, { "epoch": 70.01, "grad_norm": 0.00038318498991429806, "learning_rate": 8.501398498454292e-07, "loss": 0.0, "step": 29730 }, { "epoch": 70.01, "grad_norm": 0.00046108453534543514, "learning_rate": 8.317385543942295e-07, "loss": 0.0, "step": 29740 }, { "epoch": 70.01, "grad_norm": 0.0004549498262349516, "learning_rate": 8.133372589430297e-07, "loss": 0.0, "step": 29750 }, { "epoch": 70.01, "grad_norm": 0.0012873295927420259, "learning_rate": 7.949359634918298e-07, "loss": 0.0056, "step": 29760 }, { "epoch": 70.01, "grad_norm": 0.0007264292216859758, "learning_rate": 7.765346680406301e-07, "loss": 0.0, "step": 29770 }, { "epoch": 70.01, "grad_norm": 0.0005384713294915855, "learning_rate": 7.581333725894304e-07, "loss": 0.0, "step": 29780 }, { "epoch": 70.01, "grad_norm": 0.0006688968860544264, "learning_rate": 7.397320771382306e-07, "loss": 0.0, "step": 29790 }, { "epoch": 70.01, "grad_norm": 0.000776001950725913, "learning_rate": 7.213307816870308e-07, "loss": 0.0, "step": 29800 }, { "epoch": 70.01, "grad_norm": 0.0006710118614137173, "learning_rate": 7.02929486235831e-07, "loss": 0.0, "step": 29810 }, { "epoch": 70.01, "grad_norm": 0.003348682075738907, "learning_rate": 6.845281907846312e-07, "loss": 0.0, "step": 29820 }, { "epoch": 70.01, "eval_accuracy": 0.7567567567567568, "eval_loss": 1.919495701789856, "eval_runtime": 39.3857, "eval_samples_per_second": 22.546, "eval_steps_per_second": 1.879, "step": 29820 }, { "epoch": 71.0, "grad_norm": 0.0005197848076932132, "learning_rate": 6.661268953334316e-07, "loss": 0.0, "step": 29830 }, { "epoch": 71.0, "grad_norm": 0.0004043731023557484, "learning_rate": 6.477255998822317e-07, "loss": 0.0, "step": 29840 }, { "epoch": 71.0, "grad_norm": 0.000854467274621129, "learning_rate": 6.29324304431032e-07, "loss": 0.0, "step": 29850 }, { "epoch": 71.0, "grad_norm": 0.0010478779440745711, "learning_rate": 6.109230089798322e-07, "loss": 0.0, "step": 29860 }, { "epoch": 71.0, "grad_norm": 0.0005419045337475836, "learning_rate": 5.925217135286324e-07, "loss": 0.0, "step": 29870 }, { "epoch": 71.0, "grad_norm": 0.00035095299244858325, "learning_rate": 5.741204180774327e-07, "loss": 0.0001, "step": 29880 }, { "epoch": 71.0, "grad_norm": 0.0006842725561000407, "learning_rate": 5.557191226262329e-07, "loss": 0.0, "step": 29890 }, { "epoch": 71.0, "grad_norm": 0.0006226678378880024, "learning_rate": 5.373178271750332e-07, "loss": 0.0, "step": 29900 }, { "epoch": 71.0, "grad_norm": 0.000504440104123205, "learning_rate": 5.189165317238333e-07, "loss": 0.0002, "step": 29910 }, { "epoch": 71.0, "grad_norm": 0.00035010086139664054, "learning_rate": 5.005152362726336e-07, "loss": 0.016, "step": 29920 }, { "epoch": 71.0, "grad_norm": 0.0004880847118329257, "learning_rate": 4.821139408214339e-07, "loss": 0.0001, "step": 29930 }, { "epoch": 71.0, "grad_norm": 0.00041001607314683497, "learning_rate": 4.6371264537023405e-07, "loss": 0.0, "step": 29940 }, { "epoch": 71.0, "grad_norm": 0.45223134756088257, "learning_rate": 4.4531134991903427e-07, "loss": 0.0002, "step": 29950 }, { "epoch": 71.0, "grad_norm": 0.0010492827277630568, "learning_rate": 4.2691005446783455e-07, "loss": 0.0, "step": 29960 }, { "epoch": 71.0, "grad_norm": 0.00040615900070406497, "learning_rate": 4.0850875901663477e-07, "loss": 0.0, "step": 29970 }, { "epoch": 71.01, "grad_norm": 1.2249135971069336, "learning_rate": 3.90107463565435e-07, "loss": 0.007, "step": 29980 }, { "epoch": 71.01, "grad_norm": 0.0005851155729033053, "learning_rate": 3.7170616811423526e-07, "loss": 0.0, "step": 29990 }, { "epoch": 71.01, "grad_norm": 0.002505541779100895, "learning_rate": 3.533048726630355e-07, "loss": 0.0002, "step": 30000 }, { "epoch": 71.01, "grad_norm": 0.0007141873356886208, "learning_rate": 3.3490357721183576e-07, "loss": 0.0, "step": 30010 }, { "epoch": 71.01, "grad_norm": 0.0004104756226297468, "learning_rate": 3.1650228176063593e-07, "loss": 0.0, "step": 30020 }, { "epoch": 71.01, "grad_norm": 0.0005138775450177491, "learning_rate": 2.981009863094362e-07, "loss": 0.0, "step": 30030 }, { "epoch": 71.01, "grad_norm": 0.0005845970590598881, "learning_rate": 2.796996908582364e-07, "loss": 0.0, "step": 30040 }, { "epoch": 71.01, "grad_norm": 0.0018744993722066283, "learning_rate": 2.612983954070367e-07, "loss": 0.0001, "step": 30050 }, { "epoch": 71.01, "grad_norm": 0.0002829184231813997, "learning_rate": 2.4289709995583687e-07, "loss": 0.0, "step": 30060 }, { "epoch": 71.01, "grad_norm": 0.004356312565505505, "learning_rate": 2.2449580450463714e-07, "loss": 0.0, "step": 30070 }, { "epoch": 71.01, "grad_norm": 0.00044179134420119226, "learning_rate": 2.0609450905343736e-07, "loss": 0.0042, "step": 30080 }, { "epoch": 71.01, "grad_norm": 0.0003790935152210295, "learning_rate": 1.876932136022376e-07, "loss": 0.0, "step": 30090 }, { "epoch": 71.01, "grad_norm": 0.00037218283978290856, "learning_rate": 1.6929191815103783e-07, "loss": 0.0015, "step": 30100 }, { "epoch": 71.01, "grad_norm": 0.0008030639728531241, "learning_rate": 1.5089062269983808e-07, "loss": 0.0, "step": 30110 }, { "epoch": 71.01, "grad_norm": 0.0006200214847922325, "learning_rate": 1.324893272486383e-07, "loss": 0.0, "step": 30120 }, { "epoch": 71.01, "grad_norm": 0.00036839168751612306, "learning_rate": 1.1408803179743855e-07, "loss": 0.0, "step": 30130 }, { "epoch": 71.01, "grad_norm": 0.0003330856270622462, "learning_rate": 9.568673634623877e-08, "loss": 0.0, "step": 30140 }, { "epoch": 71.01, "grad_norm": 0.0003486187488306314, "learning_rate": 7.7285440895039e-08, "loss": 0.0258, "step": 30150 }, { "epoch": 71.01, "grad_norm": 0.000903045351151377, "learning_rate": 5.888414544383925e-08, "loss": 0.0, "step": 30160 }, { "epoch": 71.01, "grad_norm": 0.0006916265119798481, "learning_rate": 4.048284999263948e-08, "loss": 0.0, "step": 30170 }, { "epoch": 71.01, "grad_norm": 0.0004026548413094133, "learning_rate": 2.2081554541439718e-08, "loss": 0.0, "step": 30180 }, { "epoch": 71.01, "grad_norm": 0.00039476496749557555, "learning_rate": 3.6802590902399533e-09, "loss": 0.0, "step": 30190 }, { "epoch": 71.01, "eval_accuracy": 0.75, "eval_loss": 1.9316831827163696, "eval_runtime": 39.2933, "eval_samples_per_second": 22.599, "eval_steps_per_second": 1.883, "step": 30192 }, { "epoch": 71.01, "step": 30192, "total_flos": 4.507464203371508e+20, "train_loss": 0.11212031152609238, "train_runtime": 37654.5939, "train_samples_per_second": 9.622, "train_steps_per_second": 0.802 }, { "epoch": 71.01, "eval_accuracy": 0.7623873873873874, "eval_loss": 1.6253712177276611, "eval_runtime": 49.894, "eval_samples_per_second": 17.798, "eval_steps_per_second": 1.483, "step": 30192 }, { "epoch": 71.01, "eval_accuracy": 0.7623873873873874, "eval_loss": 1.6253712177276611, "eval_runtime": 38.5437, "eval_samples_per_second": 23.039, "eval_steps_per_second": 1.92, "step": 30192 } ], "logging_steps": 10, "max_steps": 30192, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "total_flos": 4.507464203371508e+20, "train_batch_size": 12, "trial_name": null, "trial_params": null }