{ "best_metric": 0.42281925678253174, "best_model_checkpoint": "Action_all_10_class/checkpoint-1400", "epoch": 10.0, "eval_steps": 100, "global_step": 2790, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.7864842414855957, "learning_rate": 9.96415770609319e-05, "loss": 2.2581, "step": 10 }, { "epoch": 0.07, "grad_norm": 1.855455756187439, "learning_rate": 9.928315412186381e-05, "loss": 2.1701, "step": 20 }, { "epoch": 0.11, "grad_norm": 1.9831140041351318, "learning_rate": 9.892473118279571e-05, "loss": 2.0292, "step": 30 }, { "epoch": 0.14, "grad_norm": 2.3516452312469482, "learning_rate": 9.85663082437276e-05, "loss": 1.9225, "step": 40 }, { "epoch": 0.18, "grad_norm": 2.3717079162597656, "learning_rate": 9.820788530465951e-05, "loss": 1.7377, "step": 50 }, { "epoch": 0.22, "grad_norm": 2.2911033630371094, "learning_rate": 9.78494623655914e-05, "loss": 1.6861, "step": 60 }, { "epoch": 0.25, "grad_norm": 3.3348388671875, "learning_rate": 9.74910394265233e-05, "loss": 1.5431, "step": 70 }, { "epoch": 0.29, "grad_norm": 3.6830499172210693, "learning_rate": 9.716845878136201e-05, "loss": 1.5209, "step": 80 }, { "epoch": 0.32, "grad_norm": 4.707543849945068, "learning_rate": 9.681003584229391e-05, "loss": 1.4562, "step": 90 }, { "epoch": 0.36, "grad_norm": 4.678582191467285, "learning_rate": 9.645161290322581e-05, "loss": 1.3076, "step": 100 }, { "epoch": 0.36, "eval_accuracy": 0.7259713701431493, "eval_loss": 1.150874137878418, "eval_runtime": 16.6221, "eval_samples_per_second": 58.837, "eval_steps_per_second": 7.4, "step": 100 }, { "epoch": 0.39, "grad_norm": 3.086634397506714, "learning_rate": 9.609318996415772e-05, "loss": 1.2684, "step": 110 }, { "epoch": 0.43, "grad_norm": 2.6688361167907715, "learning_rate": 9.573476702508962e-05, "loss": 1.1876, "step": 120 }, { "epoch": 0.47, "grad_norm": 3.8015213012695312, "learning_rate": 9.53763440860215e-05, "loss": 1.135, "step": 130 }, { "epoch": 0.5, "grad_norm": 5.519741535186768, "learning_rate": 9.501792114695342e-05, "loss": 1.1882, "step": 140 }, { "epoch": 0.54, "grad_norm": 2.872791290283203, "learning_rate": 9.465949820788531e-05, "loss": 1.0513, "step": 150 }, { "epoch": 0.57, "grad_norm": 3.185737371444702, "learning_rate": 9.430107526881721e-05, "loss": 1.1026, "step": 160 }, { "epoch": 0.61, "grad_norm": 5.020754814147949, "learning_rate": 9.39426523297491e-05, "loss": 0.9906, "step": 170 }, { "epoch": 0.65, "grad_norm": 2.217747688293457, "learning_rate": 9.358422939068101e-05, "loss": 0.9737, "step": 180 }, { "epoch": 0.68, "grad_norm": 3.152509927749634, "learning_rate": 9.32258064516129e-05, "loss": 1.0249, "step": 190 }, { "epoch": 0.72, "grad_norm": 5.835267066955566, "learning_rate": 9.28673835125448e-05, "loss": 0.9605, "step": 200 }, { "epoch": 0.72, "eval_accuracy": 0.8057259713701431, "eval_loss": 0.7632270455360413, "eval_runtime": 13.1226, "eval_samples_per_second": 74.528, "eval_steps_per_second": 9.373, "step": 200 }, { "epoch": 0.75, "grad_norm": 3.458073616027832, "learning_rate": 9.25089605734767e-05, "loss": 0.8623, "step": 210 }, { "epoch": 0.79, "grad_norm": 3.9055118560791016, "learning_rate": 9.215053763440861e-05, "loss": 0.8075, "step": 220 }, { "epoch": 0.82, "grad_norm": 5.982291221618652, "learning_rate": 9.17921146953405e-05, "loss": 0.8726, "step": 230 }, { "epoch": 0.86, "grad_norm": 6.739437103271484, "learning_rate": 9.143369175627241e-05, "loss": 0.9887, "step": 240 }, { "epoch": 0.9, "grad_norm": 3.962754249572754, "learning_rate": 9.10752688172043e-05, "loss": 0.8384, "step": 250 }, { "epoch": 0.93, "grad_norm": 5.224504470825195, "learning_rate": 9.07168458781362e-05, "loss": 0.8309, "step": 260 }, { "epoch": 0.97, "grad_norm": 6.783730983734131, "learning_rate": 9.03584229390681e-05, "loss": 0.756, "step": 270 }, { "epoch": 1.0, "grad_norm": 1.871099829673767, "learning_rate": 9e-05, "loss": 0.7874, "step": 280 }, { "epoch": 1.04, "grad_norm": 3.4176278114318848, "learning_rate": 8.964157706093191e-05, "loss": 0.7627, "step": 290 }, { "epoch": 1.08, "grad_norm": 2.1947131156921387, "learning_rate": 8.92831541218638e-05, "loss": 0.7508, "step": 300 }, { "epoch": 1.08, "eval_accuracy": 0.8006134969325154, "eval_loss": 0.6995136141777039, "eval_runtime": 13.1501, "eval_samples_per_second": 74.372, "eval_steps_per_second": 9.354, "step": 300 }, { "epoch": 1.11, "grad_norm": 4.5141119956970215, "learning_rate": 8.892473118279571e-05, "loss": 0.9078, "step": 310 }, { "epoch": 1.15, "grad_norm": 3.0271410942077637, "learning_rate": 8.85663082437276e-05, "loss": 0.8313, "step": 320 }, { "epoch": 1.18, "grad_norm": 4.396864891052246, "learning_rate": 8.82078853046595e-05, "loss": 0.735, "step": 330 }, { "epoch": 1.22, "grad_norm": 6.498298168182373, "learning_rate": 8.78494623655914e-05, "loss": 0.6945, "step": 340 }, { "epoch": 1.25, "grad_norm": 3.187629461288452, "learning_rate": 8.74910394265233e-05, "loss": 0.6215, "step": 350 }, { "epoch": 1.29, "grad_norm": 5.515172004699707, "learning_rate": 8.713261648745519e-05, "loss": 0.7575, "step": 360 }, { "epoch": 1.33, "grad_norm": 4.944267749786377, "learning_rate": 8.677419354838711e-05, "loss": 0.7753, "step": 370 }, { "epoch": 1.36, "grad_norm": 4.4278388023376465, "learning_rate": 8.6415770609319e-05, "loss": 0.7316, "step": 380 }, { "epoch": 1.4, "grad_norm": 4.232585906982422, "learning_rate": 8.60573476702509e-05, "loss": 0.7011, "step": 390 }, { "epoch": 1.43, "grad_norm": 2.1777145862579346, "learning_rate": 8.56989247311828e-05, "loss": 0.7542, "step": 400 }, { "epoch": 1.43, "eval_accuracy": 0.83640081799591, "eval_loss": 0.5710071921348572, "eval_runtime": 13.0952, "eval_samples_per_second": 74.684, "eval_steps_per_second": 9.393, "step": 400 }, { "epoch": 1.47, "grad_norm": 3.8884618282318115, "learning_rate": 8.53405017921147e-05, "loss": 0.6824, "step": 410 }, { "epoch": 1.51, "grad_norm": 3.020177125930786, "learning_rate": 8.49820788530466e-05, "loss": 0.6009, "step": 420 }, { "epoch": 1.54, "grad_norm": 5.7138519287109375, "learning_rate": 8.462365591397849e-05, "loss": 0.5374, "step": 430 }, { "epoch": 1.58, "grad_norm": 6.61539888381958, "learning_rate": 8.426523297491041e-05, "loss": 0.7105, "step": 440 }, { "epoch": 1.61, "grad_norm": 3.700871706008911, "learning_rate": 8.39068100358423e-05, "loss": 0.6331, "step": 450 }, { "epoch": 1.65, "grad_norm": 6.238260269165039, "learning_rate": 8.35483870967742e-05, "loss": 0.7435, "step": 460 }, { "epoch": 1.68, "grad_norm": 1.4030262231826782, "learning_rate": 8.32258064516129e-05, "loss": 0.6601, "step": 470 }, { "epoch": 1.72, "grad_norm": 9.980137825012207, "learning_rate": 8.28673835125448e-05, "loss": 0.649, "step": 480 }, { "epoch": 1.76, "grad_norm": 11.748491287231445, "learning_rate": 8.250896057347671e-05, "loss": 0.7075, "step": 490 }, { "epoch": 1.79, "grad_norm": 7.283904552459717, "learning_rate": 8.215053763440861e-05, "loss": 0.6945, "step": 500 }, { "epoch": 1.79, "eval_accuracy": 0.8588957055214724, "eval_loss": 0.5103773474693298, "eval_runtime": 13.2386, "eval_samples_per_second": 73.875, "eval_steps_per_second": 9.291, "step": 500 }, { "epoch": 1.83, "grad_norm": 3.227905035018921, "learning_rate": 8.179211469534051e-05, "loss": 0.5835, "step": 510 }, { "epoch": 1.86, "grad_norm": 4.389718532562256, "learning_rate": 8.14336917562724e-05, "loss": 0.6946, "step": 520 }, { "epoch": 1.9, "grad_norm": 2.76204514503479, "learning_rate": 8.107526881720431e-05, "loss": 0.6389, "step": 530 }, { "epoch": 1.94, "grad_norm": 3.973482370376587, "learning_rate": 8.07168458781362e-05, "loss": 0.5478, "step": 540 }, { "epoch": 1.97, "grad_norm": 7.488864898681641, "learning_rate": 8.03584229390681e-05, "loss": 0.5571, "step": 550 }, { "epoch": 2.01, "grad_norm": 4.2505059242248535, "learning_rate": 8e-05, "loss": 0.7184, "step": 560 }, { "epoch": 2.04, "grad_norm": 1.9729084968566895, "learning_rate": 7.964157706093191e-05, "loss": 0.5643, "step": 570 }, { "epoch": 2.08, "grad_norm": 2.607386589050293, "learning_rate": 7.92831541218638e-05, "loss": 0.4484, "step": 580 }, { "epoch": 2.11, "grad_norm": 4.689638614654541, "learning_rate": 7.892473118279571e-05, "loss": 0.5449, "step": 590 }, { "epoch": 2.15, "grad_norm": 6.3727707862854, "learning_rate": 7.85663082437276e-05, "loss": 0.5218, "step": 600 }, { "epoch": 2.15, "eval_accuracy": 0.8343558282208589, "eval_loss": 0.5389257073402405, "eval_runtime": 13.1796, "eval_samples_per_second": 74.205, "eval_steps_per_second": 9.333, "step": 600 }, { "epoch": 2.19, "grad_norm": 6.6133832931518555, "learning_rate": 7.82078853046595e-05, "loss": 0.5242, "step": 610 }, { "epoch": 2.22, "grad_norm": 6.967311382293701, "learning_rate": 7.784946236559139e-05, "loss": 0.5191, "step": 620 }, { "epoch": 2.26, "grad_norm": 6.818421363830566, "learning_rate": 7.74910394265233e-05, "loss": 0.4675, "step": 630 }, { "epoch": 2.29, "grad_norm": 3.4351966381073, "learning_rate": 7.71326164874552e-05, "loss": 0.538, "step": 640 }, { "epoch": 2.33, "grad_norm": 5.848361968994141, "learning_rate": 7.67741935483871e-05, "loss": 0.513, "step": 650 }, { "epoch": 2.37, "grad_norm": 3.990499496459961, "learning_rate": 7.6415770609319e-05, "loss": 0.5287, "step": 660 }, { "epoch": 2.4, "grad_norm": 3.2983336448669434, "learning_rate": 7.60573476702509e-05, "loss": 0.5794, "step": 670 }, { "epoch": 2.44, "grad_norm": 2.981754779815674, "learning_rate": 7.56989247311828e-05, "loss": 0.3556, "step": 680 }, { "epoch": 2.47, "grad_norm": 7.759634494781494, "learning_rate": 7.53405017921147e-05, "loss": 0.5055, "step": 690 }, { "epoch": 2.51, "grad_norm": 7.100767135620117, "learning_rate": 7.49820788530466e-05, "loss": 0.5365, "step": 700 }, { "epoch": 2.51, "eval_accuracy": 0.8466257668711656, "eval_loss": 0.5034566521644592, "eval_runtime": 13.0487, "eval_samples_per_second": 74.95, "eval_steps_per_second": 9.426, "step": 700 }, { "epoch": 2.54, "grad_norm": 3.7873051166534424, "learning_rate": 7.462365591397849e-05, "loss": 0.7049, "step": 710 }, { "epoch": 2.58, "grad_norm": 4.464657783508301, "learning_rate": 7.42652329749104e-05, "loss": 0.6012, "step": 720 }, { "epoch": 2.62, "grad_norm": 6.518985271453857, "learning_rate": 7.39426523297491e-05, "loss": 0.6301, "step": 730 }, { "epoch": 2.65, "grad_norm": 4.162339210510254, "learning_rate": 7.3584229390681e-05, "loss": 0.4366, "step": 740 }, { "epoch": 2.69, "grad_norm": 5.646885395050049, "learning_rate": 7.32258064516129e-05, "loss": 0.6134, "step": 750 }, { "epoch": 2.72, "grad_norm": 8.28348445892334, "learning_rate": 7.28673835125448e-05, "loss": 0.4007, "step": 760 }, { "epoch": 2.76, "grad_norm": 2.606187105178833, "learning_rate": 7.250896057347671e-05, "loss": 0.5909, "step": 770 }, { "epoch": 2.8, "grad_norm": 8.181116104125977, "learning_rate": 7.215053763440861e-05, "loss": 0.5286, "step": 780 }, { "epoch": 2.83, "grad_norm": 5.969821929931641, "learning_rate": 7.179211469534051e-05, "loss": 0.4924, "step": 790 }, { "epoch": 2.87, "grad_norm": 5.566159248352051, "learning_rate": 7.14336917562724e-05, "loss": 0.5564, "step": 800 }, { "epoch": 2.87, "eval_accuracy": 0.8476482617586912, "eval_loss": 0.48173415660858154, "eval_runtime": 13.1664, "eval_samples_per_second": 74.28, "eval_steps_per_second": 9.342, "step": 800 }, { "epoch": 2.9, "grad_norm": 6.260211944580078, "learning_rate": 7.10752688172043e-05, "loss": 0.3901, "step": 810 }, { "epoch": 2.94, "grad_norm": 5.212798118591309, "learning_rate": 7.07168458781362e-05, "loss": 0.3447, "step": 820 }, { "epoch": 2.97, "grad_norm": 3.084397554397583, "learning_rate": 7.03584229390681e-05, "loss": 0.5139, "step": 830 }, { "epoch": 3.01, "grad_norm": 4.144620418548584, "learning_rate": 7e-05, "loss": 0.5502, "step": 840 }, { "epoch": 3.05, "grad_norm": 4.02711296081543, "learning_rate": 6.964157706093191e-05, "loss": 0.3868, "step": 850 }, { "epoch": 3.08, "grad_norm": 4.601646900177002, "learning_rate": 6.92831541218638e-05, "loss": 0.5119, "step": 860 }, { "epoch": 3.12, "grad_norm": 7.631004810333252, "learning_rate": 6.89247311827957e-05, "loss": 0.4146, "step": 870 }, { "epoch": 3.15, "grad_norm": 3.341120719909668, "learning_rate": 6.85663082437276e-05, "loss": 0.4403, "step": 880 }, { "epoch": 3.19, "grad_norm": 3.3383867740631104, "learning_rate": 6.82078853046595e-05, "loss": 0.4715, "step": 890 }, { "epoch": 3.23, "grad_norm": 5.930158615112305, "learning_rate": 6.78494623655914e-05, "loss": 0.4597, "step": 900 }, { "epoch": 3.23, "eval_accuracy": 0.8599182004089979, "eval_loss": 0.46898409724235535, "eval_runtime": 13.1156, "eval_samples_per_second": 74.568, "eval_steps_per_second": 9.378, "step": 900 }, { "epoch": 3.26, "grad_norm": 8.335565567016602, "learning_rate": 6.74910394265233e-05, "loss": 0.3672, "step": 910 }, { "epoch": 3.3, "grad_norm": 4.635217189788818, "learning_rate": 6.713261648745521e-05, "loss": 0.4008, "step": 920 }, { "epoch": 3.33, "grad_norm": 6.6547322273254395, "learning_rate": 6.67741935483871e-05, "loss": 0.4973, "step": 930 }, { "epoch": 3.37, "grad_norm": 2.2684273719787598, "learning_rate": 6.6415770609319e-05, "loss": 0.3125, "step": 940 }, { "epoch": 3.41, "grad_norm": 6.180244445800781, "learning_rate": 6.60573476702509e-05, "loss": 0.4356, "step": 950 }, { "epoch": 3.44, "grad_norm": 3.5774285793304443, "learning_rate": 6.56989247311828e-05, "loss": 0.4316, "step": 960 }, { "epoch": 3.48, "grad_norm": 4.347971439361572, "learning_rate": 6.534050179211469e-05, "loss": 0.4942, "step": 970 }, { "epoch": 3.51, "grad_norm": 1.7022749185562134, "learning_rate": 6.49820788530466e-05, "loss": 0.5551, "step": 980 }, { "epoch": 3.55, "grad_norm": 5.636997699737549, "learning_rate": 6.46236559139785e-05, "loss": 0.4212, "step": 990 }, { "epoch": 3.58, "grad_norm": 1.4300520420074463, "learning_rate": 6.42652329749104e-05, "loss": 0.3516, "step": 1000 }, { "epoch": 3.58, "eval_accuracy": 0.8507157464212679, "eval_loss": 0.4920569360256195, "eval_runtime": 13.0448, "eval_samples_per_second": 74.972, "eval_steps_per_second": 9.429, "step": 1000 }, { "epoch": 3.62, "grad_norm": 3.524423837661743, "learning_rate": 6.39068100358423e-05, "loss": 0.4023, "step": 1010 }, { "epoch": 3.66, "grad_norm": 7.19805383682251, "learning_rate": 6.35483870967742e-05, "loss": 0.3299, "step": 1020 }, { "epoch": 3.69, "grad_norm": 5.69304084777832, "learning_rate": 6.318996415770609e-05, "loss": 0.4332, "step": 1030 }, { "epoch": 3.73, "grad_norm": 5.353736400604248, "learning_rate": 6.283154121863799e-05, "loss": 0.4798, "step": 1040 }, { "epoch": 3.76, "grad_norm": 5.782208442687988, "learning_rate": 6.247311827956989e-05, "loss": 0.4301, "step": 1050 }, { "epoch": 3.8, "grad_norm": 5.975741386413574, "learning_rate": 6.211469534050179e-05, "loss": 0.3599, "step": 1060 }, { "epoch": 3.84, "grad_norm": 6.242609977722168, "learning_rate": 6.17562724014337e-05, "loss": 0.4275, "step": 1070 }, { "epoch": 3.87, "grad_norm": 5.965251922607422, "learning_rate": 6.13978494623656e-05, "loss": 0.5298, "step": 1080 }, { "epoch": 3.91, "grad_norm": 9.964472770690918, "learning_rate": 6.10394265232975e-05, "loss": 0.479, "step": 1090 }, { "epoch": 3.94, "grad_norm": 2.9509897232055664, "learning_rate": 6.068100358422939e-05, "loss": 0.3741, "step": 1100 }, { "epoch": 3.94, "eval_accuracy": 0.8568507157464212, "eval_loss": 0.4820023775100708, "eval_runtime": 12.9964, "eval_samples_per_second": 75.252, "eval_steps_per_second": 9.464, "step": 1100 }, { "epoch": 3.98, "grad_norm": 4.46811056137085, "learning_rate": 6.0322580645161295e-05, "loss": 0.2701, "step": 1110 }, { "epoch": 4.01, "grad_norm": 3.02339506149292, "learning_rate": 5.996415770609319e-05, "loss": 0.3165, "step": 1120 }, { "epoch": 4.05, "grad_norm": 4.451502323150635, "learning_rate": 5.960573476702509e-05, "loss": 0.4528, "step": 1130 }, { "epoch": 4.09, "grad_norm": 3.656177520751953, "learning_rate": 5.9247311827956994e-05, "loss": 0.3612, "step": 1140 }, { "epoch": 4.12, "grad_norm": 5.621472358703613, "learning_rate": 5.8888888888888896e-05, "loss": 0.3641, "step": 1150 }, { "epoch": 4.16, "grad_norm": 10.402291297912598, "learning_rate": 5.853046594982079e-05, "loss": 0.4598, "step": 1160 }, { "epoch": 4.19, "grad_norm": 5.005283355712891, "learning_rate": 5.8172043010752686e-05, "loss": 0.3746, "step": 1170 }, { "epoch": 4.23, "grad_norm": 6.668752670288086, "learning_rate": 5.7813620071684594e-05, "loss": 0.416, "step": 1180 }, { "epoch": 4.27, "grad_norm": 6.447160720825195, "learning_rate": 5.745519713261649e-05, "loss": 0.3504, "step": 1190 }, { "epoch": 4.3, "grad_norm": 8.023221969604492, "learning_rate": 5.7096774193548384e-05, "loss": 0.4169, "step": 1200 }, { "epoch": 4.3, "eval_accuracy": 0.852760736196319, "eval_loss": 0.47300344705581665, "eval_runtime": 13.1292, "eval_samples_per_second": 74.491, "eval_steps_per_second": 9.368, "step": 1200 }, { "epoch": 4.34, "grad_norm": 2.819065570831299, "learning_rate": 5.673835125448029e-05, "loss": 0.3808, "step": 1210 }, { "epoch": 4.37, "grad_norm": 10.216522216796875, "learning_rate": 5.637992831541219e-05, "loss": 0.3039, "step": 1220 }, { "epoch": 4.41, "grad_norm": 4.38183069229126, "learning_rate": 5.602150537634408e-05, "loss": 0.3508, "step": 1230 }, { "epoch": 4.44, "grad_norm": 3.4877617359161377, "learning_rate": 5.566308243727599e-05, "loss": 0.2875, "step": 1240 }, { "epoch": 4.48, "grad_norm": 5.170544624328613, "learning_rate": 5.530465949820789e-05, "loss": 0.2425, "step": 1250 }, { "epoch": 4.52, "grad_norm": 1.037807583808899, "learning_rate": 5.494623655913979e-05, "loss": 0.2618, "step": 1260 }, { "epoch": 4.55, "grad_norm": 1.1233432292938232, "learning_rate": 5.458781362007169e-05, "loss": 0.4105, "step": 1270 }, { "epoch": 4.59, "grad_norm": 6.105014801025391, "learning_rate": 5.422939068100359e-05, "loss": 0.3639, "step": 1280 }, { "epoch": 4.62, "grad_norm": 5.100095272064209, "learning_rate": 5.387096774193549e-05, "loss": 0.4289, "step": 1290 }, { "epoch": 4.66, "grad_norm": 4.874768257141113, "learning_rate": 5.351254480286738e-05, "loss": 0.5131, "step": 1300 }, { "epoch": 4.66, "eval_accuracy": 0.8680981595092024, "eval_loss": 0.4481976628303528, "eval_runtime": 12.8876, "eval_samples_per_second": 75.887, "eval_steps_per_second": 9.544, "step": 1300 }, { "epoch": 4.7, "grad_norm": 3.7920286655426025, "learning_rate": 5.315412186379929e-05, "loss": 0.3811, "step": 1310 }, { "epoch": 4.73, "grad_norm": 4.628345489501953, "learning_rate": 5.2795698924731186e-05, "loss": 0.3271, "step": 1320 }, { "epoch": 4.77, "grad_norm": 7.546787261962891, "learning_rate": 5.243727598566308e-05, "loss": 0.4059, "step": 1330 }, { "epoch": 4.8, "grad_norm": 3.106943130493164, "learning_rate": 5.207885304659499e-05, "loss": 0.2255, "step": 1340 }, { "epoch": 4.84, "grad_norm": 7.233306407928467, "learning_rate": 5.1720430107526885e-05, "loss": 0.299, "step": 1350 }, { "epoch": 4.87, "grad_norm": 8.275094032287598, "learning_rate": 5.136200716845878e-05, "loss": 0.387, "step": 1360 }, { "epoch": 4.91, "grad_norm": 4.1648640632629395, "learning_rate": 5.100358422939069e-05, "loss": 0.2891, "step": 1370 }, { "epoch": 4.95, "grad_norm": 2.7225232124328613, "learning_rate": 5.064516129032258e-05, "loss": 0.3828, "step": 1380 }, { "epoch": 4.98, "grad_norm": 4.712428092956543, "learning_rate": 5.028673835125448e-05, "loss": 0.3843, "step": 1390 }, { "epoch": 5.02, "grad_norm": 7.5443644523620605, "learning_rate": 4.992831541218638e-05, "loss": 0.3673, "step": 1400 }, { "epoch": 5.02, "eval_accuracy": 0.8752556237218814, "eval_loss": 0.42281925678253174, "eval_runtime": 14.3907, "eval_samples_per_second": 67.961, "eval_steps_per_second": 8.547, "step": 1400 }, { "epoch": 5.05, "grad_norm": 2.7451975345611572, "learning_rate": 4.956989247311828e-05, "loss": 0.3115, "step": 1410 }, { "epoch": 5.09, "grad_norm": 0.5151819586753845, "learning_rate": 4.9211469534050184e-05, "loss": 0.3281, "step": 1420 }, { "epoch": 5.13, "grad_norm": 7.1587419509887695, "learning_rate": 4.8853046594982085e-05, "loss": 0.2715, "step": 1430 }, { "epoch": 5.16, "grad_norm": 6.464181423187256, "learning_rate": 4.849462365591398e-05, "loss": 0.2884, "step": 1440 }, { "epoch": 5.2, "grad_norm": 6.002290725708008, "learning_rate": 4.813620071684588e-05, "loss": 0.3214, "step": 1450 }, { "epoch": 5.23, "grad_norm": 7.911471366882324, "learning_rate": 4.7777777777777784e-05, "loss": 0.3749, "step": 1460 }, { "epoch": 5.27, "grad_norm": 6.183435440063477, "learning_rate": 4.741935483870968e-05, "loss": 0.4746, "step": 1470 }, { "epoch": 5.3, "grad_norm": 2.4367499351501465, "learning_rate": 4.706093189964158e-05, "loss": 0.2526, "step": 1480 }, { "epoch": 5.34, "grad_norm": 3.299232006072998, "learning_rate": 4.6702508960573476e-05, "loss": 0.3413, "step": 1490 }, { "epoch": 5.38, "grad_norm": 5.005585193634033, "learning_rate": 4.634408602150538e-05, "loss": 0.3722, "step": 1500 }, { "epoch": 5.38, "eval_accuracy": 0.8599182004089979, "eval_loss": 0.4820214807987213, "eval_runtime": 13.0602, "eval_samples_per_second": 74.884, "eval_steps_per_second": 9.418, "step": 1500 }, { "epoch": 5.41, "grad_norm": 4.578291416168213, "learning_rate": 4.598566308243728e-05, "loss": 0.2471, "step": 1510 }, { "epoch": 5.45, "grad_norm": 4.164848327636719, "learning_rate": 4.5627240143369175e-05, "loss": 0.2803, "step": 1520 }, { "epoch": 5.48, "grad_norm": 3.4090826511383057, "learning_rate": 4.5268817204301076e-05, "loss": 0.3843, "step": 1530 }, { "epoch": 5.52, "grad_norm": 3.8364851474761963, "learning_rate": 4.491039426523297e-05, "loss": 0.348, "step": 1540 }, { "epoch": 5.56, "grad_norm": 0.9995656609535217, "learning_rate": 4.455197132616487e-05, "loss": 0.2812, "step": 1550 }, { "epoch": 5.59, "grad_norm": 0.11745692044496536, "learning_rate": 4.4193548387096775e-05, "loss": 0.2775, "step": 1560 }, { "epoch": 5.63, "grad_norm": 8.940442085266113, "learning_rate": 4.383512544802868e-05, "loss": 0.3364, "step": 1570 }, { "epoch": 5.66, "grad_norm": 3.694096088409424, "learning_rate": 4.347670250896058e-05, "loss": 0.3487, "step": 1580 }, { "epoch": 5.7, "grad_norm": 5.201242446899414, "learning_rate": 4.3118279569892474e-05, "loss": 0.4002, "step": 1590 }, { "epoch": 5.73, "grad_norm": 1.3319661617279053, "learning_rate": 4.2759856630824376e-05, "loss": 0.3005, "step": 1600 }, { "epoch": 5.73, "eval_accuracy": 0.8732106339468303, "eval_loss": 0.4456250071525574, "eval_runtime": 13.0438, "eval_samples_per_second": 74.978, "eval_steps_per_second": 9.43, "step": 1600 }, { "epoch": 5.77, "grad_norm": 1.4663386344909668, "learning_rate": 4.240143369175628e-05, "loss": 0.3404, "step": 1610 }, { "epoch": 5.81, "grad_norm": 2.7660508155822754, "learning_rate": 4.204301075268817e-05, "loss": 0.3851, "step": 1620 }, { "epoch": 5.84, "grad_norm": 3.7340736389160156, "learning_rate": 4.1684587813620074e-05, "loss": 0.4027, "step": 1630 }, { "epoch": 5.88, "grad_norm": 3.9350473880767822, "learning_rate": 4.1326164874551976e-05, "loss": 0.2706, "step": 1640 }, { "epoch": 5.91, "grad_norm": 6.938910961151123, "learning_rate": 4.096774193548387e-05, "loss": 0.3173, "step": 1650 }, { "epoch": 5.95, "grad_norm": 2.325490713119507, "learning_rate": 4.060931899641577e-05, "loss": 0.2399, "step": 1660 }, { "epoch": 5.99, "grad_norm": 4.819274425506592, "learning_rate": 4.025089605734767e-05, "loss": 0.2409, "step": 1670 }, { "epoch": 6.02, "grad_norm": 1.799713134765625, "learning_rate": 3.989247311827957e-05, "loss": 0.3328, "step": 1680 }, { "epoch": 6.06, "grad_norm": 3.0742380619049072, "learning_rate": 3.953405017921147e-05, "loss": 0.3429, "step": 1690 }, { "epoch": 6.09, "grad_norm": 6.019571304321289, "learning_rate": 3.9175627240143367e-05, "loss": 0.2183, "step": 1700 }, { "epoch": 6.09, "eval_accuracy": 0.8752556237218814, "eval_loss": 0.42549625039100647, "eval_runtime": 13.4069, "eval_samples_per_second": 72.947, "eval_steps_per_second": 9.174, "step": 1700 }, { "epoch": 6.13, "grad_norm": 3.634247303009033, "learning_rate": 3.881720430107527e-05, "loss": 0.2822, "step": 1710 }, { "epoch": 6.16, "grad_norm": 1.7334699630737305, "learning_rate": 3.845878136200717e-05, "loss": 0.342, "step": 1720 }, { "epoch": 6.2, "grad_norm": 2.8848531246185303, "learning_rate": 3.810035842293907e-05, "loss": 0.2375, "step": 1730 }, { "epoch": 6.24, "grad_norm": 8.433989524841309, "learning_rate": 3.7741935483870974e-05, "loss": 0.3862, "step": 1740 }, { "epoch": 6.27, "grad_norm": 5.4030022621154785, "learning_rate": 3.738351254480287e-05, "loss": 0.2464, "step": 1750 }, { "epoch": 6.31, "grad_norm": 0.6388465762138367, "learning_rate": 3.702508960573477e-05, "loss": 0.2487, "step": 1760 }, { "epoch": 6.34, "grad_norm": 2.5588126182556152, "learning_rate": 3.6666666666666666e-05, "loss": 0.315, "step": 1770 }, { "epoch": 6.38, "grad_norm": 2.1331589221954346, "learning_rate": 3.630824372759857e-05, "loss": 0.3208, "step": 1780 }, { "epoch": 6.42, "grad_norm": 5.283192157745361, "learning_rate": 3.594982078853047e-05, "loss": 0.231, "step": 1790 }, { "epoch": 6.45, "grad_norm": 1.1380605697631836, "learning_rate": 3.5591397849462364e-05, "loss": 0.2546, "step": 1800 }, { "epoch": 6.45, "eval_accuracy": 0.8680981595092024, "eval_loss": 0.4643884301185608, "eval_runtime": 13.3117, "eval_samples_per_second": 73.469, "eval_steps_per_second": 9.24, "step": 1800 }, { "epoch": 6.49, "grad_norm": 6.120726585388184, "learning_rate": 3.5232974910394266e-05, "loss": 0.3022, "step": 1810 }, { "epoch": 6.52, "grad_norm": 4.132528305053711, "learning_rate": 3.487455197132617e-05, "loss": 0.2711, "step": 1820 }, { "epoch": 6.56, "grad_norm": 0.2836366295814514, "learning_rate": 3.451612903225806e-05, "loss": 0.2759, "step": 1830 }, { "epoch": 6.59, "grad_norm": 4.501558303833008, "learning_rate": 3.4157706093189965e-05, "loss": 0.2287, "step": 1840 }, { "epoch": 6.63, "grad_norm": 7.157699108123779, "learning_rate": 3.379928315412187e-05, "loss": 0.1862, "step": 1850 }, { "epoch": 6.67, "grad_norm": 6.248540878295898, "learning_rate": 3.344086021505377e-05, "loss": 0.3247, "step": 1860 }, { "epoch": 6.7, "grad_norm": 4.108403205871582, "learning_rate": 3.308243727598567e-05, "loss": 0.3535, "step": 1870 }, { "epoch": 6.74, "grad_norm": 4.134276866912842, "learning_rate": 3.2724014336917565e-05, "loss": 0.217, "step": 1880 }, { "epoch": 6.77, "grad_norm": 2.2216591835021973, "learning_rate": 3.236559139784947e-05, "loss": 0.1901, "step": 1890 }, { "epoch": 6.81, "grad_norm": 3.7293379306793213, "learning_rate": 3.200716845878136e-05, "loss": 0.3798, "step": 1900 }, { "epoch": 6.81, "eval_accuracy": 0.852760736196319, "eval_loss": 0.4917815029621124, "eval_runtime": 13.2098, "eval_samples_per_second": 74.036, "eval_steps_per_second": 9.311, "step": 1900 }, { "epoch": 6.85, "grad_norm": 9.140816688537598, "learning_rate": 3.1648745519713264e-05, "loss": 0.3056, "step": 1910 }, { "epoch": 6.88, "grad_norm": 5.005599498748779, "learning_rate": 3.1290322580645166e-05, "loss": 0.144, "step": 1920 }, { "epoch": 6.92, "grad_norm": 8.45984172821045, "learning_rate": 3.093189964157706e-05, "loss": 0.2561, "step": 1930 }, { "epoch": 6.95, "grad_norm": 9.944540023803711, "learning_rate": 3.057347670250896e-05, "loss": 0.3116, "step": 1940 }, { "epoch": 6.99, "grad_norm": 5.759355545043945, "learning_rate": 3.0215053763440858e-05, "loss": 0.3236, "step": 1950 }, { "epoch": 7.03, "grad_norm": 3.6583213806152344, "learning_rate": 2.985663082437276e-05, "loss": 0.2651, "step": 1960 }, { "epoch": 7.06, "grad_norm": 3.7694921493530273, "learning_rate": 2.949820788530466e-05, "loss": 0.273, "step": 1970 }, { "epoch": 7.1, "grad_norm": 7.003334045410156, "learning_rate": 2.913978494623656e-05, "loss": 0.2574, "step": 1980 }, { "epoch": 7.13, "grad_norm": 5.475174427032471, "learning_rate": 2.878136200716846e-05, "loss": 0.1923, "step": 1990 }, { "epoch": 7.17, "grad_norm": 4.163592338562012, "learning_rate": 2.8422939068100357e-05, "loss": 0.2851, "step": 2000 }, { "epoch": 7.17, "eval_accuracy": 0.8660531697341514, "eval_loss": 0.45738592743873596, "eval_runtime": 13.0257, "eval_samples_per_second": 75.082, "eval_steps_per_second": 9.443, "step": 2000 }, { "epoch": 7.2, "grad_norm": 1.4400478601455688, "learning_rate": 2.806451612903226e-05, "loss": 0.157, "step": 2010 }, { "epoch": 7.24, "grad_norm": 3.958745241165161, "learning_rate": 2.770609318996416e-05, "loss": 0.2556, "step": 2020 }, { "epoch": 7.28, "grad_norm": 2.083286762237549, "learning_rate": 2.734767025089606e-05, "loss": 0.2341, "step": 2030 }, { "epoch": 7.31, "grad_norm": 9.330533027648926, "learning_rate": 2.698924731182796e-05, "loss": 0.2742, "step": 2040 }, { "epoch": 7.35, "grad_norm": 1.6954762935638428, "learning_rate": 2.6630824372759862e-05, "loss": 0.1567, "step": 2050 }, { "epoch": 7.38, "grad_norm": 12.555899620056152, "learning_rate": 2.6272401433691757e-05, "loss": 0.1988, "step": 2060 }, { "epoch": 7.42, "grad_norm": 2.7985002994537354, "learning_rate": 2.591397849462366e-05, "loss": 0.2954, "step": 2070 }, { "epoch": 7.46, "grad_norm": 3.0381124019622803, "learning_rate": 2.5555555555555554e-05, "loss": 0.2345, "step": 2080 }, { "epoch": 7.49, "grad_norm": 4.35617208480835, "learning_rate": 2.5197132616487456e-05, "loss": 0.2736, "step": 2090 }, { "epoch": 7.53, "grad_norm": 7.855186939239502, "learning_rate": 2.4838709677419354e-05, "loss": 0.3897, "step": 2100 }, { "epoch": 7.53, "eval_accuracy": 0.8650306748466258, "eval_loss": 0.47503402829170227, "eval_runtime": 13.2431, "eval_samples_per_second": 73.85, "eval_steps_per_second": 9.288, "step": 2100 }, { "epoch": 7.56, "grad_norm": 8.723631858825684, "learning_rate": 2.4480286738351256e-05, "loss": 0.2814, "step": 2110 }, { "epoch": 7.6, "grad_norm": 8.104424476623535, "learning_rate": 2.4121863799283158e-05, "loss": 0.345, "step": 2120 }, { "epoch": 7.63, "grad_norm": 2.0964772701263428, "learning_rate": 2.3763440860215056e-05, "loss": 0.1643, "step": 2130 }, { "epoch": 7.67, "grad_norm": 6.627722263336182, "learning_rate": 2.3405017921146955e-05, "loss": 0.2667, "step": 2140 }, { "epoch": 7.71, "grad_norm": 8.275358200073242, "learning_rate": 2.3046594982078853e-05, "loss": 0.1747, "step": 2150 }, { "epoch": 7.74, "grad_norm": 2.7480711936950684, "learning_rate": 2.268817204301075e-05, "loss": 0.2789, "step": 2160 }, { "epoch": 7.78, "grad_norm": 4.440505504608154, "learning_rate": 2.2329749103942653e-05, "loss": 0.244, "step": 2170 }, { "epoch": 7.81, "grad_norm": 3.1027097702026367, "learning_rate": 2.1971326164874552e-05, "loss": 0.1924, "step": 2180 }, { "epoch": 7.85, "grad_norm": 0.49080890417099, "learning_rate": 2.1612903225806454e-05, "loss": 0.1648, "step": 2190 }, { "epoch": 7.89, "grad_norm": 7.749617099761963, "learning_rate": 2.1254480286738352e-05, "loss": 0.2879, "step": 2200 }, { "epoch": 7.89, "eval_accuracy": 0.8640081799591002, "eval_loss": 0.4739380478858948, "eval_runtime": 13.2662, "eval_samples_per_second": 73.721, "eval_steps_per_second": 9.272, "step": 2200 }, { "epoch": 7.92, "grad_norm": 4.843421459197998, "learning_rate": 2.0896057347670254e-05, "loss": 0.1315, "step": 2210 }, { "epoch": 7.96, "grad_norm": 11.301743507385254, "learning_rate": 2.0537634408602152e-05, "loss": 0.2476, "step": 2220 }, { "epoch": 7.99, "grad_norm": 5.654230117797852, "learning_rate": 2.017921146953405e-05, "loss": 0.2347, "step": 2230 }, { "epoch": 8.03, "grad_norm": 1.8168110847473145, "learning_rate": 1.982078853046595e-05, "loss": 0.2233, "step": 2240 }, { "epoch": 8.06, "grad_norm": 6.872961521148682, "learning_rate": 1.9462365591397848e-05, "loss": 0.295, "step": 2250 }, { "epoch": 8.1, "grad_norm": 1.988438606262207, "learning_rate": 1.910394265232975e-05, "loss": 0.2193, "step": 2260 }, { "epoch": 8.14, "grad_norm": 2.3927271366119385, "learning_rate": 1.874551971326165e-05, "loss": 0.1755, "step": 2270 }, { "epoch": 8.17, "grad_norm": 9.944202423095703, "learning_rate": 1.838709677419355e-05, "loss": 0.3156, "step": 2280 }, { "epoch": 8.21, "grad_norm": 1.3722007274627686, "learning_rate": 1.8028673835125448e-05, "loss": 0.1877, "step": 2290 }, { "epoch": 8.24, "grad_norm": 2.698289394378662, "learning_rate": 1.767025089605735e-05, "loss": 0.1619, "step": 2300 }, { "epoch": 8.24, "eval_accuracy": 0.8701431492842536, "eval_loss": 0.46446332335472107, "eval_runtime": 13.2992, "eval_samples_per_second": 73.538, "eval_steps_per_second": 9.249, "step": 2300 }, { "epoch": 8.28, "grad_norm": 4.195601463317871, "learning_rate": 1.7311827956989248e-05, "loss": 0.1302, "step": 2310 }, { "epoch": 8.32, "grad_norm": 8.367218971252441, "learning_rate": 1.6953405017921147e-05, "loss": 0.2366, "step": 2320 }, { "epoch": 8.35, "grad_norm": 1.637813925743103, "learning_rate": 1.659498207885305e-05, "loss": 0.1395, "step": 2330 }, { "epoch": 8.39, "grad_norm": 7.9814677238464355, "learning_rate": 1.6236559139784947e-05, "loss": 0.2804, "step": 2340 }, { "epoch": 8.42, "grad_norm": 0.23666299879550934, "learning_rate": 1.587813620071685e-05, "loss": 0.2186, "step": 2350 }, { "epoch": 8.46, "grad_norm": 7.456205368041992, "learning_rate": 1.5519713261648747e-05, "loss": 0.2733, "step": 2360 }, { "epoch": 8.49, "grad_norm": 3.73573899269104, "learning_rate": 1.5161290322580646e-05, "loss": 0.2891, "step": 2370 }, { "epoch": 8.53, "grad_norm": 1.723080039024353, "learning_rate": 1.4802867383512544e-05, "loss": 0.2959, "step": 2380 }, { "epoch": 8.57, "grad_norm": 2.3597002029418945, "learning_rate": 1.4444444444444444e-05, "loss": 0.2784, "step": 2390 }, { "epoch": 8.6, "grad_norm": 0.19159385561943054, "learning_rate": 1.4086021505376346e-05, "loss": 0.1791, "step": 2400 }, { "epoch": 8.6, "eval_accuracy": 0.8670756646216768, "eval_loss": 0.46364837884902954, "eval_runtime": 13.0224, "eval_samples_per_second": 75.101, "eval_steps_per_second": 9.445, "step": 2400 }, { "epoch": 8.64, "grad_norm": 8.25283145904541, "learning_rate": 1.3727598566308244e-05, "loss": 0.2331, "step": 2410 }, { "epoch": 8.67, "grad_norm": 4.816408157348633, "learning_rate": 1.3369175627240143e-05, "loss": 0.1704, "step": 2420 }, { "epoch": 8.71, "grad_norm": 10.364704132080078, "learning_rate": 1.3010752688172043e-05, "loss": 0.2854, "step": 2430 }, { "epoch": 8.75, "grad_norm": 6.165342807769775, "learning_rate": 1.2652329749103945e-05, "loss": 0.1635, "step": 2440 }, { "epoch": 8.78, "grad_norm": 10.325529098510742, "learning_rate": 1.2293906810035843e-05, "loss": 0.2685, "step": 2450 }, { "epoch": 8.82, "grad_norm": 1.4126335382461548, "learning_rate": 1.1935483870967743e-05, "loss": 0.1575, "step": 2460 }, { "epoch": 8.85, "grad_norm": 8.191924095153809, "learning_rate": 1.1577060931899642e-05, "loss": 0.2585, "step": 2470 }, { "epoch": 8.89, "grad_norm": 6.928045272827148, "learning_rate": 1.1218637992831542e-05, "loss": 0.2129, "step": 2480 }, { "epoch": 8.92, "grad_norm": 2.3231565952301025, "learning_rate": 1.086021505376344e-05, "loss": 0.3085, "step": 2490 }, { "epoch": 8.96, "grad_norm": 2.7480709552764893, "learning_rate": 1.0501792114695342e-05, "loss": 0.1697, "step": 2500 }, { "epoch": 8.96, "eval_accuracy": 0.8640081799591002, "eval_loss": 0.4716458022594452, "eval_runtime": 13.2964, "eval_samples_per_second": 73.554, "eval_steps_per_second": 9.251, "step": 2500 }, { "epoch": 9.0, "grad_norm": 6.852436542510986, "learning_rate": 1.014336917562724e-05, "loss": 0.1856, "step": 2510 }, { "epoch": 9.03, "grad_norm": 5.506394863128662, "learning_rate": 9.78494623655914e-06, "loss": 0.1951, "step": 2520 }, { "epoch": 9.07, "grad_norm": 3.1270835399627686, "learning_rate": 9.426523297491039e-06, "loss": 0.1896, "step": 2530 }, { "epoch": 9.1, "grad_norm": 3.797590732574463, "learning_rate": 9.06810035842294e-06, "loss": 0.2276, "step": 2540 }, { "epoch": 9.14, "grad_norm": 4.3236985206604, "learning_rate": 8.70967741935484e-06, "loss": 0.2272, "step": 2550 }, { "epoch": 9.18, "grad_norm": 4.975371837615967, "learning_rate": 8.351254480286738e-06, "loss": 0.2491, "step": 2560 }, { "epoch": 9.21, "grad_norm": 1.405785083770752, "learning_rate": 7.992831541218638e-06, "loss": 0.1985, "step": 2570 }, { "epoch": 9.25, "grad_norm": 0.13112181425094604, "learning_rate": 7.634408602150538e-06, "loss": 0.1134, "step": 2580 }, { "epoch": 9.28, "grad_norm": 2.597059965133667, "learning_rate": 7.275985663082438e-06, "loss": 0.229, "step": 2590 }, { "epoch": 9.32, "grad_norm": 0.1491578221321106, "learning_rate": 6.917562724014337e-06, "loss": 0.1608, "step": 2600 }, { "epoch": 9.32, "eval_accuracy": 0.8742331288343558, "eval_loss": 0.4680761694908142, "eval_runtime": 13.2277, "eval_samples_per_second": 73.936, "eval_steps_per_second": 9.299, "step": 2600 }, { "epoch": 9.35, "grad_norm": 3.264333724975586, "learning_rate": 6.559139784946237e-06, "loss": 0.2228, "step": 2610 }, { "epoch": 9.39, "grad_norm": 3.251711130142212, "learning_rate": 6.200716845878137e-06, "loss": 0.1504, "step": 2620 }, { "epoch": 9.43, "grad_norm": 0.05231141671538353, "learning_rate": 5.842293906810036e-06, "loss": 0.1097, "step": 2630 }, { "epoch": 9.46, "grad_norm": 6.941389083862305, "learning_rate": 5.483870967741936e-06, "loss": 0.2046, "step": 2640 }, { "epoch": 9.5, "grad_norm": 1.9816231727600098, "learning_rate": 5.125448028673835e-06, "loss": 0.2879, "step": 2650 }, { "epoch": 9.53, "grad_norm": 1.5215052366256714, "learning_rate": 4.767025089605735e-06, "loss": 0.2352, "step": 2660 }, { "epoch": 9.57, "grad_norm": 0.055237527936697006, "learning_rate": 4.408602150537635e-06, "loss": 0.1954, "step": 2670 }, { "epoch": 9.61, "grad_norm": 4.320139408111572, "learning_rate": 4.050179211469534e-06, "loss": 0.3845, "step": 2680 }, { "epoch": 9.64, "grad_norm": 3.767547607421875, "learning_rate": 3.6917562724014336e-06, "loss": 0.3031, "step": 2690 }, { "epoch": 9.68, "grad_norm": 0.48145973682403564, "learning_rate": 3.3333333333333333e-06, "loss": 0.2105, "step": 2700 }, { "epoch": 9.68, "eval_accuracy": 0.8721881390593047, "eval_loss": 0.47190210223197937, "eval_runtime": 13.2161, "eval_samples_per_second": 74.001, "eval_steps_per_second": 9.307, "step": 2700 }, { "epoch": 9.71, "grad_norm": 1.6004977226257324, "learning_rate": 2.974910394265233e-06, "loss": 0.2186, "step": 2710 }, { "epoch": 9.75, "grad_norm": 0.4472026228904724, "learning_rate": 2.6164874551971327e-06, "loss": 0.1528, "step": 2720 }, { "epoch": 9.78, "grad_norm": 0.12501764297485352, "learning_rate": 2.2580645161290324e-06, "loss": 0.1218, "step": 2730 }, { "epoch": 9.82, "grad_norm": 2.206127405166626, "learning_rate": 1.8996415770609319e-06, "loss": 0.1324, "step": 2740 }, { "epoch": 9.86, "grad_norm": 0.9404085278511047, "learning_rate": 1.5412186379928316e-06, "loss": 0.2832, "step": 2750 }, { "epoch": 9.89, "grad_norm": 5.883946418762207, "learning_rate": 1.1827956989247313e-06, "loss": 0.2113, "step": 2760 }, { "epoch": 9.93, "grad_norm": 5.288418292999268, "learning_rate": 8.243727598566309e-07, "loss": 0.1662, "step": 2770 }, { "epoch": 9.96, "grad_norm": 8.01339054107666, "learning_rate": 4.6594982078853055e-07, "loss": 0.1672, "step": 2780 }, { "epoch": 10.0, "grad_norm": 7.813081741333008, "learning_rate": 1.0752688172043011e-07, "loss": 0.1825, "step": 2790 }, { "epoch": 10.0, "step": 2790, "total_flos": 3.451740694569861e+18, "train_loss": 0.45333294201922675, "train_runtime": 1524.4216, "train_samples_per_second": 29.218, "train_steps_per_second": 1.83 } ], "logging_steps": 10, "max_steps": 2790, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 3.451740694569861e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }