{ "best_metric": 0.8064516129032258, "best_model_checkpoint": "MAE-CT-M1N0-M12_v8_split2/checkpoint-504", "epoch": 98.0041935483871, "eval_steps": 500, "global_step": 6200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016129032258064516, "grad_norm": 1.6501898765563965, "learning_rate": 1.6129032258064518e-07, "loss": 0.6917, "step": 10 }, { "epoch": 0.0032258064516129032, "grad_norm": 3.3044791221618652, "learning_rate": 3.2258064516129035e-07, "loss": 0.6852, "step": 20 }, { "epoch": 0.004838709677419355, "grad_norm": 3.1445343494415283, "learning_rate": 4.838709677419355e-07, "loss": 0.6804, "step": 30 }, { "epoch": 0.0064516129032258064, "grad_norm": 3.941283702850342, "learning_rate": 6.451612903225807e-07, "loss": 0.6887, "step": 40 }, { "epoch": 0.008064516129032258, "grad_norm": 5.525125980377197, "learning_rate": 8.064516129032258e-07, "loss": 0.6614, "step": 50 }, { "epoch": 0.00967741935483871, "grad_norm": 2.9109671115875244, "learning_rate": 9.67741935483871e-07, "loss": 0.6375, "step": 60 }, { "epoch": 0.010161290322580644, "eval_accuracy": 0.5161290322580645, "eval_loss": 0.6993367075920105, "eval_runtime": 7.1963, "eval_samples_per_second": 4.308, "eval_steps_per_second": 1.112, "step": 63 }, { "epoch": 1.0011290322580646, "grad_norm": 5.106609344482422, "learning_rate": 1.1290322580645162e-06, "loss": 0.667, "step": 70 }, { "epoch": 1.002741935483871, "grad_norm": 4.405478477478027, "learning_rate": 1.2903225806451614e-06, "loss": 0.5354, "step": 80 }, { "epoch": 1.0043548387096775, "grad_norm": 7.3709821701049805, "learning_rate": 1.4516129032258066e-06, "loss": 0.6565, "step": 90 }, { "epoch": 1.0059677419354838, "grad_norm": 15.715628623962402, "learning_rate": 1.6129032258064516e-06, "loss": 0.5875, "step": 100 }, { "epoch": 1.0075806451612903, "grad_norm": 7.487520694732666, "learning_rate": 1.774193548387097e-06, "loss": 0.7175, "step": 110 }, { "epoch": 1.0091935483870969, "grad_norm": 6.219560146331787, "learning_rate": 1.935483870967742e-06, "loss": 0.7565, "step": 120 }, { "epoch": 1.0101612903225807, "eval_accuracy": 0.5161290322580645, "eval_loss": 0.7252107262611389, "eval_runtime": 6.0261, "eval_samples_per_second": 5.144, "eval_steps_per_second": 1.328, "step": 126 }, { "epoch": 2.0006451612903224, "grad_norm": 4.885463714599609, "learning_rate": 2.096774193548387e-06, "loss": 0.6332, "step": 130 }, { "epoch": 2.002258064516129, "grad_norm": 2.572324514389038, "learning_rate": 2.2580645161290324e-06, "loss": 0.5997, "step": 140 }, { "epoch": 2.0038709677419355, "grad_norm": 5.801942348480225, "learning_rate": 2.4193548387096776e-06, "loss": 0.6184, "step": 150 }, { "epoch": 2.005483870967742, "grad_norm": 4.886776447296143, "learning_rate": 2.580645161290323e-06, "loss": 0.6232, "step": 160 }, { "epoch": 2.0070967741935486, "grad_norm": 16.456005096435547, "learning_rate": 2.7419354838709676e-06, "loss": 0.63, "step": 170 }, { "epoch": 2.008709677419355, "grad_norm": 6.376218318939209, "learning_rate": 2.903225806451613e-06, "loss": 0.6926, "step": 180 }, { "epoch": 2.0101612903225807, "eval_accuracy": 0.5161290322580645, "eval_loss": 0.7296160459518433, "eval_runtime": 6.4589, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.239, "step": 189 }, { "epoch": 3.0001612903225805, "grad_norm": 29.707622528076172, "learning_rate": 3.0645161290322584e-06, "loss": 0.6007, "step": 190 }, { "epoch": 3.0017741935483873, "grad_norm": 12.88126277923584, "learning_rate": 3.225806451612903e-06, "loss": 0.4823, "step": 200 }, { "epoch": 3.0033870967741936, "grad_norm": 5.976855278015137, "learning_rate": 3.3870967741935484e-06, "loss": 0.6823, "step": 210 }, { "epoch": 3.005, "grad_norm": 12.777349472045898, "learning_rate": 3.548387096774194e-06, "loss": 0.582, "step": 220 }, { "epoch": 3.0066129032258067, "grad_norm": 15.71567153930664, "learning_rate": 3.7096774193548392e-06, "loss": 0.4713, "step": 230 }, { "epoch": 3.008225806451613, "grad_norm": 8.446873664855957, "learning_rate": 3.870967741935484e-06, "loss": 0.9479, "step": 240 }, { "epoch": 3.0098387096774193, "grad_norm": 6.00430965423584, "learning_rate": 4.032258064516129e-06, "loss": 0.5636, "step": 250 }, { "epoch": 3.0101612903225807, "eval_accuracy": 0.5161290322580645, "eval_loss": 0.7618242502212524, "eval_runtime": 6.2452, "eval_samples_per_second": 4.964, "eval_steps_per_second": 1.281, "step": 252 }, { "epoch": 4.001290322580645, "grad_norm": 23.249408721923828, "learning_rate": 4.193548387096774e-06, "loss": 0.7424, "step": 260 }, { "epoch": 4.002903225806452, "grad_norm": 4.971527099609375, "learning_rate": 4.35483870967742e-06, "loss": 0.6502, "step": 270 }, { "epoch": 4.004516129032258, "grad_norm": 8.129631996154785, "learning_rate": 4.516129032258065e-06, "loss": 0.5874, "step": 280 }, { "epoch": 4.006129032258064, "grad_norm": 11.48269271850586, "learning_rate": 4.67741935483871e-06, "loss": 0.5428, "step": 290 }, { "epoch": 4.007741935483871, "grad_norm": 11.83126163482666, "learning_rate": 4.838709677419355e-06, "loss": 0.5767, "step": 300 }, { "epoch": 4.009354838709678, "grad_norm": 6.538477420806885, "learning_rate": 5e-06, "loss": 0.4721, "step": 310 }, { "epoch": 4.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 1.2407073974609375, "eval_runtime": 6.0698, "eval_samples_per_second": 5.107, "eval_steps_per_second": 1.318, "step": 315 }, { "epoch": 5.000806451612903, "grad_norm": 13.179779052734375, "learning_rate": 5.161290322580646e-06, "loss": 0.6142, "step": 320 }, { "epoch": 5.002419354838709, "grad_norm": 7.851053714752197, "learning_rate": 5.322580645161291e-06, "loss": 0.5494, "step": 330 }, { "epoch": 5.004032258064516, "grad_norm": 13.048491477966309, "learning_rate": 5.483870967741935e-06, "loss": 0.5018, "step": 340 }, { "epoch": 5.005645161290323, "grad_norm": 17.176795959472656, "learning_rate": 5.645161290322582e-06, "loss": 0.5212, "step": 350 }, { "epoch": 5.007258064516129, "grad_norm": 5.25998592376709, "learning_rate": 5.806451612903226e-06, "loss": 0.4964, "step": 360 }, { "epoch": 5.008870967741935, "grad_norm": 11.501073837280273, "learning_rate": 5.967741935483872e-06, "loss": 0.7569, "step": 370 }, { "epoch": 5.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 0.800952672958374, "eval_runtime": 6.442, "eval_samples_per_second": 4.812, "eval_steps_per_second": 1.242, "step": 378 }, { "epoch": 6.000322580645161, "grad_norm": 12.811421394348145, "learning_rate": 6.129032258064517e-06, "loss": 0.4109, "step": 380 }, { "epoch": 6.001935483870968, "grad_norm": 29.298046112060547, "learning_rate": 6.290322580645162e-06, "loss": 0.4469, "step": 390 }, { "epoch": 6.0035483870967745, "grad_norm": 26.019372940063477, "learning_rate": 6.451612903225806e-06, "loss": 0.5437, "step": 400 }, { "epoch": 6.00516129032258, "grad_norm": 7.376840114593506, "learning_rate": 6.612903225806452e-06, "loss": 0.5285, "step": 410 }, { "epoch": 6.006774193548387, "grad_norm": 4.195577144622803, "learning_rate": 6.774193548387097e-06, "loss": 0.3829, "step": 420 }, { "epoch": 6.008387096774194, "grad_norm": 43.06284713745117, "learning_rate": 6.935483870967743e-06, "loss": 0.8423, "step": 430 }, { "epoch": 6.01, "grad_norm": 14.0852689743042, "learning_rate": 7.096774193548388e-06, "loss": 0.384, "step": 440 }, { "epoch": 6.01016129032258, "eval_accuracy": 0.6774193548387096, "eval_loss": 0.6035600304603577, "eval_runtime": 6.2613, "eval_samples_per_second": 4.951, "eval_steps_per_second": 1.278, "step": 441 }, { "epoch": 7.001451612903225, "grad_norm": 9.68275260925293, "learning_rate": 7.258064516129033e-06, "loss": 0.3735, "step": 450 }, { "epoch": 7.003064516129032, "grad_norm": 20.471586227416992, "learning_rate": 7.4193548387096784e-06, "loss": 0.5856, "step": 460 }, { "epoch": 7.004677419354839, "grad_norm": 10.077825546264648, "learning_rate": 7.580645161290323e-06, "loss": 0.3567, "step": 470 }, { "epoch": 7.006290322580645, "grad_norm": 11.653464317321777, "learning_rate": 7.741935483870968e-06, "loss": 0.4027, "step": 480 }, { "epoch": 7.0079032258064515, "grad_norm": 30.958131790161133, "learning_rate": 7.903225806451613e-06, "loss": 0.2881, "step": 490 }, { "epoch": 7.009516129032258, "grad_norm": 31.585803985595703, "learning_rate": 8.064516129032258e-06, "loss": 0.6542, "step": 500 }, { "epoch": 7.01016129032258, "eval_accuracy": 0.8064516129032258, "eval_loss": 0.5454678535461426, "eval_runtime": 6.3492, "eval_samples_per_second": 4.882, "eval_steps_per_second": 1.26, "step": 504 }, { "epoch": 8.000967741935483, "grad_norm": 5.090311050415039, "learning_rate": 8.225806451612904e-06, "loss": 0.4724, "step": 510 }, { "epoch": 8.00258064516129, "grad_norm": 14.630704879760742, "learning_rate": 8.387096774193549e-06, "loss": 0.6171, "step": 520 }, { "epoch": 8.004193548387097, "grad_norm": 4.01310396194458, "learning_rate": 8.548387096774194e-06, "loss": 0.5066, "step": 530 }, { "epoch": 8.005806451612903, "grad_norm": 12.878046035766602, "learning_rate": 8.70967741935484e-06, "loss": 0.2859, "step": 540 }, { "epoch": 8.00741935483871, "grad_norm": 9.791576385498047, "learning_rate": 8.870967741935484e-06, "loss": 0.6018, "step": 550 }, { "epoch": 8.009032258064517, "grad_norm": 45.741233825683594, "learning_rate": 9.03225806451613e-06, "loss": 0.3615, "step": 560 }, { "epoch": 8.01016129032258, "eval_accuracy": 0.5483870967741935, "eval_loss": 1.3505973815917969, "eval_runtime": 6.3628, "eval_samples_per_second": 4.872, "eval_steps_per_second": 1.257, "step": 567 }, { "epoch": 9.000483870967741, "grad_norm": 57.315250396728516, "learning_rate": 9.193548387096775e-06, "loss": 0.3956, "step": 570 }, { "epoch": 9.002096774193548, "grad_norm": 41.033512115478516, "learning_rate": 9.35483870967742e-06, "loss": 0.369, "step": 580 }, { "epoch": 9.003709677419355, "grad_norm": 84.67700958251953, "learning_rate": 9.516129032258065e-06, "loss": 0.3419, "step": 590 }, { "epoch": 9.005322580645162, "grad_norm": 16.714187622070312, "learning_rate": 9.67741935483871e-06, "loss": 0.8527, "step": 600 }, { "epoch": 9.006935483870969, "grad_norm": 12.65001392364502, "learning_rate": 9.838709677419356e-06, "loss": 0.268, "step": 610 }, { "epoch": 9.008548387096774, "grad_norm": 1.9735039472579956, "learning_rate": 1e-05, "loss": 0.2585, "step": 620 }, { "epoch": 9.01016129032258, "grad_norm": 18.82716941833496, "learning_rate": 9.982078853046597e-06, "loss": 0.2246, "step": 630 }, { "epoch": 9.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 1.5498994588851929, "eval_runtime": 6.1565, "eval_samples_per_second": 5.035, "eval_steps_per_second": 1.299, "step": 630 }, { "epoch": 10.001612903225807, "grad_norm": 12.663039207458496, "learning_rate": 9.96415770609319e-06, "loss": 0.4524, "step": 640 }, { "epoch": 10.003225806451614, "grad_norm": 55.03184127807617, "learning_rate": 9.946236559139786e-06, "loss": 0.4213, "step": 650 }, { "epoch": 10.004838709677419, "grad_norm": 13.46599006652832, "learning_rate": 9.928315412186382e-06, "loss": 0.4128, "step": 660 }, { "epoch": 10.006451612903225, "grad_norm": 1.5325831174850464, "learning_rate": 9.910394265232976e-06, "loss": 0.2547, "step": 670 }, { "epoch": 10.008064516129032, "grad_norm": 1.041783332824707, "learning_rate": 9.89247311827957e-06, "loss": 0.712, "step": 680 }, { "epoch": 10.009677419354839, "grad_norm": 63.43698501586914, "learning_rate": 9.874551971326167e-06, "loss": 0.7929, "step": 690 }, { "epoch": 10.01016129032258, "eval_accuracy": 0.6451612903225806, "eval_loss": 1.0719081163406372, "eval_runtime": 5.9853, "eval_samples_per_second": 5.179, "eval_steps_per_second": 1.337, "step": 693 }, { "epoch": 11.001129032258065, "grad_norm": 72.66409301757812, "learning_rate": 9.856630824372761e-06, "loss": 0.3914, "step": 700 }, { "epoch": 11.00274193548387, "grad_norm": 32.180328369140625, "learning_rate": 9.838709677419356e-06, "loss": 0.2827, "step": 710 }, { "epoch": 11.004354838709677, "grad_norm": 113.44245147705078, "learning_rate": 9.820788530465952e-06, "loss": 0.3627, "step": 720 }, { "epoch": 11.005967741935484, "grad_norm": 27.98796272277832, "learning_rate": 9.802867383512546e-06, "loss": 0.1692, "step": 730 }, { "epoch": 11.00758064516129, "grad_norm": 188.5924072265625, "learning_rate": 9.78494623655914e-06, "loss": 0.3925, "step": 740 }, { "epoch": 11.009193548387097, "grad_norm": 65.70986938476562, "learning_rate": 9.767025089605735e-06, "loss": 0.5963, "step": 750 }, { "epoch": 11.01016129032258, "eval_accuracy": 0.6129032258064516, "eval_loss": 0.9214943051338196, "eval_runtime": 6.6117, "eval_samples_per_second": 4.689, "eval_steps_per_second": 1.21, "step": 756 }, { "epoch": 12.000645161290322, "grad_norm": 21.230905532836914, "learning_rate": 9.749103942652331e-06, "loss": 0.4153, "step": 760 }, { "epoch": 12.002258064516129, "grad_norm": 5.523350715637207, "learning_rate": 9.731182795698925e-06, "loss": 0.3299, "step": 770 }, { "epoch": 12.003870967741936, "grad_norm": 5.037207126617432, "learning_rate": 9.71326164874552e-06, "loss": 0.2912, "step": 780 }, { "epoch": 12.005483870967742, "grad_norm": 56.53767013549805, "learning_rate": 9.695340501792116e-06, "loss": 0.5212, "step": 790 }, { "epoch": 12.007096774193549, "grad_norm": 31.97757911682129, "learning_rate": 9.67741935483871e-06, "loss": 0.6257, "step": 800 }, { "epoch": 12.008709677419354, "grad_norm": 8.092595100402832, "learning_rate": 9.659498207885305e-06, "loss": 0.1342, "step": 810 }, { "epoch": 12.01016129032258, "eval_accuracy": 0.6451612903225806, "eval_loss": 0.9187889695167542, "eval_runtime": 6.4955, "eval_samples_per_second": 4.773, "eval_steps_per_second": 1.232, "step": 819 }, { "epoch": 13.00016129032258, "grad_norm": 55.65748977661133, "learning_rate": 9.641577060931901e-06, "loss": 0.2831, "step": 820 }, { "epoch": 13.001774193548387, "grad_norm": 275.52764892578125, "learning_rate": 9.623655913978495e-06, "loss": 0.4742, "step": 830 }, { "epoch": 13.003387096774194, "grad_norm": 19.37176513671875, "learning_rate": 9.60573476702509e-06, "loss": 0.366, "step": 840 }, { "epoch": 13.005, "grad_norm": 178.55467224121094, "learning_rate": 9.587813620071686e-06, "loss": 0.1578, "step": 850 }, { "epoch": 13.006612903225806, "grad_norm": 104.739990234375, "learning_rate": 9.56989247311828e-06, "loss": 0.3892, "step": 860 }, { "epoch": 13.008225806451613, "grad_norm": 83.07843017578125, "learning_rate": 9.551971326164875e-06, "loss": 0.1728, "step": 870 }, { "epoch": 13.00983870967742, "grad_norm": 2.697343349456787, "learning_rate": 9.53405017921147e-06, "loss": 0.2511, "step": 880 }, { "epoch": 13.01016129032258, "eval_accuracy": 0.6451612903225806, "eval_loss": 1.4409942626953125, "eval_runtime": 7.2916, "eval_samples_per_second": 4.251, "eval_steps_per_second": 1.097, "step": 882 }, { "epoch": 14.001290322580646, "grad_norm": 49.517250061035156, "learning_rate": 9.516129032258065e-06, "loss": 0.1073, "step": 890 }, { "epoch": 14.00290322580645, "grad_norm": 187.10006713867188, "learning_rate": 9.49820788530466e-06, "loss": 0.179, "step": 900 }, { "epoch": 14.004516129032258, "grad_norm": 180.89605712890625, "learning_rate": 9.480286738351256e-06, "loss": 0.2171, "step": 910 }, { "epoch": 14.006129032258064, "grad_norm": 146.55459594726562, "learning_rate": 9.46236559139785e-06, "loss": 0.1076, "step": 920 }, { "epoch": 14.007741935483871, "grad_norm": 0.2709847390651703, "learning_rate": 9.444444444444445e-06, "loss": 0.2154, "step": 930 }, { "epoch": 14.009354838709678, "grad_norm": 0.19541990756988525, "learning_rate": 9.42652329749104e-06, "loss": 0.5877, "step": 940 }, { "epoch": 14.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 2.3549857139587402, "eval_runtime": 7.4757, "eval_samples_per_second": 4.147, "eval_steps_per_second": 1.07, "step": 945 }, { "epoch": 15.000806451612902, "grad_norm": 116.17826843261719, "learning_rate": 9.408602150537635e-06, "loss": 0.0714, "step": 950 }, { "epoch": 15.00241935483871, "grad_norm": 0.0583936907351017, "learning_rate": 9.39068100358423e-06, "loss": 0.3206, "step": 960 }, { "epoch": 15.004032258064516, "grad_norm": 1.431188941001892, "learning_rate": 9.372759856630826e-06, "loss": 0.0059, "step": 970 }, { "epoch": 15.005645161290323, "grad_norm": 217.01446533203125, "learning_rate": 9.35483870967742e-06, "loss": 0.1661, "step": 980 }, { "epoch": 15.00725806451613, "grad_norm": 21.670551300048828, "learning_rate": 9.336917562724014e-06, "loss": 0.3769, "step": 990 }, { "epoch": 15.008870967741936, "grad_norm": 0.014820106327533722, "learning_rate": 9.31899641577061e-06, "loss": 0.3261, "step": 1000 }, { "epoch": 15.01016129032258, "eval_accuracy": 0.6774193548387096, "eval_loss": 1.0728644132614136, "eval_runtime": 7.5813, "eval_samples_per_second": 4.089, "eval_steps_per_second": 1.055, "step": 1008 }, { "epoch": 16.00032258064516, "grad_norm": 0.19889932870864868, "learning_rate": 9.301075268817205e-06, "loss": 0.4151, "step": 1010 }, { "epoch": 16.001935483870966, "grad_norm": 0.3380068838596344, "learning_rate": 9.2831541218638e-06, "loss": 0.0236, "step": 1020 }, { "epoch": 16.003548387096775, "grad_norm": 0.47968789935112, "learning_rate": 9.265232974910395e-06, "loss": 0.0729, "step": 1030 }, { "epoch": 16.00516129032258, "grad_norm": 49.29389190673828, "learning_rate": 9.24731182795699e-06, "loss": 0.0505, "step": 1040 }, { "epoch": 16.006774193548388, "grad_norm": 180.01516723632812, "learning_rate": 9.229390681003584e-06, "loss": 0.3145, "step": 1050 }, { "epoch": 16.008387096774193, "grad_norm": 108.78411102294922, "learning_rate": 9.21146953405018e-06, "loss": 0.3628, "step": 1060 }, { "epoch": 16.01, "grad_norm": 0.03119879961013794, "learning_rate": 9.193548387096775e-06, "loss": 0.0425, "step": 1070 }, { "epoch": 16.010161290322582, "eval_accuracy": 0.5806451612903226, "eval_loss": 2.5329673290252686, "eval_runtime": 7.2901, "eval_samples_per_second": 4.252, "eval_steps_per_second": 1.097, "step": 1071 }, { "epoch": 17.001451612903224, "grad_norm": 0.08823108673095703, "learning_rate": 9.17562724014337e-06, "loss": 0.2067, "step": 1080 }, { "epoch": 17.003064516129033, "grad_norm": 7.213440418243408, "learning_rate": 9.157706093189965e-06, "loss": 0.1787, "step": 1090 }, { "epoch": 17.004677419354838, "grad_norm": 0.1837937980890274, "learning_rate": 9.13978494623656e-06, "loss": 0.2074, "step": 1100 }, { "epoch": 17.006290322580647, "grad_norm": 95.1854248046875, "learning_rate": 9.121863799283154e-06, "loss": 0.2085, "step": 1110 }, { "epoch": 17.00790322580645, "grad_norm": 0.008097600191831589, "learning_rate": 9.10394265232975e-06, "loss": 0.1918, "step": 1120 }, { "epoch": 17.009516129032257, "grad_norm": 0.013105648569762707, "learning_rate": 9.086021505376345e-06, "loss": 0.174, "step": 1130 }, { "epoch": 17.010161290322582, "eval_accuracy": 0.5806451612903226, "eval_loss": 2.8189942836761475, "eval_runtime": 7.5349, "eval_samples_per_second": 4.114, "eval_steps_per_second": 1.062, "step": 1134 }, { "epoch": 18.000967741935483, "grad_norm": 0.00844598188996315, "learning_rate": 9.068100358422939e-06, "loss": 0.1075, "step": 1140 }, { "epoch": 18.00258064516129, "grad_norm": 15.8928804397583, "learning_rate": 9.050179211469535e-06, "loss": 0.3331, "step": 1150 }, { "epoch": 18.004193548387097, "grad_norm": 0.16556479036808014, "learning_rate": 9.03225806451613e-06, "loss": 0.0049, "step": 1160 }, { "epoch": 18.0058064516129, "grad_norm": 2.688087224960327, "learning_rate": 9.014336917562726e-06, "loss": 0.1286, "step": 1170 }, { "epoch": 18.00741935483871, "grad_norm": 16.065052032470703, "learning_rate": 8.99641577060932e-06, "loss": 0.4318, "step": 1180 }, { "epoch": 18.009032258064515, "grad_norm": 0.17313583195209503, "learning_rate": 8.978494623655915e-06, "loss": 0.1972, "step": 1190 }, { "epoch": 18.010161290322582, "eval_accuracy": 0.5483870967741935, "eval_loss": 2.4491164684295654, "eval_runtime": 8.1319, "eval_samples_per_second": 3.812, "eval_steps_per_second": 0.984, "step": 1197 }, { "epoch": 19.00048387096774, "grad_norm": 0.08344484865665436, "learning_rate": 8.96057347670251e-06, "loss": 0.1084, "step": 1200 }, { "epoch": 19.00209677419355, "grad_norm": 0.005674073472619057, "learning_rate": 8.942652329749103e-06, "loss": 0.2377, "step": 1210 }, { "epoch": 19.003709677419355, "grad_norm": 188.25645446777344, "learning_rate": 8.9247311827957e-06, "loss": 0.3506, "step": 1220 }, { "epoch": 19.00532258064516, "grad_norm": 8.602683067321777, "learning_rate": 8.906810035842296e-06, "loss": 0.1808, "step": 1230 }, { "epoch": 19.00693548387097, "grad_norm": 0.027648361399769783, "learning_rate": 8.888888888888888e-06, "loss": 0.0297, "step": 1240 }, { "epoch": 19.008548387096774, "grad_norm": 0.027637062594294548, "learning_rate": 8.870967741935484e-06, "loss": 0.0004, "step": 1250 }, { "epoch": 19.010161290322582, "grad_norm": 0.4523034989833832, "learning_rate": 8.85304659498208e-06, "loss": 0.2264, "step": 1260 }, { "epoch": 19.010161290322582, "eval_accuracy": 0.6774193548387096, "eval_loss": 1.934498906135559, "eval_runtime": 7.8216, "eval_samples_per_second": 3.963, "eval_steps_per_second": 1.023, "step": 1260 }, { "epoch": 20.001612903225805, "grad_norm": 23.10691261291504, "learning_rate": 8.835125448028675e-06, "loss": 0.2092, "step": 1270 }, { "epoch": 20.003225806451614, "grad_norm": 9.69577407836914, "learning_rate": 8.81720430107527e-06, "loss": 0.7783, "step": 1280 }, { "epoch": 20.00483870967742, "grad_norm": 0.005470858421176672, "learning_rate": 8.799283154121865e-06, "loss": 0.0958, "step": 1290 }, { "epoch": 20.006451612903227, "grad_norm": 0.1090906411409378, "learning_rate": 8.78136200716846e-06, "loss": 0.004, "step": 1300 }, { "epoch": 20.008064516129032, "grad_norm": 0.09628669172525406, "learning_rate": 8.763440860215054e-06, "loss": 0.2166, "step": 1310 }, { "epoch": 20.009677419354837, "grad_norm": 0.003825843334197998, "learning_rate": 8.74551971326165e-06, "loss": 0.0862, "step": 1320 }, { "epoch": 20.010161290322582, "eval_accuracy": 0.5161290322580645, "eval_loss": 3.3695249557495117, "eval_runtime": 8.1131, "eval_samples_per_second": 3.821, "eval_steps_per_second": 0.986, "step": 1323 }, { "epoch": 21.001129032258063, "grad_norm": 0.05634148046374321, "learning_rate": 8.727598566308245e-06, "loss": 0.2453, "step": 1330 }, { "epoch": 21.002741935483872, "grad_norm": 0.049040358513593674, "learning_rate": 8.70967741935484e-06, "loss": 0.3991, "step": 1340 }, { "epoch": 21.004354838709677, "grad_norm": 340.5539855957031, "learning_rate": 8.691756272401435e-06, "loss": 0.1452, "step": 1350 }, { "epoch": 21.005967741935486, "grad_norm": 0.029688240960240364, "learning_rate": 8.67383512544803e-06, "loss": 0.0068, "step": 1360 }, { "epoch": 21.00758064516129, "grad_norm": 0.02621721290051937, "learning_rate": 8.655913978494624e-06, "loss": 0.0569, "step": 1370 }, { "epoch": 21.009193548387096, "grad_norm": 0.025450093671679497, "learning_rate": 8.63799283154122e-06, "loss": 0.0998, "step": 1380 }, { "epoch": 21.010161290322582, "eval_accuracy": 0.7741935483870968, "eval_loss": 1.4091392755508423, "eval_runtime": 7.8323, "eval_samples_per_second": 3.958, "eval_steps_per_second": 1.021, "step": 1386 }, { "epoch": 22.000645161290322, "grad_norm": 0.44440215826034546, "learning_rate": 8.620071684587815e-06, "loss": 0.0096, "step": 1390 }, { "epoch": 22.00225806451613, "grad_norm": 0.009962552227079868, "learning_rate": 8.602150537634409e-06, "loss": 0.644, "step": 1400 }, { "epoch": 22.003870967741936, "grad_norm": 116.51396942138672, "learning_rate": 8.584229390681005e-06, "loss": 0.0231, "step": 1410 }, { "epoch": 22.00548387096774, "grad_norm": 216.5560302734375, "learning_rate": 8.5663082437276e-06, "loss": 0.2662, "step": 1420 }, { "epoch": 22.00709677419355, "grad_norm": 0.0183710977435112, "learning_rate": 8.548387096774194e-06, "loss": 0.1959, "step": 1430 }, { "epoch": 22.008709677419354, "grad_norm": 0.0647834986448288, "learning_rate": 8.530465949820788e-06, "loss": 0.311, "step": 1440 }, { "epoch": 22.010161290322582, "eval_accuracy": 0.5483870967741935, "eval_loss": 2.7629103660583496, "eval_runtime": 7.6946, "eval_samples_per_second": 4.029, "eval_steps_per_second": 1.04, "step": 1449 }, { "epoch": 23.00016129032258, "grad_norm": 0.016561094671487808, "learning_rate": 8.512544802867385e-06, "loss": 0.0252, "step": 1450 }, { "epoch": 23.001774193548385, "grad_norm": 0.040134500712156296, "learning_rate": 8.494623655913979e-06, "loss": 0.0034, "step": 1460 }, { "epoch": 23.003387096774194, "grad_norm": 142.82286071777344, "learning_rate": 8.476702508960573e-06, "loss": 0.1079, "step": 1470 }, { "epoch": 23.005, "grad_norm": 31.057464599609375, "learning_rate": 8.45878136200717e-06, "loss": 0.3342, "step": 1480 }, { "epoch": 23.006612903225808, "grad_norm": 0.008985779248178005, "learning_rate": 8.440860215053764e-06, "loss": 0.0003, "step": 1490 }, { "epoch": 23.008225806451613, "grad_norm": 0.40310224890708923, "learning_rate": 8.422939068100358e-06, "loss": 0.2469, "step": 1500 }, { "epoch": 23.00983870967742, "grad_norm": 0.006473642308264971, "learning_rate": 8.405017921146954e-06, "loss": 0.0481, "step": 1510 }, { "epoch": 23.010161290322582, "eval_accuracy": 0.6451612903225806, "eval_loss": 2.050615072250366, "eval_runtime": 7.7449, "eval_samples_per_second": 4.003, "eval_steps_per_second": 1.033, "step": 1512 }, { "epoch": 24.001290322580644, "grad_norm": 0.05811922252178192, "learning_rate": 8.387096774193549e-06, "loss": 0.2078, "step": 1520 }, { "epoch": 24.002903225806453, "grad_norm": 1.0772080421447754, "learning_rate": 8.369175627240143e-06, "loss": 0.0825, "step": 1530 }, { "epoch": 24.004516129032258, "grad_norm": 0.027578506618738174, "learning_rate": 8.35125448028674e-06, "loss": 0.1463, "step": 1540 }, { "epoch": 24.006129032258066, "grad_norm": 0.06800124794244766, "learning_rate": 8.333333333333334e-06, "loss": 0.1588, "step": 1550 }, { "epoch": 24.00774193548387, "grad_norm": 0.006699579767882824, "learning_rate": 8.315412186379928e-06, "loss": 0.0007, "step": 1560 }, { "epoch": 24.009354838709676, "grad_norm": 426.5452880859375, "learning_rate": 8.297491039426524e-06, "loss": 0.2109, "step": 1570 }, { "epoch": 24.010161290322582, "eval_accuracy": 0.5806451612903226, "eval_loss": 2.59899640083313, "eval_runtime": 7.8132, "eval_samples_per_second": 3.968, "eval_steps_per_second": 1.024, "step": 1575 }, { "epoch": 25.000806451612902, "grad_norm": 115.23479461669922, "learning_rate": 8.279569892473119e-06, "loss": 0.0221, "step": 1580 }, { "epoch": 25.00241935483871, "grad_norm": 0.0025610916782170534, "learning_rate": 8.261648745519713e-06, "loss": 0.1053, "step": 1590 }, { "epoch": 25.004032258064516, "grad_norm": 0.01656101457774639, "learning_rate": 8.24372759856631e-06, "loss": 0.0004, "step": 1600 }, { "epoch": 25.00564516129032, "grad_norm": 0.0194097850471735, "learning_rate": 8.225806451612904e-06, "loss": 0.0004, "step": 1610 }, { "epoch": 25.00725806451613, "grad_norm": 1.1828374862670898, "learning_rate": 8.207885304659498e-06, "loss": 0.0413, "step": 1620 }, { "epoch": 25.008870967741935, "grad_norm": 0.009847279638051987, "learning_rate": 8.189964157706094e-06, "loss": 0.179, "step": 1630 }, { "epoch": 25.010161290322582, "eval_accuracy": 0.5806451612903226, "eval_loss": 2.781538963317871, "eval_runtime": 7.8013, "eval_samples_per_second": 3.974, "eval_steps_per_second": 1.025, "step": 1638 }, { "epoch": 26.00032258064516, "grad_norm": 0.013599119149148464, "learning_rate": 8.172043010752689e-06, "loss": 0.0073, "step": 1640 }, { "epoch": 26.001935483870966, "grad_norm": 0.011127980425953865, "learning_rate": 8.154121863799283e-06, "loss": 0.0006, "step": 1650 }, { "epoch": 26.003548387096775, "grad_norm": 0.003879767144098878, "learning_rate": 8.136200716845879e-06, "loss": 0.3463, "step": 1660 }, { "epoch": 26.00516129032258, "grad_norm": 0.0031433638650923967, "learning_rate": 8.118279569892473e-06, "loss": 0.0016, "step": 1670 }, { "epoch": 26.006774193548388, "grad_norm": 0.07057037204504013, "learning_rate": 8.100358422939068e-06, "loss": 0.0004, "step": 1680 }, { "epoch": 26.008387096774193, "grad_norm": 0.33540138602256775, "learning_rate": 8.082437275985664e-06, "loss": 0.0002, "step": 1690 }, { "epoch": 26.01, "grad_norm": 0.030621379613876343, "learning_rate": 8.064516129032258e-06, "loss": 0.0002, "step": 1700 }, { "epoch": 26.010161290322582, "eval_accuracy": 0.5161290322580645, "eval_loss": 3.671872615814209, "eval_runtime": 7.6886, "eval_samples_per_second": 4.032, "eval_steps_per_second": 1.04, "step": 1701 }, { "epoch": 27.001451612903224, "grad_norm": 223.8633575439453, "learning_rate": 8.046594982078853e-06, "loss": 0.091, "step": 1710 }, { "epoch": 27.003064516129033, "grad_norm": 0.007805492263287306, "learning_rate": 8.028673835125449e-06, "loss": 0.0065, "step": 1720 }, { "epoch": 27.004677419354838, "grad_norm": 0.008708256296813488, "learning_rate": 8.010752688172043e-06, "loss": 0.1597, "step": 1730 }, { "epoch": 27.006290322580647, "grad_norm": 0.007226979359984398, "learning_rate": 7.992831541218638e-06, "loss": 0.0154, "step": 1740 }, { "epoch": 27.00790322580645, "grad_norm": 0.031198803335428238, "learning_rate": 7.974910394265234e-06, "loss": 0.0002, "step": 1750 }, { "epoch": 27.009516129032257, "grad_norm": 0.00789616722613573, "learning_rate": 7.956989247311828e-06, "loss": 0.0996, "step": 1760 }, { "epoch": 27.010161290322582, "eval_accuracy": 0.5161290322580645, "eval_loss": 3.7617688179016113, "eval_runtime": 7.8441, "eval_samples_per_second": 3.952, "eval_steps_per_second": 1.02, "step": 1764 }, { "epoch": 28.000967741935483, "grad_norm": 0.0034709079191088676, "learning_rate": 7.939068100358424e-06, "loss": 0.4075, "step": 1770 }, { "epoch": 28.00258064516129, "grad_norm": 0.1974840611219406, "learning_rate": 7.921146953405019e-06, "loss": 0.0078, "step": 1780 }, { "epoch": 28.004193548387097, "grad_norm": 0.02149233967065811, "learning_rate": 7.903225806451613e-06, "loss": 0.2262, "step": 1790 }, { "epoch": 28.0058064516129, "grad_norm": 0.9507142901420593, "learning_rate": 7.88530465949821e-06, "loss": 0.0692, "step": 1800 }, { "epoch": 28.00741935483871, "grad_norm": 0.1831965446472168, "learning_rate": 7.867383512544804e-06, "loss": 0.2666, "step": 1810 }, { "epoch": 28.009032258064515, "grad_norm": 0.11429057270288467, "learning_rate": 7.849462365591398e-06, "loss": 0.0002, "step": 1820 }, { "epoch": 28.010161290322582, "eval_accuracy": 0.5483870967741935, "eval_loss": 3.3375070095062256, "eval_runtime": 7.9012, "eval_samples_per_second": 3.923, "eval_steps_per_second": 1.013, "step": 1827 }, { "epoch": 29.00048387096774, "grad_norm": 2.1111268997192383, "learning_rate": 7.831541218637994e-06, "loss": 0.0004, "step": 1830 }, { "epoch": 29.00209677419355, "grad_norm": 2.4912052154541016, "learning_rate": 7.813620071684589e-06, "loss": 0.1103, "step": 1840 }, { "epoch": 29.003709677419355, "grad_norm": 0.0064556049183011055, "learning_rate": 7.795698924731183e-06, "loss": 0.0227, "step": 1850 }, { "epoch": 29.00532258064516, "grad_norm": 0.044650815427303314, "learning_rate": 7.77777777777778e-06, "loss": 0.0007, "step": 1860 }, { "epoch": 29.00693548387097, "grad_norm": 0.0034575294703245163, "learning_rate": 7.759856630824374e-06, "loss": 0.0005, "step": 1870 }, { "epoch": 29.008548387096774, "grad_norm": 0.3267681300640106, "learning_rate": 7.741935483870968e-06, "loss": 0.0002, "step": 1880 }, { "epoch": 29.010161290322582, "grad_norm": 0.01336196530610323, "learning_rate": 7.724014336917564e-06, "loss": 0.0004, "step": 1890 }, { "epoch": 29.010161290322582, "eval_accuracy": 0.6129032258064516, "eval_loss": 2.874981641769409, "eval_runtime": 5.949, "eval_samples_per_second": 5.211, "eval_steps_per_second": 1.345, "step": 1890 }, { "epoch": 30.001612903225805, "grad_norm": 0.011660858988761902, "learning_rate": 7.706093189964159e-06, "loss": 0.0003, "step": 1900 }, { "epoch": 30.003225806451614, "grad_norm": 0.06132528558373451, "learning_rate": 7.688172043010753e-06, "loss": 0.0001, "step": 1910 }, { "epoch": 30.00483870967742, "grad_norm": 1.0501477718353271, "learning_rate": 7.670250896057349e-06, "loss": 0.0002, "step": 1920 }, { "epoch": 30.006451612903227, "grad_norm": 0.0032924411352723837, "learning_rate": 7.652329749103943e-06, "loss": 0.1023, "step": 1930 }, { "epoch": 30.008064516129032, "grad_norm": 23.022029876708984, "learning_rate": 7.634408602150538e-06, "loss": 0.0025, "step": 1940 }, { "epoch": 30.009677419354837, "grad_norm": 0.0036649659741669893, "learning_rate": 7.616487455197133e-06, "loss": 0.0001, "step": 1950 }, { "epoch": 30.010161290322582, "eval_accuracy": 0.6774193548387096, "eval_loss": 2.586686849594116, "eval_runtime": 6.3707, "eval_samples_per_second": 4.866, "eval_steps_per_second": 1.256, "step": 1953 }, { "epoch": 31.001129032258063, "grad_norm": 0.0066962046548724174, "learning_rate": 7.5985663082437275e-06, "loss": 0.0006, "step": 1960 }, { "epoch": 31.002741935483872, "grad_norm": 6.700741767883301, "learning_rate": 7.580645161290323e-06, "loss": 0.0018, "step": 1970 }, { "epoch": 31.004354838709677, "grad_norm": 0.005098952446132898, "learning_rate": 7.562724014336919e-06, "loss": 0.0007, "step": 1980 }, { "epoch": 31.005967741935486, "grad_norm": 0.003655584529042244, "learning_rate": 7.5448028673835125e-06, "loss": 0.0006, "step": 1990 }, { "epoch": 31.00758064516129, "grad_norm": 0.007408137898892164, "learning_rate": 7.526881720430108e-06, "loss": 0.0001, "step": 2000 }, { "epoch": 31.009193548387096, "grad_norm": 0.006466301623731852, "learning_rate": 7.508960573476704e-06, "loss": 0.1188, "step": 2010 }, { "epoch": 31.010161290322582, "eval_accuracy": 0.6774193548387096, "eval_loss": 1.826348900794983, "eval_runtime": 5.5505, "eval_samples_per_second": 5.585, "eval_steps_per_second": 1.441, "step": 2016 }, { "epoch": 32.00064516129032, "grad_norm": 0.006353042554110289, "learning_rate": 7.491039426523297e-06, "loss": 0.0001, "step": 2020 }, { "epoch": 32.00225806451613, "grad_norm": 0.0028045109938830137, "learning_rate": 7.4731182795698935e-06, "loss": 0.0255, "step": 2030 }, { "epoch": 32.00387096774193, "grad_norm": 0.003743524197489023, "learning_rate": 7.455197132616489e-06, "loss": 0.1947, "step": 2040 }, { "epoch": 32.005483870967744, "grad_norm": 327.1103820800781, "learning_rate": 7.437275985663082e-06, "loss": 0.0865, "step": 2050 }, { "epoch": 32.00709677419355, "grad_norm": 0.013567542657256126, "learning_rate": 7.4193548387096784e-06, "loss": 0.0129, "step": 2060 }, { "epoch": 32.008709677419354, "grad_norm": 0.0016795871779322624, "learning_rate": 7.401433691756274e-06, "loss": 0.0295, "step": 2070 }, { "epoch": 32.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.269862651824951, "eval_runtime": 6.2848, "eval_samples_per_second": 4.933, "eval_steps_per_second": 1.273, "step": 2079 }, { "epoch": 33.00016129032258, "grad_norm": 0.0033075993414968252, "learning_rate": 7.383512544802868e-06, "loss": 0.3959, "step": 2080 }, { "epoch": 33.001774193548385, "grad_norm": 0.004525234457105398, "learning_rate": 7.365591397849463e-06, "loss": 0.0002, "step": 2090 }, { "epoch": 33.00338709677419, "grad_norm": 0.0030083286110311747, "learning_rate": 7.347670250896059e-06, "loss": 0.2086, "step": 2100 }, { "epoch": 33.005, "grad_norm": 0.010619004257023335, "learning_rate": 7.329749103942653e-06, "loss": 0.0002, "step": 2110 }, { "epoch": 33.00661290322581, "grad_norm": 0.005634678062051535, "learning_rate": 7.311827956989248e-06, "loss": 0.0002, "step": 2120 }, { "epoch": 33.00822580645161, "grad_norm": 0.004780784249305725, "learning_rate": 7.2939068100358436e-06, "loss": 0.1565, "step": 2130 }, { "epoch": 33.00983870967742, "grad_norm": 56.0381965637207, "learning_rate": 7.275985663082438e-06, "loss": 0.1931, "step": 2140 }, { "epoch": 33.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.353241205215454, "eval_runtime": 6.6039, "eval_samples_per_second": 4.694, "eval_steps_per_second": 1.211, "step": 2142 }, { "epoch": 34.001290322580644, "grad_norm": 0.03778094798326492, "learning_rate": 7.258064516129033e-06, "loss": 0.0189, "step": 2150 }, { "epoch": 34.00290322580645, "grad_norm": 0.003028768114745617, "learning_rate": 7.240143369175628e-06, "loss": 0.0233, "step": 2160 }, { "epoch": 34.00451612903226, "grad_norm": 0.007261109072715044, "learning_rate": 7.222222222222223e-06, "loss": 0.0003, "step": 2170 }, { "epoch": 34.006129032258066, "grad_norm": 0.004371033515781164, "learning_rate": 7.204301075268818e-06, "loss": 0.0001, "step": 2180 }, { "epoch": 34.00774193548387, "grad_norm": 0.0015108762308955193, "learning_rate": 7.186379928315413e-06, "loss": 0.0001, "step": 2190 }, { "epoch": 34.009354838709676, "grad_norm": 0.0016336280386894941, "learning_rate": 7.168458781362008e-06, "loss": 0.0002, "step": 2200 }, { "epoch": 34.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 4.200132369995117, "eval_runtime": 6.1104, "eval_samples_per_second": 5.073, "eval_steps_per_second": 1.309, "step": 2205 }, { "epoch": 35.0008064516129, "grad_norm": 0.06609949469566345, "learning_rate": 7.150537634408603e-06, "loss": 0.0002, "step": 2210 }, { "epoch": 35.00241935483871, "grad_norm": 0.001493967603892088, "learning_rate": 7.1326164874551975e-06, "loss": 0.0001, "step": 2220 }, { "epoch": 35.00403225806452, "grad_norm": 0.0025541477371007204, "learning_rate": 7.114695340501793e-06, "loss": 0.1683, "step": 2230 }, { "epoch": 35.005645161290325, "grad_norm": 0.004474389832466841, "learning_rate": 7.096774193548388e-06, "loss": 0.0001, "step": 2240 }, { "epoch": 35.00725806451613, "grad_norm": 0.00798499770462513, "learning_rate": 7.0788530465949824e-06, "loss": 0.0747, "step": 2250 }, { "epoch": 35.008870967741935, "grad_norm": 0.015142068266868591, "learning_rate": 7.060931899641578e-06, "loss": 0.0001, "step": 2260 }, { "epoch": 35.01016129032258, "eval_accuracy": 0.5483870967741935, "eval_loss": 3.3818588256835938, "eval_runtime": 5.6132, "eval_samples_per_second": 5.523, "eval_steps_per_second": 1.425, "step": 2268 }, { "epoch": 36.00032258064516, "grad_norm": 0.005978843662887812, "learning_rate": 7.043010752688173e-06, "loss": 0.1752, "step": 2270 }, { "epoch": 36.001935483870966, "grad_norm": 0.7413745522499084, "learning_rate": 7.025089605734767e-06, "loss": 0.0354, "step": 2280 }, { "epoch": 36.00354838709677, "grad_norm": 0.005803413223475218, "learning_rate": 7.007168458781363e-06, "loss": 0.0048, "step": 2290 }, { "epoch": 36.00516129032258, "grad_norm": 466.6094055175781, "learning_rate": 6.989247311827958e-06, "loss": 0.1027, "step": 2300 }, { "epoch": 36.00677419354839, "grad_norm": 0.002476946683600545, "learning_rate": 6.971326164874552e-06, "loss": 0.0002, "step": 2310 }, { "epoch": 36.00838709677419, "grad_norm": 3.948978900909424, "learning_rate": 6.9534050179211476e-06, "loss": 0.0004, "step": 2320 }, { "epoch": 36.01, "grad_norm": 0.004969065077602863, "learning_rate": 6.935483870967743e-06, "loss": 0.0001, "step": 2330 }, { "epoch": 36.01016129032258, "eval_accuracy": 0.7096774193548387, "eval_loss": 2.2776196002960205, "eval_runtime": 5.7118, "eval_samples_per_second": 5.427, "eval_steps_per_second": 1.401, "step": 2331 }, { "epoch": 37.001451612903224, "grad_norm": 0.0024395145010203123, "learning_rate": 6.917562724014337e-06, "loss": 0.0001, "step": 2340 }, { "epoch": 37.00306451612903, "grad_norm": 4.106926918029785, "learning_rate": 6.8996415770609325e-06, "loss": 0.1684, "step": 2350 }, { "epoch": 37.00467741935484, "grad_norm": 2.1262402534484863, "learning_rate": 6.881720430107528e-06, "loss": 0.1956, "step": 2360 }, { "epoch": 37.00629032258065, "grad_norm": 0.028283217921853065, "learning_rate": 6.863799283154122e-06, "loss": 0.0045, "step": 2370 }, { "epoch": 37.00790322580645, "grad_norm": 0.024259060621261597, "learning_rate": 6.8458781362007174e-06, "loss": 0.2178, "step": 2380 }, { "epoch": 37.00951612903226, "grad_norm": 0.010875164531171322, "learning_rate": 6.827956989247312e-06, "loss": 0.0007, "step": 2390 }, { "epoch": 37.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 2.851607322692871, "eval_runtime": 5.6167, "eval_samples_per_second": 5.519, "eval_steps_per_second": 1.424, "step": 2394 }, { "epoch": 38.00096774193548, "grad_norm": 0.007449969183653593, "learning_rate": 6.810035842293907e-06, "loss": 0.2137, "step": 2400 }, { "epoch": 38.00258064516129, "grad_norm": 0.03475867584347725, "learning_rate": 6.792114695340502e-06, "loss": 0.0739, "step": 2410 }, { "epoch": 38.0041935483871, "grad_norm": 0.024666184559464455, "learning_rate": 6.774193548387097e-06, "loss": 0.1174, "step": 2420 }, { "epoch": 38.005806451612905, "grad_norm": 0.006377407815307379, "learning_rate": 6.756272401433692e-06, "loss": 0.0005, "step": 2430 }, { "epoch": 38.00741935483871, "grad_norm": 0.005703488364815712, "learning_rate": 6.738351254480287e-06, "loss": 0.184, "step": 2440 }, { "epoch": 38.009032258064515, "grad_norm": 0.020010700449347496, "learning_rate": 6.720430107526882e-06, "loss": 0.0001, "step": 2450 }, { "epoch": 38.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 4.042048454284668, "eval_runtime": 7.591, "eval_samples_per_second": 4.084, "eval_steps_per_second": 1.054, "step": 2457 }, { "epoch": 39.00048387096774, "grad_norm": 0.001750840456224978, "learning_rate": 6.702508960573477e-06, "loss": 0.0001, "step": 2460 }, { "epoch": 39.00209677419355, "grad_norm": 0.4397079348564148, "learning_rate": 6.684587813620072e-06, "loss": 0.0001, "step": 2470 }, { "epoch": 39.00370967741935, "grad_norm": 0.0021402398124337196, "learning_rate": 6.666666666666667e-06, "loss": 0.2989, "step": 2480 }, { "epoch": 39.005322580645164, "grad_norm": 0.0029481856618076563, "learning_rate": 6.648745519713262e-06, "loss": 0.0011, "step": 2490 }, { "epoch": 39.00693548387097, "grad_norm": 0.004642050713300705, "learning_rate": 6.630824372759857e-06, "loss": 0.2142, "step": 2500 }, { "epoch": 39.00854838709677, "grad_norm": 0.0026611685752868652, "learning_rate": 6.612903225806452e-06, "loss": 0.0001, "step": 2510 }, { "epoch": 39.01016129032258, "grad_norm": 0.0027944843750447035, "learning_rate": 6.594982078853047e-06, "loss": 0.0002, "step": 2520 }, { "epoch": 39.01016129032258, "eval_accuracy": 0.6451612903225806, "eval_loss": 2.590102195739746, "eval_runtime": 5.6876, "eval_samples_per_second": 5.451, "eval_steps_per_second": 1.407, "step": 2520 }, { "epoch": 40.001612903225805, "grad_norm": 0.002954120049253106, "learning_rate": 6.577060931899643e-06, "loss": 0.0001, "step": 2530 }, { "epoch": 40.00322580645161, "grad_norm": 0.0023518831003457308, "learning_rate": 6.5591397849462365e-06, "loss": 0.0001, "step": 2540 }, { "epoch": 40.00483870967742, "grad_norm": 0.0016860960749909282, "learning_rate": 6.541218637992832e-06, "loss": 0.0001, "step": 2550 }, { "epoch": 40.00645161290323, "grad_norm": 0.0019111360888928175, "learning_rate": 6.523297491039428e-06, "loss": 0.0001, "step": 2560 }, { "epoch": 40.00806451612903, "grad_norm": 0.0018520368030294776, "learning_rate": 6.5053763440860214e-06, "loss": 0.0001, "step": 2570 }, { "epoch": 40.00967741935484, "grad_norm": 0.001493333256803453, "learning_rate": 6.4874551971326176e-06, "loss": 0.0001, "step": 2580 }, { "epoch": 40.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.5042996406555176, "eval_runtime": 5.8667, "eval_samples_per_second": 5.284, "eval_steps_per_second": 1.364, "step": 2583 }, { "epoch": 41.00112903225806, "grad_norm": 0.0074190120212733746, "learning_rate": 6.469534050179213e-06, "loss": 0.0001, "step": 2590 }, { "epoch": 41.00274193548387, "grad_norm": 0.0020655347034335136, "learning_rate": 6.451612903225806e-06, "loss": 0.0001, "step": 2600 }, { "epoch": 41.00435483870968, "grad_norm": 0.0019045746885240078, "learning_rate": 6.4336917562724025e-06, "loss": 0.0001, "step": 2610 }, { "epoch": 41.005967741935486, "grad_norm": 0.00646065641194582, "learning_rate": 6.415770609318996e-06, "loss": 0.0001, "step": 2620 }, { "epoch": 41.00758064516129, "grad_norm": 0.004453559406101704, "learning_rate": 6.397849462365592e-06, "loss": 0.0001, "step": 2630 }, { "epoch": 41.009193548387096, "grad_norm": 0.0011288289679214358, "learning_rate": 6.379928315412187e-06, "loss": 0.0001, "step": 2640 }, { "epoch": 41.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.5424225330352783, "eval_runtime": 5.8227, "eval_samples_per_second": 5.324, "eval_steps_per_second": 1.374, "step": 2646 }, { "epoch": 42.00064516129032, "grad_norm": 0.0038594440557062626, "learning_rate": 6.362007168458782e-06, "loss": 0.0001, "step": 2650 }, { "epoch": 42.00225806451613, "grad_norm": 0.00955984741449356, "learning_rate": 6.344086021505377e-06, "loss": 0.0998, "step": 2660 }, { "epoch": 42.00387096774193, "grad_norm": 0.002704484388232231, "learning_rate": 6.326164874551972e-06, "loss": 0.1487, "step": 2670 }, { "epoch": 42.005483870967744, "grad_norm": 0.00910151656717062, "learning_rate": 6.308243727598567e-06, "loss": 0.0, "step": 2680 }, { "epoch": 42.00709677419355, "grad_norm": 0.0022732370998710394, "learning_rate": 6.290322580645162e-06, "loss": 0.0001, "step": 2690 }, { "epoch": 42.008709677419354, "grad_norm": 0.0034803785383701324, "learning_rate": 6.272401433691757e-06, "loss": 0.0001, "step": 2700 }, { "epoch": 42.01016129032258, "eval_accuracy": 0.5483870967741935, "eval_loss": 3.873997926712036, "eval_runtime": 5.6078, "eval_samples_per_second": 5.528, "eval_steps_per_second": 1.427, "step": 2709 }, { "epoch": 43.00016129032258, "grad_norm": 1.11460280418396, "learning_rate": 6.254480286738352e-06, "loss": 0.0039, "step": 2710 }, { "epoch": 43.001774193548385, "grad_norm": 0.004692637361586094, "learning_rate": 6.236559139784947e-06, "loss": 0.0001, "step": 2720 }, { "epoch": 43.00338709677419, "grad_norm": 0.0027625139337033033, "learning_rate": 6.218637992831542e-06, "loss": 0.0001, "step": 2730 }, { "epoch": 43.005, "grad_norm": 0.021269548684358597, "learning_rate": 6.200716845878137e-06, "loss": 0.0001, "step": 2740 }, { "epoch": 43.00661290322581, "grad_norm": 0.0014857338974252343, "learning_rate": 6.182795698924732e-06, "loss": 0.0, "step": 2750 }, { "epoch": 43.00822580645161, "grad_norm": 0.017131278291344643, "learning_rate": 6.164874551971327e-06, "loss": 0.0001, "step": 2760 }, { "epoch": 43.00983870967742, "grad_norm": 0.006271045655012131, "learning_rate": 6.1469534050179216e-06, "loss": 0.0001, "step": 2770 }, { "epoch": 43.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.5725624561309814, "eval_runtime": 5.6882, "eval_samples_per_second": 5.45, "eval_steps_per_second": 1.406, "step": 2772 }, { "epoch": 44.001290322580644, "grad_norm": 0.0033605240751057863, "learning_rate": 6.129032258064517e-06, "loss": 0.0001, "step": 2780 }, { "epoch": 44.00290322580645, "grad_norm": 0.0013236167142167687, "learning_rate": 6.111111111111112e-06, "loss": 0.1401, "step": 2790 }, { "epoch": 44.00451612903226, "grad_norm": 0.0010612865444272757, "learning_rate": 6.0931899641577065e-06, "loss": 0.0304, "step": 2800 }, { "epoch": 44.006129032258066, "grad_norm": 0.004824892617762089, "learning_rate": 6.075268817204302e-06, "loss": 0.0001, "step": 2810 }, { "epoch": 44.00774193548387, "grad_norm": 0.0009301839163526893, "learning_rate": 6.057347670250897e-06, "loss": 0.0005, "step": 2820 }, { "epoch": 44.009354838709676, "grad_norm": 0.002125853206962347, "learning_rate": 6.0394265232974914e-06, "loss": 0.0004, "step": 2830 }, { "epoch": 44.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.218432664871216, "eval_runtime": 5.8407, "eval_samples_per_second": 5.308, "eval_steps_per_second": 1.37, "step": 2835 }, { "epoch": 45.0008064516129, "grad_norm": 0.0029689755756407976, "learning_rate": 6.021505376344087e-06, "loss": 0.0001, "step": 2840 }, { "epoch": 45.00241935483871, "grad_norm": 0.0027505734469741583, "learning_rate": 6.003584229390681e-06, "loss": 0.0, "step": 2850 }, { "epoch": 45.00403225806452, "grad_norm": 0.0067787147127091885, "learning_rate": 5.985663082437276e-06, "loss": 0.0031, "step": 2860 }, { "epoch": 45.005645161290325, "grad_norm": 0.001522620441392064, "learning_rate": 5.967741935483872e-06, "loss": 0.0001, "step": 2870 }, { "epoch": 45.00725806451613, "grad_norm": 0.0020584878511726856, "learning_rate": 5.949820788530466e-06, "loss": 0.0001, "step": 2880 }, { "epoch": 45.008870967741935, "grad_norm": 0.0014197188429534435, "learning_rate": 5.931899641577061e-06, "loss": 0.0, "step": 2890 }, { "epoch": 45.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.3346645832061768, "eval_runtime": 6.491, "eval_samples_per_second": 4.776, "eval_steps_per_second": 1.232, "step": 2898 }, { "epoch": 46.00032258064516, "grad_norm": 0.0014989773044362664, "learning_rate": 5.9139784946236566e-06, "loss": 0.0, "step": 2900 }, { "epoch": 46.001935483870966, "grad_norm": 0.0007594978087581694, "learning_rate": 5.896057347670251e-06, "loss": 0.0001, "step": 2910 }, { "epoch": 46.00354838709677, "grad_norm": 0.010089215822517872, "learning_rate": 5.878136200716846e-06, "loss": 0.0007, "step": 2920 }, { "epoch": 46.00516129032258, "grad_norm": 0.0010494289454072714, "learning_rate": 5.8602150537634415e-06, "loss": 0.0, "step": 2930 }, { "epoch": 46.00677419354839, "grad_norm": 0.0013317839475348592, "learning_rate": 5.842293906810036e-06, "loss": 0.0001, "step": 2940 }, { "epoch": 46.00838709677419, "grad_norm": 0.001914470107294619, "learning_rate": 5.824372759856631e-06, "loss": 0.0869, "step": 2950 }, { "epoch": 46.01, "grad_norm": 0.0007147088763304055, "learning_rate": 5.806451612903226e-06, "loss": 0.0001, "step": 2960 }, { "epoch": 46.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.8206071853637695, "eval_runtime": 6.9978, "eval_samples_per_second": 4.43, "eval_steps_per_second": 1.143, "step": 2961 }, { "epoch": 47.001451612903224, "grad_norm": 0.0012741729151457548, "learning_rate": 5.788530465949821e-06, "loss": 0.0, "step": 2970 }, { "epoch": 47.00306451612903, "grad_norm": 0.0006857850239612162, "learning_rate": 5.770609318996416e-06, "loss": 0.0, "step": 2980 }, { "epoch": 47.00467741935484, "grad_norm": 0.007349266204982996, "learning_rate": 5.752688172043011e-06, "loss": 0.0, "step": 2990 }, { "epoch": 47.00629032258065, "grad_norm": 0.002241634065285325, "learning_rate": 5.734767025089606e-06, "loss": 0.0, "step": 3000 }, { "epoch": 47.00790322580645, "grad_norm": 0.002284340327605605, "learning_rate": 5.716845878136201e-06, "loss": 0.0001, "step": 3010 }, { "epoch": 47.00951612903226, "grad_norm": 0.0011396215995773673, "learning_rate": 5.698924731182796e-06, "loss": 0.0, "step": 3020 }, { "epoch": 47.01016129032258, "eval_accuracy": 0.5483870967741935, "eval_loss": 3.7951278686523438, "eval_runtime": 5.8249, "eval_samples_per_second": 5.322, "eval_steps_per_second": 1.373, "step": 3024 }, { "epoch": 48.00096774193548, "grad_norm": 0.0007772017270326614, "learning_rate": 5.681003584229391e-06, "loss": 0.0, "step": 3030 }, { "epoch": 48.00258064516129, "grad_norm": 0.0013548877323046327, "learning_rate": 5.663082437275986e-06, "loss": 0.0, "step": 3040 }, { "epoch": 48.0041935483871, "grad_norm": 0.0007262414437718689, "learning_rate": 5.645161290322582e-06, "loss": 0.0001, "step": 3050 }, { "epoch": 48.005806451612905, "grad_norm": 0.005043108947575092, "learning_rate": 5.627240143369176e-06, "loss": 0.1849, "step": 3060 }, { "epoch": 48.00741935483871, "grad_norm": 0.002217673696577549, "learning_rate": 5.609318996415771e-06, "loss": 0.0198, "step": 3070 }, { "epoch": 48.009032258064515, "grad_norm": 0.0038042226806282997, "learning_rate": 5.591397849462365e-06, "loss": 0.0, "step": 3080 }, { "epoch": 48.01016129032258, "eval_accuracy": 0.6774193548387096, "eval_loss": 2.760403871536255, "eval_runtime": 6.5601, "eval_samples_per_second": 4.726, "eval_steps_per_second": 1.219, "step": 3087 }, { "epoch": 49.00048387096774, "grad_norm": 0.0015270761214196682, "learning_rate": 5.5734767025089606e-06, "loss": 0.0068, "step": 3090 }, { "epoch": 49.00209677419355, "grad_norm": 0.0013837311416864395, "learning_rate": 5.555555555555557e-06, "loss": 0.0, "step": 3100 }, { "epoch": 49.00370967741935, "grad_norm": 0.0016790084773674607, "learning_rate": 5.53763440860215e-06, "loss": 0.0, "step": 3110 }, { "epoch": 49.005322580645164, "grad_norm": 0.06654971837997437, "learning_rate": 5.5197132616487455e-06, "loss": 0.0001, "step": 3120 }, { "epoch": 49.00693548387097, "grad_norm": 0.0037361367139965296, "learning_rate": 5.501792114695342e-06, "loss": 0.0001, "step": 3130 }, { "epoch": 49.00854838709677, "grad_norm": 0.0012724808184430003, "learning_rate": 5.483870967741935e-06, "loss": 0.0007, "step": 3140 }, { "epoch": 49.01016129032258, "grad_norm": 0.024349622428417206, "learning_rate": 5.465949820788531e-06, "loss": 0.0, "step": 3150 }, { "epoch": 49.01016129032258, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.3948893547058105, "eval_runtime": 6.5997, "eval_samples_per_second": 4.697, "eval_steps_per_second": 1.212, "step": 3150 }, { "epoch": 50.001612903225805, "grad_norm": 0.000717709306627512, "learning_rate": 5.4480286738351265e-06, "loss": 0.0001, "step": 3160 }, { "epoch": 50.00322580645161, "grad_norm": 0.0006393700023181736, "learning_rate": 5.43010752688172e-06, "loss": 0.0, "step": 3170 }, { "epoch": 50.00483870967742, "grad_norm": 0.0005427003488875926, "learning_rate": 5.412186379928316e-06, "loss": 0.0, "step": 3180 }, { "epoch": 50.00645161290323, "grad_norm": 0.0008178422576747835, "learning_rate": 5.3942652329749115e-06, "loss": 0.0001, "step": 3190 }, { "epoch": 50.00806451612903, "grad_norm": 0.011196687817573547, "learning_rate": 5.376344086021506e-06, "loss": 0.0, "step": 3200 }, { "epoch": 50.00967741935484, "grad_norm": 0.0005684493808075786, "learning_rate": 5.358422939068101e-06, "loss": 0.0, "step": 3210 }, { "epoch": 50.01016129032258, "eval_accuracy": 0.6774193548387096, "eval_loss": 2.894679069519043, "eval_runtime": 6.7969, "eval_samples_per_second": 4.561, "eval_steps_per_second": 1.177, "step": 3213 }, { "epoch": 51.00112903225806, "grad_norm": 0.01790892891585827, "learning_rate": 5.340501792114696e-06, "loss": 0.0, "step": 3220 }, { "epoch": 51.00274193548387, "grad_norm": 0.0009316179784946144, "learning_rate": 5.322580645161291e-06, "loss": 0.0, "step": 3230 }, { "epoch": 51.00435483870968, "grad_norm": 0.0011778518091887236, "learning_rate": 5.304659498207886e-06, "loss": 0.0, "step": 3240 }, { "epoch": 51.005967741935486, "grad_norm": 0.0008342901128344238, "learning_rate": 5.286738351254481e-06, "loss": 0.0, "step": 3250 }, { "epoch": 51.00758064516129, "grad_norm": 0.0007168396259658039, "learning_rate": 5.268817204301076e-06, "loss": 0.0001, "step": 3260 }, { "epoch": 51.009193548387096, "grad_norm": 0.0004423597129061818, "learning_rate": 5.250896057347671e-06, "loss": 0.0, "step": 3270 }, { "epoch": 51.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 4.241269111633301, "eval_runtime": 5.7942, "eval_samples_per_second": 5.35, "eval_steps_per_second": 1.381, "step": 3276 }, { "epoch": 52.00064516129032, "grad_norm": 0.0022791242226958275, "learning_rate": 5.232974910394266e-06, "loss": 0.1851, "step": 3280 }, { "epoch": 52.00225806451613, "grad_norm": 0.0007295863470062613, "learning_rate": 5.215053763440861e-06, "loss": 0.0, "step": 3290 }, { "epoch": 52.00387096774193, "grad_norm": 0.0007710527861490846, "learning_rate": 5.197132616487456e-06, "loss": 0.0001, "step": 3300 }, { "epoch": 52.005483870967744, "grad_norm": 0.0014357652980834246, "learning_rate": 5.17921146953405e-06, "loss": 0.0, "step": 3310 }, { "epoch": 52.00709677419355, "grad_norm": 0.00046678713988512754, "learning_rate": 5.161290322580646e-06, "loss": 0.0003, "step": 3320 }, { "epoch": 52.008709677419354, "grad_norm": 0.0011630827793851495, "learning_rate": 5.143369175627241e-06, "loss": 0.1268, "step": 3330 }, { "epoch": 52.01016129032258, "eval_accuracy": 0.7096774193548387, "eval_loss": 2.33390736579895, "eval_runtime": 6.7985, "eval_samples_per_second": 4.56, "eval_steps_per_second": 1.177, "step": 3339 }, { "epoch": 53.00016129032258, "grad_norm": 0.0008074539946392179, "learning_rate": 5.125448028673835e-06, "loss": 0.0002, "step": 3340 }, { "epoch": 53.001774193548385, "grad_norm": 0.002845118287950754, "learning_rate": 5.1075268817204305e-06, "loss": 0.0001, "step": 3350 }, { "epoch": 53.00338709677419, "grad_norm": 0.0013125489931553602, "learning_rate": 5.089605734767026e-06, "loss": 0.0001, "step": 3360 }, { "epoch": 53.005, "grad_norm": 0.0009073872934095562, "learning_rate": 5.07168458781362e-06, "loss": 0.0, "step": 3370 }, { "epoch": 53.00661290322581, "grad_norm": 0.0011437920620664954, "learning_rate": 5.0537634408602155e-06, "loss": 0.0, "step": 3380 }, { "epoch": 53.00822580645161, "grad_norm": 0.002645552856847644, "learning_rate": 5.035842293906811e-06, "loss": 0.0, "step": 3390 }, { "epoch": 53.00983870967742, "grad_norm": 0.0009675602777861059, "learning_rate": 5.017921146953405e-06, "loss": 0.0, "step": 3400 }, { "epoch": 53.01016129032258, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.47694730758667, "eval_runtime": 6.8478, "eval_samples_per_second": 4.527, "eval_steps_per_second": 1.168, "step": 3402 }, { "epoch": 54.001290322580644, "grad_norm": 0.0010983716929331422, "learning_rate": 5e-06, "loss": 0.0, "step": 3410 }, { "epoch": 54.00290322580645, "grad_norm": 0.018816445022821426, "learning_rate": 4.982078853046595e-06, "loss": 0.0, "step": 3420 }, { "epoch": 54.00451612903226, "grad_norm": 0.0007217507809400558, "learning_rate": 4.964157706093191e-06, "loss": 0.0, "step": 3430 }, { "epoch": 54.006129032258066, "grad_norm": 0.0012977593578398228, "learning_rate": 4.946236559139785e-06, "loss": 0.0, "step": 3440 }, { "epoch": 54.00774193548387, "grad_norm": 0.0013114138273522258, "learning_rate": 4.928315412186381e-06, "loss": 0.0, "step": 3450 }, { "epoch": 54.009354838709676, "grad_norm": 0.0047512114979326725, "learning_rate": 4.910394265232976e-06, "loss": 0.0, "step": 3460 }, { "epoch": 54.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.514246702194214, "eval_runtime": 7.1937, "eval_samples_per_second": 4.309, "eval_steps_per_second": 1.112, "step": 3465 }, { "epoch": 55.0008064516129, "grad_norm": 0.35970592498779297, "learning_rate": 4.89247311827957e-06, "loss": 0.0001, "step": 3470 }, { "epoch": 55.00241935483871, "grad_norm": 0.004642422776669264, "learning_rate": 4.8745519713261655e-06, "loss": 0.0, "step": 3480 }, { "epoch": 55.00403225806452, "grad_norm": 0.0006195760797709227, "learning_rate": 4.85663082437276e-06, "loss": 0.1694, "step": 3490 }, { "epoch": 55.005645161290325, "grad_norm": 0.0008280796464532614, "learning_rate": 4.838709677419355e-06, "loss": 0.0, "step": 3500 }, { "epoch": 55.00725806451613, "grad_norm": 0.00129673071205616, "learning_rate": 4.8207885304659505e-06, "loss": 0.0007, "step": 3510 }, { "epoch": 55.008870967741935, "grad_norm": 0.0007499546627514064, "learning_rate": 4.802867383512545e-06, "loss": 0.0, "step": 3520 }, { "epoch": 55.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 3.5718395709991455, "eval_runtime": 5.6673, "eval_samples_per_second": 5.47, "eval_steps_per_second": 1.412, "step": 3528 }, { "epoch": 56.00032258064516, "grad_norm": 0.0016676088562235236, "learning_rate": 4.78494623655914e-06, "loss": 0.1411, "step": 3530 }, { "epoch": 56.001935483870966, "grad_norm": 0.001161010004580021, "learning_rate": 4.767025089605735e-06, "loss": 0.0, "step": 3540 }, { "epoch": 56.00354838709677, "grad_norm": 0.00100256921723485, "learning_rate": 4.74910394265233e-06, "loss": 0.0, "step": 3550 }, { "epoch": 56.00516129032258, "grad_norm": 1.753328800201416, "learning_rate": 4.731182795698925e-06, "loss": 0.0002, "step": 3560 }, { "epoch": 56.00677419354839, "grad_norm": 0.000907116977032274, "learning_rate": 4.71326164874552e-06, "loss": 0.0, "step": 3570 }, { "epoch": 56.00838709677419, "grad_norm": 0.0007463452639058232, "learning_rate": 4.695340501792115e-06, "loss": 0.0, "step": 3580 }, { "epoch": 56.01, "grad_norm": 0.025928659364581108, "learning_rate": 4.67741935483871e-06, "loss": 0.0036, "step": 3590 }, { "epoch": 56.01016129032258, "eval_accuracy": 0.4838709677419355, "eval_loss": 4.186675071716309, "eval_runtime": 5.5904, "eval_samples_per_second": 5.545, "eval_steps_per_second": 1.431, "step": 3591 }, { "epoch": 57.001451612903224, "grad_norm": 0.0010404684580862522, "learning_rate": 4.659498207885305e-06, "loss": 0.0, "step": 3600 }, { "epoch": 57.00306451612903, "grad_norm": 0.0005891444161534309, "learning_rate": 4.6415770609319e-06, "loss": 0.0001, "step": 3610 }, { "epoch": 57.00467741935484, "grad_norm": 0.0017374068265780807, "learning_rate": 4.623655913978495e-06, "loss": 0.0, "step": 3620 }, { "epoch": 57.00629032258065, "grad_norm": 0.0034232318866997957, "learning_rate": 4.60573476702509e-06, "loss": 0.0, "step": 3630 }, { "epoch": 57.00790322580645, "grad_norm": 0.0007506791735067964, "learning_rate": 4.587813620071685e-06, "loss": 0.0, "step": 3640 }, { "epoch": 57.00951612903226, "grad_norm": 0.005380168557167053, "learning_rate": 4.56989247311828e-06, "loss": 0.0026, "step": 3650 }, { "epoch": 57.01016129032258, "eval_accuracy": 0.6451612903225806, "eval_loss": 2.7411253452301025, "eval_runtime": 5.9841, "eval_samples_per_second": 5.18, "eval_steps_per_second": 1.337, "step": 3654 }, { "epoch": 58.00096774193548, "grad_norm": 0.001838170806877315, "learning_rate": 4.551971326164875e-06, "loss": 0.0206, "step": 3660 }, { "epoch": 58.00258064516129, "grad_norm": 1.0681326389312744, "learning_rate": 4.5340501792114695e-06, "loss": 0.0001, "step": 3670 }, { "epoch": 58.0041935483871, "grad_norm": 0.0008572031510993838, "learning_rate": 4.516129032258065e-06, "loss": 0.0358, "step": 3680 }, { "epoch": 58.005806451612905, "grad_norm": 0.0013548154383897781, "learning_rate": 4.49820788530466e-06, "loss": 0.0, "step": 3690 }, { "epoch": 58.00741935483871, "grad_norm": 0.0011426446726545691, "learning_rate": 4.480286738351255e-06, "loss": 0.0, "step": 3700 }, { "epoch": 58.009032258064515, "grad_norm": 0.001137315295636654, "learning_rate": 4.46236559139785e-06, "loss": 0.0, "step": 3710 }, { "epoch": 58.01016129032258, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.046436309814453, "eval_runtime": 5.8997, "eval_samples_per_second": 5.255, "eval_steps_per_second": 1.356, "step": 3717 }, { "epoch": 59.00048387096774, "grad_norm": 0.005620701238512993, "learning_rate": 4.444444444444444e-06, "loss": 0.0, "step": 3720 }, { "epoch": 59.00209677419355, "grad_norm": 0.0008327278774231672, "learning_rate": 4.42652329749104e-06, "loss": 0.0001, "step": 3730 }, { "epoch": 59.00370967741935, "grad_norm": 0.0027366376016288996, "learning_rate": 4.408602150537635e-06, "loss": 0.1778, "step": 3740 }, { "epoch": 59.005322580645164, "grad_norm": 0.0034029860980808735, "learning_rate": 4.39068100358423e-06, "loss": 0.0, "step": 3750 }, { "epoch": 59.00693548387097, "grad_norm": 0.003737010294571519, "learning_rate": 4.372759856630825e-06, "loss": 0.2502, "step": 3760 }, { "epoch": 59.00854838709677, "grad_norm": 0.0034590535797178745, "learning_rate": 4.35483870967742e-06, "loss": 0.0001, "step": 3770 }, { "epoch": 59.01016129032258, "grad_norm": 0.00876407790929079, "learning_rate": 4.336917562724015e-06, "loss": 0.0001, "step": 3780 }, { "epoch": 59.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.6255438327789307, "eval_runtime": 5.7494, "eval_samples_per_second": 5.392, "eval_steps_per_second": 1.391, "step": 3780 }, { "epoch": 60.001612903225805, "grad_norm": 0.01270359754562378, "learning_rate": 4.31899641577061e-06, "loss": 0.0001, "step": 3790 }, { "epoch": 60.00322580645161, "grad_norm": 0.004537724889814854, "learning_rate": 4.3010752688172045e-06, "loss": 0.0096, "step": 3800 }, { "epoch": 60.00483870967742, "grad_norm": 761.9090576171875, "learning_rate": 4.2831541218638e-06, "loss": 0.0731, "step": 3810 }, { "epoch": 60.00645161290323, "grad_norm": 0.0018560753669589758, "learning_rate": 4.265232974910394e-06, "loss": 0.0001, "step": 3820 }, { "epoch": 60.00806451612903, "grad_norm": 0.005290852393954992, "learning_rate": 4.2473118279569895e-06, "loss": 0.0026, "step": 3830 }, { "epoch": 60.00967741935484, "grad_norm": 0.005584435071796179, "learning_rate": 4.229390681003585e-06, "loss": 0.0, "step": 3840 }, { "epoch": 60.01016129032258, "eval_accuracy": 0.5161290322580645, "eval_loss": 4.729151248931885, "eval_runtime": 5.7587, "eval_samples_per_second": 5.383, "eval_steps_per_second": 1.389, "step": 3843 }, { "epoch": 61.00112903225806, "grad_norm": 0.004037478938698769, "learning_rate": 4.211469534050179e-06, "loss": 0.0002, "step": 3850 }, { "epoch": 61.00274193548387, "grad_norm": 0.296836793422699, "learning_rate": 4.193548387096774e-06, "loss": 0.011, "step": 3860 }, { "epoch": 61.00435483870968, "grad_norm": 0.0005537400138564408, "learning_rate": 4.17562724014337e-06, "loss": 0.107, "step": 3870 }, { "epoch": 61.005967741935486, "grad_norm": 0.0026548614259809256, "learning_rate": 4.157706093189964e-06, "loss": 0.0001, "step": 3880 }, { "epoch": 61.00758064516129, "grad_norm": 0.00497156148776412, "learning_rate": 4.139784946236559e-06, "loss": 0.0045, "step": 3890 }, { "epoch": 61.009193548387096, "grad_norm": 0.3038000762462616, "learning_rate": 4.121863799283155e-06, "loss": 0.1406, "step": 3900 }, { "epoch": 61.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.987642765045166, "eval_runtime": 5.9366, "eval_samples_per_second": 5.222, "eval_steps_per_second": 1.348, "step": 3906 }, { "epoch": 62.00064516129032, "grad_norm": 0.0010510239517316222, "learning_rate": 4.103942652329749e-06, "loss": 0.0, "step": 3910 }, { "epoch": 62.00225806451613, "grad_norm": 0.0029426671098917723, "learning_rate": 4.086021505376344e-06, "loss": 0.0357, "step": 3920 }, { "epoch": 62.00387096774193, "grad_norm": 0.006547905970364809, "learning_rate": 4.0681003584229395e-06, "loss": 0.0006, "step": 3930 }, { "epoch": 62.005483870967744, "grad_norm": 0.003717880230396986, "learning_rate": 4.050179211469534e-06, "loss": 0.0001, "step": 3940 }, { "epoch": 62.00709677419355, "grad_norm": 0.0009262536186724901, "learning_rate": 4.032258064516129e-06, "loss": 0.0, "step": 3950 }, { "epoch": 62.008709677419354, "grad_norm": 0.0012034719111397862, "learning_rate": 4.0143369175627245e-06, "loss": 0.0, "step": 3960 }, { "epoch": 62.01016129032258, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.409881114959717, "eval_runtime": 5.8393, "eval_samples_per_second": 5.309, "eval_steps_per_second": 1.37, "step": 3969 }, { "epoch": 63.00016129032258, "grad_norm": 0.0015827568713575602, "learning_rate": 3.996415770609319e-06, "loss": 0.0, "step": 3970 }, { "epoch": 63.001774193548385, "grad_norm": 0.0013760023284703493, "learning_rate": 3.978494623655914e-06, "loss": 0.0, "step": 3980 }, { "epoch": 63.00338709677419, "grad_norm": 0.004404567647725344, "learning_rate": 3.960573476702509e-06, "loss": 0.0, "step": 3990 }, { "epoch": 63.005, "grad_norm": 0.004297593142837286, "learning_rate": 3.942652329749105e-06, "loss": 0.0001, "step": 4000 }, { "epoch": 63.00661290322581, "grad_norm": 0.0003436031111050397, "learning_rate": 3.924731182795699e-06, "loss": 0.0332, "step": 4010 }, { "epoch": 63.00822580645161, "grad_norm": 0.0010513163870200515, "learning_rate": 3.906810035842294e-06, "loss": 0.0001, "step": 4020 }, { "epoch": 63.00983870967742, "grad_norm": 0.0028779839631170034, "learning_rate": 3.88888888888889e-06, "loss": 0.0, "step": 4030 }, { "epoch": 63.01016129032258, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.267378091812134, "eval_runtime": 5.9147, "eval_samples_per_second": 5.241, "eval_steps_per_second": 1.353, "step": 4032 }, { "epoch": 64.00129032258064, "grad_norm": 0.001035613240674138, "learning_rate": 3.870967741935484e-06, "loss": 0.0, "step": 4040 }, { "epoch": 64.00290322580645, "grad_norm": 0.0008846246637403965, "learning_rate": 3.853046594982079e-06, "loss": 0.0, "step": 4050 }, { "epoch": 64.00451612903225, "grad_norm": 0.0007107908022589982, "learning_rate": 3.8351254480286745e-06, "loss": 0.0, "step": 4060 }, { "epoch": 64.00612903225806, "grad_norm": 0.007056610658764839, "learning_rate": 3.817204301075269e-06, "loss": 0.0001, "step": 4070 }, { "epoch": 64.00774193548386, "grad_norm": 0.0007364367484115064, "learning_rate": 3.7992831541218638e-06, "loss": 0.0, "step": 4080 }, { "epoch": 64.00935483870968, "grad_norm": 0.001208829809911549, "learning_rate": 3.7813620071684594e-06, "loss": 0.0, "step": 4090 }, { "epoch": 64.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.9748694896698, "eval_runtime": 5.8975, "eval_samples_per_second": 5.256, "eval_steps_per_second": 1.357, "step": 4095 }, { "epoch": 65.0008064516129, "grad_norm": 0.0015768223674967885, "learning_rate": 3.763440860215054e-06, "loss": 0.0, "step": 4100 }, { "epoch": 65.00241935483871, "grad_norm": 0.0003682167152874172, "learning_rate": 3.7455197132616487e-06, "loss": 0.0, "step": 4110 }, { "epoch": 65.00403225806451, "grad_norm": 0.00040682387771084905, "learning_rate": 3.7275985663082444e-06, "loss": 0.0, "step": 4120 }, { "epoch": 65.00564516129032, "grad_norm": 0.0006791821797378361, "learning_rate": 3.7096774193548392e-06, "loss": 0.0, "step": 4130 }, { "epoch": 65.00725806451612, "grad_norm": 0.0023219408467411995, "learning_rate": 3.691756272401434e-06, "loss": 0.0, "step": 4140 }, { "epoch": 65.00887096774194, "grad_norm": 0.00033632866689004004, "learning_rate": 3.6738351254480293e-06, "loss": 0.0, "step": 4150 }, { "epoch": 65.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.3262135982513428, "eval_runtime": 5.6981, "eval_samples_per_second": 5.44, "eval_steps_per_second": 1.404, "step": 4158 }, { "epoch": 66.00032258064516, "grad_norm": 0.0009350565378554165, "learning_rate": 3.655913978494624e-06, "loss": 0.0032, "step": 4160 }, { "epoch": 66.00193548387097, "grad_norm": 0.001185098197311163, "learning_rate": 3.637992831541219e-06, "loss": 0.0, "step": 4170 }, { "epoch": 66.00354838709677, "grad_norm": 0.0022740501444786787, "learning_rate": 3.620071684587814e-06, "loss": 0.0, "step": 4180 }, { "epoch": 66.00516129032258, "grad_norm": 0.00037708552554249763, "learning_rate": 3.602150537634409e-06, "loss": 0.0001, "step": 4190 }, { "epoch": 66.00677419354838, "grad_norm": 0.0010945936664938927, "learning_rate": 3.584229390681004e-06, "loss": 0.0, "step": 4200 }, { "epoch": 66.0083870967742, "grad_norm": 0.015282081440091133, "learning_rate": 3.5663082437275988e-06, "loss": 0.0, "step": 4210 }, { "epoch": 66.01, "grad_norm": 0.015242286957800388, "learning_rate": 3.548387096774194e-06, "loss": 0.0, "step": 4220 }, { "epoch": 66.01016129032259, "eval_accuracy": 0.7096774193548387, "eval_loss": 2.5556366443634033, "eval_runtime": 5.7924, "eval_samples_per_second": 5.352, "eval_steps_per_second": 1.381, "step": 4221 }, { "epoch": 67.00145161290322, "grad_norm": 0.0018096225103363395, "learning_rate": 3.530465949820789e-06, "loss": 0.0, "step": 4230 }, { "epoch": 67.00306451612903, "grad_norm": 0.0007935313042253256, "learning_rate": 3.5125448028673837e-06, "loss": 0.0015, "step": 4240 }, { "epoch": 67.00467741935483, "grad_norm": 0.004196200519800186, "learning_rate": 3.494623655913979e-06, "loss": 0.0, "step": 4250 }, { "epoch": 67.00629032258064, "grad_norm": 0.004180178511887789, "learning_rate": 3.4767025089605738e-06, "loss": 0.0, "step": 4260 }, { "epoch": 67.00790322580646, "grad_norm": 0.008008614182472229, "learning_rate": 3.4587813620071686e-06, "loss": 0.183, "step": 4270 }, { "epoch": 67.00951612903226, "grad_norm": 0.02671939507126808, "learning_rate": 3.440860215053764e-06, "loss": 0.2639, "step": 4280 }, { "epoch": 67.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.6953513622283936, "eval_runtime": 6.1771, "eval_samples_per_second": 5.019, "eval_steps_per_second": 1.295, "step": 4284 }, { "epoch": 68.00096774193548, "grad_norm": 0.9249791502952576, "learning_rate": 3.4229390681003587e-06, "loss": 0.1692, "step": 4290 }, { "epoch": 68.00258064516129, "grad_norm": 437.2980651855469, "learning_rate": 3.4050179211469536e-06, "loss": 0.3198, "step": 4300 }, { "epoch": 68.0041935483871, "grad_norm": 0.0018597375601530075, "learning_rate": 3.3870967741935484e-06, "loss": 0.0001, "step": 4310 }, { "epoch": 68.0058064516129, "grad_norm": 0.0013708691112697124, "learning_rate": 3.3691756272401437e-06, "loss": 0.1683, "step": 4320 }, { "epoch": 68.0074193548387, "grad_norm": 0.005116583313792944, "learning_rate": 3.3512544802867385e-06, "loss": 0.0, "step": 4330 }, { "epoch": 68.00903225806452, "grad_norm": 0.7601214051246643, "learning_rate": 3.3333333333333333e-06, "loss": 0.0011, "step": 4340 }, { "epoch": 68.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.2776241302490234, "eval_runtime": 6.6228, "eval_samples_per_second": 4.681, "eval_steps_per_second": 1.208, "step": 4347 }, { "epoch": 69.00048387096774, "grad_norm": 0.004644962027668953, "learning_rate": 3.3154121863799286e-06, "loss": 0.0, "step": 4350 }, { "epoch": 69.00209677419355, "grad_norm": 0.0013096232432872057, "learning_rate": 3.2974910394265234e-06, "loss": 0.0, "step": 4360 }, { "epoch": 69.00370967741935, "grad_norm": 0.001277285278774798, "learning_rate": 3.2795698924731183e-06, "loss": 0.0, "step": 4370 }, { "epoch": 69.00532258064516, "grad_norm": 0.0017252061516046524, "learning_rate": 3.261648745519714e-06, "loss": 0.0, "step": 4380 }, { "epoch": 69.00693548387096, "grad_norm": 0.0006072830292396247, "learning_rate": 3.2437275985663088e-06, "loss": 0.0, "step": 4390 }, { "epoch": 69.00854838709678, "grad_norm": 0.0008386512636207044, "learning_rate": 3.225806451612903e-06, "loss": 0.0, "step": 4400 }, { "epoch": 69.01016129032259, "grad_norm": 0.0009972426341846585, "learning_rate": 3.207885304659498e-06, "loss": 0.0, "step": 4410 }, { "epoch": 69.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.6619575023651123, "eval_runtime": 6.1776, "eval_samples_per_second": 5.018, "eval_steps_per_second": 1.295, "step": 4410 }, { "epoch": 70.0016129032258, "grad_norm": 0.0013387261424213648, "learning_rate": 3.1899641577060937e-06, "loss": 0.0, "step": 4420 }, { "epoch": 70.00322580645161, "grad_norm": 0.0005100954440422356, "learning_rate": 3.1720430107526885e-06, "loss": 0.0, "step": 4430 }, { "epoch": 70.00483870967741, "grad_norm": 0.0009658474591560662, "learning_rate": 3.1541218637992834e-06, "loss": 0.0, "step": 4440 }, { "epoch": 70.00645161290322, "grad_norm": 0.002550078323110938, "learning_rate": 3.1362007168458786e-06, "loss": 0.0, "step": 4450 }, { "epoch": 70.00806451612904, "grad_norm": 0.0013757863780483603, "learning_rate": 3.1182795698924735e-06, "loss": 0.0, "step": 4460 }, { "epoch": 70.00967741935484, "grad_norm": 0.0012467859778553247, "learning_rate": 3.1003584229390683e-06, "loss": 0.0, "step": 4470 }, { "epoch": 70.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.5887346267700195, "eval_runtime": 86.42, "eval_samples_per_second": 0.359, "eval_steps_per_second": 0.093, "step": 4473 }, { "epoch": 71.00112903225806, "grad_norm": 0.0008130945498123765, "learning_rate": 3.0824372759856636e-06, "loss": 0.0, "step": 4480 }, { "epoch": 71.00274193548387, "grad_norm": 0.058679137378931046, "learning_rate": 3.0645161290322584e-06, "loss": 0.0001, "step": 4490 }, { "epoch": 71.00435483870967, "grad_norm": 0.0005214307457208633, "learning_rate": 3.0465949820788532e-06, "loss": 0.0002, "step": 4500 }, { "epoch": 71.00596774193548, "grad_norm": 0.08657626807689667, "learning_rate": 3.0286738351254485e-06, "loss": 0.0, "step": 4510 }, { "epoch": 71.00758064516128, "grad_norm": 0.0007953665335662663, "learning_rate": 3.0107526881720433e-06, "loss": 0.1117, "step": 4520 }, { "epoch": 71.0091935483871, "grad_norm": 0.0006889233482070267, "learning_rate": 2.992831541218638e-06, "loss": 0.0, "step": 4530 }, { "epoch": 71.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.504023551940918, "eval_runtime": 6.8749, "eval_samples_per_second": 4.509, "eval_steps_per_second": 1.164, "step": 4536 }, { "epoch": 72.00064516129032, "grad_norm": 0.0038752758409827948, "learning_rate": 2.974910394265233e-06, "loss": 0.238, "step": 4540 }, { "epoch": 72.00225806451613, "grad_norm": 0.0008320348570123315, "learning_rate": 2.9569892473118283e-06, "loss": 0.0, "step": 4550 }, { "epoch": 72.00387096774193, "grad_norm": 0.0052330815233290195, "learning_rate": 2.939068100358423e-06, "loss": 0.0, "step": 4560 }, { "epoch": 72.00548387096774, "grad_norm": 0.0010985853150486946, "learning_rate": 2.921146953405018e-06, "loss": 0.0, "step": 4570 }, { "epoch": 72.00709677419354, "grad_norm": 0.0009674577158875763, "learning_rate": 2.903225806451613e-06, "loss": 0.0, "step": 4580 }, { "epoch": 72.00870967741936, "grad_norm": 0.002207015873864293, "learning_rate": 2.885304659498208e-06, "loss": 0.0, "step": 4590 }, { "epoch": 72.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 3.8665788173675537, "eval_runtime": 6.4838, "eval_samples_per_second": 4.781, "eval_steps_per_second": 1.234, "step": 4599 }, { "epoch": 73.00016129032258, "grad_norm": 0.0014342574868351221, "learning_rate": 2.867383512544803e-06, "loss": 0.0, "step": 4600 }, { "epoch": 73.00177419354839, "grad_norm": 0.0015508056385442615, "learning_rate": 2.849462365591398e-06, "loss": 0.0, "step": 4610 }, { "epoch": 73.00338709677419, "grad_norm": 0.001471922965720296, "learning_rate": 2.831541218637993e-06, "loss": 0.0001, "step": 4620 }, { "epoch": 73.005, "grad_norm": 0.0018374361097812653, "learning_rate": 2.813620071684588e-06, "loss": 0.0, "step": 4630 }, { "epoch": 73.0066129032258, "grad_norm": 0.002281290479004383, "learning_rate": 2.7956989247311827e-06, "loss": 0.0, "step": 4640 }, { "epoch": 73.00822580645162, "grad_norm": 0.0008221657481044531, "learning_rate": 2.7777777777777783e-06, "loss": 0.0, "step": 4650 }, { "epoch": 73.00983870967742, "grad_norm": 0.00182114087510854, "learning_rate": 2.7598566308243727e-06, "loss": 0.0, "step": 4660 }, { "epoch": 73.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.001722812652588, "eval_runtime": 63.2346, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.127, "step": 4662 }, { "epoch": 74.00129032258064, "grad_norm": 0.004445404279977083, "learning_rate": 2.7419354838709676e-06, "loss": 0.0, "step": 4670 }, { "epoch": 74.00290322580645, "grad_norm": 0.0008719922625459731, "learning_rate": 2.7240143369175633e-06, "loss": 0.0, "step": 4680 }, { "epoch": 74.00451612903225, "grad_norm": 0.0013860003091394901, "learning_rate": 2.706093189964158e-06, "loss": 0.0, "step": 4690 }, { "epoch": 74.00612903225806, "grad_norm": 0.003048549173399806, "learning_rate": 2.688172043010753e-06, "loss": 0.0, "step": 4700 }, { "epoch": 74.00774193548386, "grad_norm": 0.0007664980948902667, "learning_rate": 2.670250896057348e-06, "loss": 0.0, "step": 4710 }, { "epoch": 74.00935483870968, "grad_norm": 0.0011522058630362153, "learning_rate": 2.652329749103943e-06, "loss": 0.0, "step": 4720 }, { "epoch": 74.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 3.942169666290283, "eval_runtime": 6.761, "eval_samples_per_second": 4.585, "eval_steps_per_second": 1.183, "step": 4725 }, { "epoch": 75.0008064516129, "grad_norm": 0.0012468983186408877, "learning_rate": 2.634408602150538e-06, "loss": 0.0, "step": 4730 }, { "epoch": 75.00241935483871, "grad_norm": 0.0016760459402576089, "learning_rate": 2.616487455197133e-06, "loss": 0.0, "step": 4740 }, { "epoch": 75.00403225806451, "grad_norm": 0.0008701254264451563, "learning_rate": 2.598566308243728e-06, "loss": 0.0, "step": 4750 }, { "epoch": 75.00564516129032, "grad_norm": 0.0037713153287768364, "learning_rate": 2.580645161290323e-06, "loss": 0.0042, "step": 4760 }, { "epoch": 75.00725806451612, "grad_norm": 0.0011189499637112021, "learning_rate": 2.5627240143369176e-06, "loss": 0.0, "step": 4770 }, { "epoch": 75.00887096774194, "grad_norm": 0.008365134708583355, "learning_rate": 2.544802867383513e-06, "loss": 0.0001, "step": 4780 }, { "epoch": 75.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.539722442626953, "eval_runtime": 5.6404, "eval_samples_per_second": 5.496, "eval_steps_per_second": 1.418, "step": 4788 }, { "epoch": 76.00032258064516, "grad_norm": 0.0007567619904875755, "learning_rate": 2.5268817204301077e-06, "loss": 0.0, "step": 4790 }, { "epoch": 76.00193548387097, "grad_norm": 0.0011233144905418158, "learning_rate": 2.5089605734767026e-06, "loss": 0.0, "step": 4800 }, { "epoch": 76.00354838709677, "grad_norm": 0.00045829531154595315, "learning_rate": 2.4910394265232974e-06, "loss": 0.0005, "step": 4810 }, { "epoch": 76.00516129032258, "grad_norm": 0.0007880241610109806, "learning_rate": 2.4731182795698927e-06, "loss": 0.002, "step": 4820 }, { "epoch": 76.00677419354838, "grad_norm": 0.0007963485550135374, "learning_rate": 2.455197132616488e-06, "loss": 0.0, "step": 4830 }, { "epoch": 76.0083870967742, "grad_norm": 0.0005008154548704624, "learning_rate": 2.4372759856630828e-06, "loss": 0.0, "step": 4840 }, { "epoch": 76.01, "grad_norm": 0.0011406855192035437, "learning_rate": 2.4193548387096776e-06, "loss": 0.0, "step": 4850 }, { "epoch": 76.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.8404526710510254, "eval_runtime": 5.7904, "eval_samples_per_second": 5.354, "eval_steps_per_second": 1.382, "step": 4851 }, { "epoch": 77.00145161290322, "grad_norm": 0.0006877694395370781, "learning_rate": 2.4014336917562724e-06, "loss": 0.0, "step": 4860 }, { "epoch": 77.00306451612903, "grad_norm": 0.001662616734392941, "learning_rate": 2.3835125448028677e-06, "loss": 0.0, "step": 4870 }, { "epoch": 77.00467741935483, "grad_norm": 0.0004580924578476697, "learning_rate": 2.3655913978494625e-06, "loss": 0.0, "step": 4880 }, { "epoch": 77.00629032258064, "grad_norm": 0.0003815030213445425, "learning_rate": 2.3476702508960574e-06, "loss": 0.0, "step": 4890 }, { "epoch": 77.00790322580646, "grad_norm": 0.0008154985844157636, "learning_rate": 2.3297491039426526e-06, "loss": 0.0, "step": 4900 }, { "epoch": 77.00951612903226, "grad_norm": 0.0010767716448754072, "learning_rate": 2.3118279569892475e-06, "loss": 0.0, "step": 4910 }, { "epoch": 77.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.999199867248535, "eval_runtime": 5.7977, "eval_samples_per_second": 5.347, "eval_steps_per_second": 1.38, "step": 4914 }, { "epoch": 78.00096774193548, "grad_norm": 0.001373056205920875, "learning_rate": 2.2939068100358423e-06, "loss": 0.0, "step": 4920 }, { "epoch": 78.00258064516129, "grad_norm": 0.0010064683156087995, "learning_rate": 2.2759856630824376e-06, "loss": 0.0, "step": 4930 }, { "epoch": 78.0041935483871, "grad_norm": 0.0004777978465426713, "learning_rate": 2.2580645161290324e-06, "loss": 0.0, "step": 4940 }, { "epoch": 78.0058064516129, "grad_norm": 0.0016679827822372317, "learning_rate": 2.2401433691756277e-06, "loss": 0.0, "step": 4950 }, { "epoch": 78.0074193548387, "grad_norm": 0.0015965336933732033, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "step": 4960 }, { "epoch": 78.00903225806452, "grad_norm": 0.010267838835716248, "learning_rate": 2.2043010752688173e-06, "loss": 0.0, "step": 4970 }, { "epoch": 78.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.9721951484680176, "eval_runtime": 6.0122, "eval_samples_per_second": 5.156, "eval_steps_per_second": 1.331, "step": 4977 }, { "epoch": 79.00048387096774, "grad_norm": 0.0005326010286808014, "learning_rate": 2.1863799283154126e-06, "loss": 0.0, "step": 4980 }, { "epoch": 79.00209677419355, "grad_norm": 0.0009924044134095311, "learning_rate": 2.1684587813620074e-06, "loss": 0.0, "step": 4990 }, { "epoch": 79.00370967741935, "grad_norm": 0.0009795678779482841, "learning_rate": 2.1505376344086023e-06, "loss": 0.0, "step": 5000 }, { "epoch": 79.00532258064516, "grad_norm": 0.002512082690373063, "learning_rate": 2.132616487455197e-06, "loss": 0.0, "step": 5010 }, { "epoch": 79.00693548387096, "grad_norm": 0.0010595355415716767, "learning_rate": 2.1146953405017924e-06, "loss": 0.0, "step": 5020 }, { "epoch": 79.00854838709678, "grad_norm": 0.002470111707225442, "learning_rate": 2.096774193548387e-06, "loss": 0.0, "step": 5030 }, { "epoch": 79.01016129032259, "grad_norm": 0.0008115039090625942, "learning_rate": 2.078853046594982e-06, "loss": 0.0, "step": 5040 }, { "epoch": 79.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.9420762062072754, "eval_runtime": 10.1554, "eval_samples_per_second": 3.053, "eval_steps_per_second": 0.788, "step": 5040 }, { "epoch": 80.0016129032258, "grad_norm": 0.0005455981008708477, "learning_rate": 2.0609318996415773e-06, "loss": 0.0, "step": 5050 }, { "epoch": 80.00322580645161, "grad_norm": 0.001032925909385085, "learning_rate": 2.043010752688172e-06, "loss": 0.0, "step": 5060 }, { "epoch": 80.00483870967741, "grad_norm": 0.0005147884367033839, "learning_rate": 2.025089605734767e-06, "loss": 0.0, "step": 5070 }, { "epoch": 80.00645161290322, "grad_norm": 0.0018231339054182172, "learning_rate": 2.0071684587813622e-06, "loss": 0.0, "step": 5080 }, { "epoch": 80.00806451612904, "grad_norm": 0.0002894848585128784, "learning_rate": 1.989247311827957e-06, "loss": 0.0, "step": 5090 }, { "epoch": 80.00967741935484, "grad_norm": 0.0009533461998216808, "learning_rate": 1.9713261648745523e-06, "loss": 0.2333, "step": 5100 }, { "epoch": 80.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.081673622131348, "eval_runtime": 5.7037, "eval_samples_per_second": 5.435, "eval_steps_per_second": 1.403, "step": 5103 }, { "epoch": 81.00112903225806, "grad_norm": 0.002396800322458148, "learning_rate": 1.953405017921147e-06, "loss": 0.0, "step": 5110 }, { "epoch": 81.00274193548387, "grad_norm": 0.0007857176824472845, "learning_rate": 1.935483870967742e-06, "loss": 0.0, "step": 5120 }, { "epoch": 81.00435483870967, "grad_norm": 0.0017762325005605817, "learning_rate": 1.9175627240143373e-06, "loss": 0.0, "step": 5130 }, { "epoch": 81.00596774193548, "grad_norm": 0.003709646640345454, "learning_rate": 1.8996415770609319e-06, "loss": 0.0021, "step": 5140 }, { "epoch": 81.00758064516128, "grad_norm": 0.0006074115517549217, "learning_rate": 1.881720430107527e-06, "loss": 0.0, "step": 5150 }, { "epoch": 81.0091935483871, "grad_norm": 0.006386525928974152, "learning_rate": 1.8637992831541222e-06, "loss": 0.0, "step": 5160 }, { "epoch": 81.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.6668758392333984, "eval_runtime": 5.7413, "eval_samples_per_second": 5.399, "eval_steps_per_second": 1.393, "step": 5166 }, { "epoch": 82.00064516129032, "grad_norm": 0.0005079368711449206, "learning_rate": 1.845878136200717e-06, "loss": 0.0, "step": 5170 }, { "epoch": 82.00225806451613, "grad_norm": 0.0006065468769520521, "learning_rate": 1.827956989247312e-06, "loss": 0.0, "step": 5180 }, { "epoch": 82.00387096774193, "grad_norm": 0.0012391282944008708, "learning_rate": 1.810035842293907e-06, "loss": 0.0, "step": 5190 }, { "epoch": 82.00548387096774, "grad_norm": 0.00035129269235767424, "learning_rate": 1.792114695340502e-06, "loss": 0.0, "step": 5200 }, { "epoch": 82.00709677419354, "grad_norm": 0.0007883913349360228, "learning_rate": 1.774193548387097e-06, "loss": 0.0, "step": 5210 }, { "epoch": 82.00870967741936, "grad_norm": 0.0006735958158969879, "learning_rate": 1.7562724014336918e-06, "loss": 0.0, "step": 5220 }, { "epoch": 82.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.6606853008270264, "eval_runtime": 5.835, "eval_samples_per_second": 5.313, "eval_steps_per_second": 1.371, "step": 5229 }, { "epoch": 83.00016129032258, "grad_norm": 0.0007135454216040671, "learning_rate": 1.7383512544802869e-06, "loss": 0.0, "step": 5230 }, { "epoch": 83.00177419354839, "grad_norm": 0.0005949111073277891, "learning_rate": 1.720430107526882e-06, "loss": 0.0, "step": 5240 }, { "epoch": 83.00338709677419, "grad_norm": 0.0006373505457304418, "learning_rate": 1.7025089605734768e-06, "loss": 0.0, "step": 5250 }, { "epoch": 83.005, "grad_norm": 0.0007057363400235772, "learning_rate": 1.6845878136200718e-06, "loss": 0.0, "step": 5260 }, { "epoch": 83.0066129032258, "grad_norm": 0.0005377751658670604, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "step": 5270 }, { "epoch": 83.00822580645162, "grad_norm": 0.0014775992603972554, "learning_rate": 1.6487455197132617e-06, "loss": 0.0, "step": 5280 }, { "epoch": 83.00983870967742, "grad_norm": 0.0009177481988444924, "learning_rate": 1.630824372759857e-06, "loss": 0.0, "step": 5290 }, { "epoch": 83.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.6873209476470947, "eval_runtime": 5.641, "eval_samples_per_second": 5.495, "eval_steps_per_second": 1.418, "step": 5292 }, { "epoch": 84.00129032258064, "grad_norm": 0.0010123479878529906, "learning_rate": 1.6129032258064516e-06, "loss": 0.0, "step": 5300 }, { "epoch": 84.00290322580645, "grad_norm": 0.0005889268941245973, "learning_rate": 1.5949820788530469e-06, "loss": 0.0004, "step": 5310 }, { "epoch": 84.00451612903225, "grad_norm": 0.00028648244915530086, "learning_rate": 1.5770609318996417e-06, "loss": 0.0, "step": 5320 }, { "epoch": 84.00612903225806, "grad_norm": 0.0007187535520642996, "learning_rate": 1.5591397849462367e-06, "loss": 0.0, "step": 5330 }, { "epoch": 84.00774193548386, "grad_norm": 0.0006879689171910286, "learning_rate": 1.5412186379928318e-06, "loss": 0.0, "step": 5340 }, { "epoch": 84.00935483870968, "grad_norm": 0.0007166188443079591, "learning_rate": 1.5232974910394266e-06, "loss": 0.0, "step": 5350 }, { "epoch": 84.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.597898483276367, "eval_runtime": 6.3321, "eval_samples_per_second": 4.896, "eval_steps_per_second": 1.263, "step": 5355 }, { "epoch": 85.0008064516129, "grad_norm": 0.0009514502016827464, "learning_rate": 1.5053763440860217e-06, "loss": 0.0, "step": 5360 }, { "epoch": 85.00241935483871, "grad_norm": 0.0005126717733219266, "learning_rate": 1.4874551971326165e-06, "loss": 0.0, "step": 5370 }, { "epoch": 85.00403225806451, "grad_norm": 0.001519693061709404, "learning_rate": 1.4695340501792116e-06, "loss": 0.0774, "step": 5380 }, { "epoch": 85.00564516129032, "grad_norm": 0.0012091078097000718, "learning_rate": 1.4516129032258066e-06, "loss": 0.0, "step": 5390 }, { "epoch": 85.00725806451612, "grad_norm": 0.00032768482924439013, "learning_rate": 1.4336917562724014e-06, "loss": 0.0, "step": 5400 }, { "epoch": 85.00887096774194, "grad_norm": 0.0005739382468163967, "learning_rate": 1.4157706093189965e-06, "loss": 0.0, "step": 5410 }, { "epoch": 85.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 3.888089895248413, "eval_runtime": 5.6981, "eval_samples_per_second": 5.44, "eval_steps_per_second": 1.404, "step": 5418 }, { "epoch": 86.00032258064516, "grad_norm": 0.00140287633985281, "learning_rate": 1.3978494623655913e-06, "loss": 0.0, "step": 5420 }, { "epoch": 86.00193548387097, "grad_norm": 0.000553338264580816, "learning_rate": 1.3799283154121864e-06, "loss": 0.0008, "step": 5430 }, { "epoch": 86.00354838709677, "grad_norm": 0.0009555838769301772, "learning_rate": 1.3620071684587816e-06, "loss": 0.0, "step": 5440 }, { "epoch": 86.00516129032258, "grad_norm": 0.0007269910420291126, "learning_rate": 1.3440860215053765e-06, "loss": 0.0, "step": 5450 }, { "epoch": 86.00677419354838, "grad_norm": 0.0008325223461724818, "learning_rate": 1.3261648745519715e-06, "loss": 0.0, "step": 5460 }, { "epoch": 86.0083870967742, "grad_norm": 0.000698271906003356, "learning_rate": 1.3082437275985666e-06, "loss": 0.0, "step": 5470 }, { "epoch": 86.01, "grad_norm": 0.0004638571117538959, "learning_rate": 1.2903225806451614e-06, "loss": 0.0, "step": 5480 }, { "epoch": 86.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.601414203643799, "eval_runtime": 150.1876, "eval_samples_per_second": 0.206, "eval_steps_per_second": 0.053, "step": 5481 }, { "epoch": 87.00145161290322, "grad_norm": 0.0014305762015283108, "learning_rate": 1.2724014336917565e-06, "loss": 0.0, "step": 5490 }, { "epoch": 87.00306451612903, "grad_norm": 0.0010401890613138676, "learning_rate": 1.2544802867383513e-06, "loss": 0.0, "step": 5500 }, { "epoch": 87.00467741935483, "grad_norm": 0.0006740922690369189, "learning_rate": 1.2365591397849463e-06, "loss": 0.0038, "step": 5510 }, { "epoch": 87.00629032258064, "grad_norm": 0.00045293488074094057, "learning_rate": 1.2186379928315414e-06, "loss": 0.0, "step": 5520 }, { "epoch": 87.00790322580646, "grad_norm": 0.0005945725715719163, "learning_rate": 1.2007168458781362e-06, "loss": 0.0, "step": 5530 }, { "epoch": 87.00951612903226, "grad_norm": 0.007623703684657812, "learning_rate": 1.1827956989247313e-06, "loss": 0.0, "step": 5540 }, { "epoch": 87.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.7987687587738037, "eval_runtime": 6.0769, "eval_samples_per_second": 5.101, "eval_steps_per_second": 1.316, "step": 5544 }, { "epoch": 88.00096774193548, "grad_norm": 0.0004915293538942933, "learning_rate": 1.1648745519713263e-06, "loss": 0.0, "step": 5550 }, { "epoch": 88.00258064516129, "grad_norm": 0.00038745274650864303, "learning_rate": 1.1469534050179212e-06, "loss": 0.0, "step": 5560 }, { "epoch": 88.0041935483871, "grad_norm": 0.0006638484774157405, "learning_rate": 1.1290322580645162e-06, "loss": 0.0, "step": 5570 }, { "epoch": 88.0058064516129, "grad_norm": 0.0004941201186738908, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "step": 5580 }, { "epoch": 88.0074193548387, "grad_norm": 0.0006597687606699765, "learning_rate": 1.0931899641577063e-06, "loss": 0.0, "step": 5590 }, { "epoch": 88.00903225806452, "grad_norm": 0.000671563611831516, "learning_rate": 1.0752688172043011e-06, "loss": 0.0, "step": 5600 }, { "epoch": 88.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.80474591255188, "eval_runtime": 9.3847, "eval_samples_per_second": 3.303, "eval_steps_per_second": 0.852, "step": 5607 }, { "epoch": 89.00048387096774, "grad_norm": 0.0013941754586994648, "learning_rate": 1.0573476702508962e-06, "loss": 0.0, "step": 5610 }, { "epoch": 89.00209677419355, "grad_norm": 0.0004829160461667925, "learning_rate": 1.039426523297491e-06, "loss": 0.0, "step": 5620 }, { "epoch": 89.00370967741935, "grad_norm": 0.0008248007507063448, "learning_rate": 1.021505376344086e-06, "loss": 0.0, "step": 5630 }, { "epoch": 89.00532258064516, "grad_norm": 0.0014396782498806715, "learning_rate": 1.0035842293906811e-06, "loss": 0.0, "step": 5640 }, { "epoch": 89.00693548387096, "grad_norm": 0.0003691385209094733, "learning_rate": 9.856630824372762e-07, "loss": 0.0, "step": 5650 }, { "epoch": 89.00854838709678, "grad_norm": 0.0005942166317254305, "learning_rate": 9.67741935483871e-07, "loss": 0.0, "step": 5660 }, { "epoch": 89.01016129032259, "grad_norm": 0.0009872328955680132, "learning_rate": 9.498207885304659e-07, "loss": 0.0, "step": 5670 }, { "epoch": 89.01016129032259, "eval_accuracy": 0.6129032258064516, "eval_loss": 3.810636281967163, "eval_runtime": 10.9006, "eval_samples_per_second": 2.844, "eval_steps_per_second": 0.734, "step": 5670 }, { "epoch": 90.0016129032258, "grad_norm": 0.001488726120442152, "learning_rate": 9.318996415770611e-07, "loss": 0.0, "step": 5680 }, { "epoch": 90.00322580645161, "grad_norm": 0.00045306392712518573, "learning_rate": 9.13978494623656e-07, "loss": 0.0, "step": 5690 }, { "epoch": 90.00483870967741, "grad_norm": 0.00041228829650208354, "learning_rate": 8.96057347670251e-07, "loss": 0.0002, "step": 5700 }, { "epoch": 90.00645161290322, "grad_norm": 0.0005429069860838354, "learning_rate": 8.781362007168459e-07, "loss": 0.0, "step": 5710 }, { "epoch": 90.00806451612904, "grad_norm": 0.0010314955143257976, "learning_rate": 8.60215053763441e-07, "loss": 0.0, "step": 5720 }, { "epoch": 90.00967741935484, "grad_norm": 0.0010697644902393222, "learning_rate": 8.422939068100359e-07, "loss": 0.0001, "step": 5730 }, { "epoch": 90.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 4.146834850311279, "eval_runtime": 8.3835, "eval_samples_per_second": 3.698, "eval_steps_per_second": 0.954, "step": 5733 }, { "epoch": 91.00112903225806, "grad_norm": 0.0048898071981966496, "learning_rate": 8.243727598566309e-07, "loss": 0.0, "step": 5740 }, { "epoch": 91.00274193548387, "grad_norm": 0.0018456532852724195, "learning_rate": 8.064516129032258e-07, "loss": 0.0, "step": 5750 }, { "epoch": 91.00435483870967, "grad_norm": 0.0004042400687467307, "learning_rate": 7.885304659498208e-07, "loss": 0.0001, "step": 5760 }, { "epoch": 91.00596774193548, "grad_norm": 0.00040011032251641154, "learning_rate": 7.706093189964159e-07, "loss": 0.0, "step": 5770 }, { "epoch": 91.00758064516128, "grad_norm": 0.0003558983444236219, "learning_rate": 7.526881720430108e-07, "loss": 0.0, "step": 5780 }, { "epoch": 91.0091935483871, "grad_norm": 0.0005216757417656481, "learning_rate": 7.347670250896058e-07, "loss": 0.0, "step": 5790 }, { "epoch": 91.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.369957447052002, "eval_runtime": 9.2447, "eval_samples_per_second": 3.353, "eval_steps_per_second": 0.865, "step": 5796 }, { "epoch": 92.00064516129032, "grad_norm": 0.000797704269643873, "learning_rate": 7.168458781362007e-07, "loss": 0.0, "step": 5800 }, { "epoch": 92.00225806451613, "grad_norm": 0.00026240726583637297, "learning_rate": 6.989247311827957e-07, "loss": 0.0, "step": 5810 }, { "epoch": 92.00387096774193, "grad_norm": 0.0008079367689788342, "learning_rate": 6.810035842293908e-07, "loss": 0.0, "step": 5820 }, { "epoch": 92.00548387096774, "grad_norm": 0.0007815553108230233, "learning_rate": 6.630824372759858e-07, "loss": 0.0, "step": 5830 }, { "epoch": 92.00709677419354, "grad_norm": 0.0003803852014243603, "learning_rate": 6.451612903225807e-07, "loss": 0.0, "step": 5840 }, { "epoch": 92.00870967741936, "grad_norm": 0.0003677853674162179, "learning_rate": 6.272401433691756e-07, "loss": 0.0, "step": 5850 }, { "epoch": 92.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.239170074462891, "eval_runtime": 9.5569, "eval_samples_per_second": 3.244, "eval_steps_per_second": 0.837, "step": 5859 }, { "epoch": 93.00016129032258, "grad_norm": 0.0006770737236365676, "learning_rate": 6.093189964157707e-07, "loss": 0.0, "step": 5860 }, { "epoch": 93.00177419354839, "grad_norm": 0.0004486216639634222, "learning_rate": 5.913978494623656e-07, "loss": 0.0, "step": 5870 }, { "epoch": 93.00338709677419, "grad_norm": 0.0007312916568480432, "learning_rate": 5.734767025089606e-07, "loss": 0.0, "step": 5880 }, { "epoch": 93.005, "grad_norm": 0.0003597465401981026, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "step": 5890 }, { "epoch": 93.0066129032258, "grad_norm": 0.0010129413567483425, "learning_rate": 5.376344086021506e-07, "loss": 0.0, "step": 5900 }, { "epoch": 93.00822580645162, "grad_norm": 0.0002682928752619773, "learning_rate": 5.197132616487455e-07, "loss": 0.0, "step": 5910 }, { "epoch": 93.00983870967742, "grad_norm": 0.0004301499866414815, "learning_rate": 5.017921146953406e-07, "loss": 0.0, "step": 5920 }, { "epoch": 93.01016129032259, "eval_accuracy": 0.5483870967741935, "eval_loss": 4.208507537841797, "eval_runtime": 9.3789, "eval_samples_per_second": 3.305, "eval_steps_per_second": 0.853, "step": 5922 }, { "epoch": 94.00129032258064, "grad_norm": 0.0005511997151188552, "learning_rate": 4.838709677419355e-07, "loss": 0.0, "step": 5930 }, { "epoch": 94.00290322580645, "grad_norm": 0.000891170697286725, "learning_rate": 4.6594982078853055e-07, "loss": 0.0, "step": 5940 }, { "epoch": 94.00451612903225, "grad_norm": 0.0010540627408772707, "learning_rate": 4.480286738351255e-07, "loss": 0.0, "step": 5950 }, { "epoch": 94.00612903225806, "grad_norm": 0.0005532228387892246, "learning_rate": 4.301075268817205e-07, "loss": 0.0, "step": 5960 }, { "epoch": 94.00774193548386, "grad_norm": 0.005676784086972475, "learning_rate": 4.1218637992831543e-07, "loss": 0.0, "step": 5970 }, { "epoch": 94.00935483870968, "grad_norm": 0.001065823482349515, "learning_rate": 3.942652329749104e-07, "loss": 0.0, "step": 5980 }, { "epoch": 94.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 4.201733112335205, "eval_runtime": 9.0814, "eval_samples_per_second": 3.414, "eval_steps_per_second": 0.881, "step": 5985 }, { "epoch": 95.0008064516129, "grad_norm": 0.0003691113379318267, "learning_rate": 3.763440860215054e-07, "loss": 0.0, "step": 5990 }, { "epoch": 95.00241935483871, "grad_norm": 0.0005401527741923928, "learning_rate": 3.5842293906810036e-07, "loss": 0.0, "step": 6000 }, { "epoch": 95.00403225806451, "grad_norm": 0.00024014603695832193, "learning_rate": 3.405017921146954e-07, "loss": 0.0, "step": 6010 }, { "epoch": 95.00564516129032, "grad_norm": 0.000683434889651835, "learning_rate": 3.2258064516129035e-07, "loss": 0.0, "step": 6020 }, { "epoch": 95.00725806451612, "grad_norm": 0.0005779504426755011, "learning_rate": 3.0465949820788535e-07, "loss": 0.0, "step": 6030 }, { "epoch": 95.00887096774194, "grad_norm": 0.0002458437520544976, "learning_rate": 2.867383512544803e-07, "loss": 0.0, "step": 6040 }, { "epoch": 95.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 4.197226047515869, "eval_runtime": 7.4744, "eval_samples_per_second": 4.147, "eval_steps_per_second": 1.07, "step": 6048 }, { "epoch": 96.00032258064516, "grad_norm": 0.0005491464398801327, "learning_rate": 2.688172043010753e-07, "loss": 0.0, "step": 6050 }, { "epoch": 96.00193548387097, "grad_norm": 0.00044942478416487575, "learning_rate": 2.508960573476703e-07, "loss": 0.0, "step": 6060 }, { "epoch": 96.00354838709677, "grad_norm": 0.0016082311049103737, "learning_rate": 2.3297491039426527e-07, "loss": 0.0, "step": 6070 }, { "epoch": 96.00516129032258, "grad_norm": 0.0021796412765979767, "learning_rate": 2.1505376344086024e-07, "loss": 0.0, "step": 6080 }, { "epoch": 96.00677419354838, "grad_norm": 0.000279360159765929, "learning_rate": 1.971326164874552e-07, "loss": 0.0, "step": 6090 }, { "epoch": 96.0083870967742, "grad_norm": 0.0006110651884227991, "learning_rate": 1.7921146953405018e-07, "loss": 0.0, "step": 6100 }, { "epoch": 96.01, "grad_norm": 0.0003387313918210566, "learning_rate": 1.6129032258064518e-07, "loss": 0.0, "step": 6110 }, { "epoch": 96.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 4.192503452301025, "eval_runtime": 5.9516, "eval_samples_per_second": 5.209, "eval_steps_per_second": 1.344, "step": 6111 }, { "epoch": 97.00145161290322, "grad_norm": 0.0006941120373085141, "learning_rate": 1.4336917562724014e-07, "loss": 0.0, "step": 6120 }, { "epoch": 97.00306451612903, "grad_norm": 0.0009020269499160349, "learning_rate": 1.2544802867383514e-07, "loss": 0.0, "step": 6130 }, { "epoch": 97.00467741935483, "grad_norm": 0.0007686845492571592, "learning_rate": 1.0752688172043012e-07, "loss": 0.0, "step": 6140 }, { "epoch": 97.00629032258064, "grad_norm": 0.00033951245131902397, "learning_rate": 8.960573476702509e-08, "loss": 0.0, "step": 6150 }, { "epoch": 97.00790322580646, "grad_norm": 0.0009793393546715379, "learning_rate": 7.168458781362007e-08, "loss": 0.0, "step": 6160 }, { "epoch": 97.00951612903226, "grad_norm": 0.008884147740900517, "learning_rate": 5.376344086021506e-08, "loss": 0.0, "step": 6170 }, { "epoch": 97.01016129032259, "eval_accuracy": 0.5806451612903226, "eval_loss": 4.1910810470581055, "eval_runtime": 8.2457, "eval_samples_per_second": 3.76, "eval_steps_per_second": 0.97, "step": 6174 }, { "epoch": 98.00096774193548, "grad_norm": 0.0006854013190604746, "learning_rate": 3.5842293906810036e-08, "loss": 0.0, "step": 6180 }, { "epoch": 98.00258064516129, "grad_norm": 0.001278868643566966, "learning_rate": 1.7921146953405018e-08, "loss": 0.0, "step": 6190 }, { "epoch": 98.0041935483871, "grad_norm": 0.0004975146730430424, "learning_rate": 0.0, "loss": 0.0, "step": 6200 }, { "epoch": 98.0041935483871, "eval_accuracy": 0.5806451612903226, "eval_loss": 4.191013336181641, "eval_runtime": 9.3098, "eval_samples_per_second": 3.33, "eval_steps_per_second": 0.859, "step": 6200 }, { "epoch": 98.0041935483871, "step": 6200, "total_flos": 1.0846713204046076e+20, "train_loss": 0.10874109933826877, "train_runtime": 11690.8518, "train_samples_per_second": 2.121, "train_steps_per_second": 0.53 }, { "epoch": 98.0041935483871, "eval_accuracy": 0.726027397260274, "eval_loss": 0.5596539378166199, "eval_runtime": 19.8013, "eval_samples_per_second": 3.687, "eval_steps_per_second": 0.96, "step": 6200 }, { "epoch": 98.0041935483871, "eval_accuracy": 0.726027397260274, "eval_loss": 0.5596538186073303, "eval_runtime": 18.83, "eval_samples_per_second": 3.877, "eval_steps_per_second": 1.009, "step": 6200 } ], "logging_steps": 10, "max_steps": 6200, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0846713204046076e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }