{ "best_metric": 0.5304816365242004, "best_model_checkpoint": "./vit-base-beans/checkpoint-1600", "epoch": 4.0, "eval_steps": 100, "global_step": 1736, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02304147465437788, "grad_norm": 2.396202564239502, "learning_rate": 0.00019884792626728113, "loss": 1.8485, "step": 10 }, { "epoch": 0.04608294930875576, "grad_norm": 1.289166808128357, "learning_rate": 0.00019769585253456222, "loss": 1.5911, "step": 20 }, { "epoch": 0.06912442396313365, "grad_norm": 2.512033462524414, "learning_rate": 0.00019654377880184333, "loss": 1.4806, "step": 30 }, { "epoch": 0.09216589861751152, "grad_norm": 2.6234657764434814, "learning_rate": 0.00019539170506912442, "loss": 1.3684, "step": 40 }, { "epoch": 0.1152073732718894, "grad_norm": 2.335149049758911, "learning_rate": 0.00019423963133640554, "loss": 1.4012, "step": 50 }, { "epoch": 0.1382488479262673, "grad_norm": 3.386568546295166, "learning_rate": 0.00019308755760368663, "loss": 1.2248, "step": 60 }, { "epoch": 0.16129032258064516, "grad_norm": 1.9273797273635864, "learning_rate": 0.00019193548387096775, "loss": 1.144, "step": 70 }, { "epoch": 0.18433179723502305, "grad_norm": 2.2117414474487305, "learning_rate": 0.00019078341013824886, "loss": 1.0101, "step": 80 }, { "epoch": 0.2073732718894009, "grad_norm": 3.1132171154022217, "learning_rate": 0.00018963133640552998, "loss": 1.1411, "step": 90 }, { "epoch": 0.2304147465437788, "grad_norm": 3.0585570335388184, "learning_rate": 0.00018847926267281107, "loss": 1.0791, "step": 100 }, { "epoch": 0.2304147465437788, "eval_accuracy": 0.6335113484646195, "eval_loss": 1.0347875356674194, "eval_runtime": 11.9052, "eval_samples_per_second": 125.828, "eval_steps_per_second": 15.791, "step": 100 }, { "epoch": 0.2534562211981567, "grad_norm": 2.400747299194336, "learning_rate": 0.00018732718894009219, "loss": 1.04, "step": 110 }, { "epoch": 0.2764976958525346, "grad_norm": 2.432607412338257, "learning_rate": 0.00018617511520737328, "loss": 1.0396, "step": 120 }, { "epoch": 0.2995391705069124, "grad_norm": 2.5169568061828613, "learning_rate": 0.0001850230414746544, "loss": 0.9925, "step": 130 }, { "epoch": 0.3225806451612903, "grad_norm": 2.450554847717285, "learning_rate": 0.00018387096774193548, "loss": 1.0361, "step": 140 }, { "epoch": 0.3456221198156682, "grad_norm": 1.5931885242462158, "learning_rate": 0.0001827188940092166, "loss": 0.9851, "step": 150 }, { "epoch": 0.3686635944700461, "grad_norm": 1.8019052743911743, "learning_rate": 0.0001815668202764977, "loss": 0.8847, "step": 160 }, { "epoch": 0.391705069124424, "grad_norm": 2.283034086227417, "learning_rate": 0.0001804147465437788, "loss": 0.8507, "step": 170 }, { "epoch": 0.4147465437788018, "grad_norm": 2.5878796577453613, "learning_rate": 0.0001792626728110599, "loss": 0.9579, "step": 180 }, { "epoch": 0.4377880184331797, "grad_norm": 3.469618558883667, "learning_rate": 0.000178110599078341, "loss": 0.9453, "step": 190 }, { "epoch": 0.4608294930875576, "grad_norm": 1.9743025302886963, "learning_rate": 0.00017695852534562213, "loss": 0.9415, "step": 200 }, { "epoch": 0.4608294930875576, "eval_accuracy": 0.6448598130841121, "eval_loss": 0.9576324820518494, "eval_runtime": 11.862, "eval_samples_per_second": 126.285, "eval_steps_per_second": 15.849, "step": 200 }, { "epoch": 0.4838709677419355, "grad_norm": 3.031723976135254, "learning_rate": 0.00017580645161290325, "loss": 0.7819, "step": 210 }, { "epoch": 0.5069124423963134, "grad_norm": 2.2470805644989014, "learning_rate": 0.00017465437788018436, "loss": 0.8163, "step": 220 }, { "epoch": 0.5299539170506913, "grad_norm": 1.723471760749817, "learning_rate": 0.00017350230414746545, "loss": 0.6728, "step": 230 }, { "epoch": 0.5529953917050692, "grad_norm": 3.93212628364563, "learning_rate": 0.00017235023041474657, "loss": 0.684, "step": 240 }, { "epoch": 0.576036866359447, "grad_norm": 1.4867981672286987, "learning_rate": 0.00017119815668202766, "loss": 0.8527, "step": 250 }, { "epoch": 0.5990783410138248, "grad_norm": 2.4340641498565674, "learning_rate": 0.00017004608294930878, "loss": 1.0102, "step": 260 }, { "epoch": 0.6221198156682027, "grad_norm": 2.8441660404205322, "learning_rate": 0.00016889400921658987, "loss": 0.7739, "step": 270 }, { "epoch": 0.6451612903225806, "grad_norm": 1.6598294973373413, "learning_rate": 0.00016774193548387098, "loss": 0.7442, "step": 280 }, { "epoch": 0.6682027649769585, "grad_norm": 3.455202102661133, "learning_rate": 0.00016658986175115207, "loss": 0.7643, "step": 290 }, { "epoch": 0.6912442396313364, "grad_norm": 2.480116367340088, "learning_rate": 0.0001654377880184332, "loss": 0.7839, "step": 300 }, { "epoch": 0.6912442396313364, "eval_accuracy": 0.6662216288384513, "eval_loss": 0.89629727602005, "eval_runtime": 11.7103, "eval_samples_per_second": 127.921, "eval_steps_per_second": 16.054, "step": 300 }, { "epoch": 0.7142857142857143, "grad_norm": 3.3055620193481445, "learning_rate": 0.00016428571428571428, "loss": 0.639, "step": 310 }, { "epoch": 0.7373271889400922, "grad_norm": 1.8542070388793945, "learning_rate": 0.0001631336405529954, "loss": 0.8931, "step": 320 }, { "epoch": 0.7603686635944701, "grad_norm": 1.6089766025543213, "learning_rate": 0.00016198156682027649, "loss": 0.9023, "step": 330 }, { "epoch": 0.783410138248848, "grad_norm": 1.5780836343765259, "learning_rate": 0.0001608294930875576, "loss": 0.7285, "step": 340 }, { "epoch": 0.8064516129032258, "grad_norm": 3.153092384338379, "learning_rate": 0.00015967741935483872, "loss": 0.8702, "step": 350 }, { "epoch": 0.8294930875576036, "grad_norm": 2.3161656856536865, "learning_rate": 0.00015852534562211984, "loss": 0.7343, "step": 360 }, { "epoch": 0.8525345622119815, "grad_norm": 1.7923251390457153, "learning_rate": 0.00015737327188940093, "loss": 0.7986, "step": 370 }, { "epoch": 0.8755760368663594, "grad_norm": 2.7093405723571777, "learning_rate": 0.00015622119815668204, "loss": 0.6377, "step": 380 }, { "epoch": 0.8986175115207373, "grad_norm": 4.7555251121521, "learning_rate": 0.00015506912442396313, "loss": 0.8223, "step": 390 }, { "epoch": 0.9216589861751152, "grad_norm": 2.78916072845459, "learning_rate": 0.00015391705069124425, "loss": 0.7181, "step": 400 }, { "epoch": 0.9216589861751152, "eval_accuracy": 0.6962616822429907, "eval_loss": 0.8479276299476624, "eval_runtime": 11.6609, "eval_samples_per_second": 128.464, "eval_steps_per_second": 16.122, "step": 400 }, { "epoch": 0.9447004608294931, "grad_norm": 2.4783871173858643, "learning_rate": 0.00015276497695852537, "loss": 0.7422, "step": 410 }, { "epoch": 0.967741935483871, "grad_norm": 2.8775382041931152, "learning_rate": 0.00015161290322580646, "loss": 0.6255, "step": 420 }, { "epoch": 0.9907834101382489, "grad_norm": 2.3851194381713867, "learning_rate": 0.00015046082949308757, "loss": 0.7266, "step": 430 }, { "epoch": 1.0138248847926268, "grad_norm": 5.285385608673096, "learning_rate": 0.00014930875576036866, "loss": 0.6283, "step": 440 }, { "epoch": 1.0368663594470047, "grad_norm": 1.691789984703064, "learning_rate": 0.00014815668202764978, "loss": 0.4918, "step": 450 }, { "epoch": 1.0599078341013826, "grad_norm": 2.8921382427215576, "learning_rate": 0.00014700460829493087, "loss": 0.5787, "step": 460 }, { "epoch": 1.0829493087557605, "grad_norm": 3.1509757041931152, "learning_rate": 0.00014585253456221199, "loss": 0.4906, "step": 470 }, { "epoch": 1.1059907834101383, "grad_norm": 3.2979822158813477, "learning_rate": 0.0001447004608294931, "loss": 0.5715, "step": 480 }, { "epoch": 1.129032258064516, "grad_norm": 3.3389899730682373, "learning_rate": 0.00014354838709677422, "loss": 0.5411, "step": 490 }, { "epoch": 1.1520737327188941, "grad_norm": 0.9589664936065674, "learning_rate": 0.0001423963133640553, "loss": 0.3995, "step": 500 }, { "epoch": 1.1520737327188941, "eval_accuracy": 0.7169559412550067, "eval_loss": 0.7820530533790588, "eval_runtime": 11.5056, "eval_samples_per_second": 130.197, "eval_steps_per_second": 16.34, "step": 500 }, { "epoch": 1.1751152073732718, "grad_norm": 2.248042106628418, "learning_rate": 0.00014124423963133643, "loss": 0.5057, "step": 510 }, { "epoch": 1.1981566820276497, "grad_norm": 3.944963216781616, "learning_rate": 0.00014009216589861752, "loss": 0.5005, "step": 520 }, { "epoch": 1.2211981566820276, "grad_norm": 2.7981412410736084, "learning_rate": 0.00013894009216589863, "loss": 0.6703, "step": 530 }, { "epoch": 1.2442396313364055, "grad_norm": 1.683069109916687, "learning_rate": 0.00013778801843317972, "loss": 0.5394, "step": 540 }, { "epoch": 1.2672811059907834, "grad_norm": 1.2122957706451416, "learning_rate": 0.00013663594470046084, "loss": 0.4775, "step": 550 }, { "epoch": 1.2903225806451613, "grad_norm": 1.4005225896835327, "learning_rate": 0.00013548387096774193, "loss": 0.4467, "step": 560 }, { "epoch": 1.3133640552995391, "grad_norm": 2.5969114303588867, "learning_rate": 0.00013433179723502305, "loss": 0.4289, "step": 570 }, { "epoch": 1.336405529953917, "grad_norm": 3.344553232192993, "learning_rate": 0.00013317972350230414, "loss": 0.4631, "step": 580 }, { "epoch": 1.359447004608295, "grad_norm": 1.6798585653305054, "learning_rate": 0.00013202764976958525, "loss": 0.4329, "step": 590 }, { "epoch": 1.3824884792626728, "grad_norm": 1.3849396705627441, "learning_rate": 0.00013087557603686637, "loss": 0.5025, "step": 600 }, { "epoch": 1.3824884792626728, "eval_accuracy": 0.7837116154873164, "eval_loss": 0.6299713253974915, "eval_runtime": 11.705, "eval_samples_per_second": 127.979, "eval_steps_per_second": 16.061, "step": 600 }, { "epoch": 1.4055299539170507, "grad_norm": 2.550548791885376, "learning_rate": 0.00012972350230414746, "loss": 0.4463, "step": 610 }, { "epoch": 1.4285714285714286, "grad_norm": 3.063411235809326, "learning_rate": 0.00012857142857142858, "loss": 0.3624, "step": 620 }, { "epoch": 1.4516129032258065, "grad_norm": 6.676961898803711, "learning_rate": 0.0001274193548387097, "loss": 0.4446, "step": 630 }, { "epoch": 1.4746543778801844, "grad_norm": 0.8720624446868896, "learning_rate": 0.0001262672811059908, "loss": 0.5162, "step": 640 }, { "epoch": 1.4976958525345623, "grad_norm": 2.214848041534424, "learning_rate": 0.0001251152073732719, "loss": 0.2978, "step": 650 }, { "epoch": 1.52073732718894, "grad_norm": 5.083272457122803, "learning_rate": 0.00012396313364055302, "loss": 0.5157, "step": 660 }, { "epoch": 1.543778801843318, "grad_norm": 4.042588710784912, "learning_rate": 0.0001228110599078341, "loss": 0.5338, "step": 670 }, { "epoch": 1.5668202764976957, "grad_norm": 3.1029160022735596, "learning_rate": 0.00012165898617511522, "loss": 0.4767, "step": 680 }, { "epoch": 1.5898617511520738, "grad_norm": 1.4430710077285767, "learning_rate": 0.00012050691244239631, "loss": 0.5531, "step": 690 }, { "epoch": 1.6129032258064515, "grad_norm": 11.178030967712402, "learning_rate": 0.00011935483870967743, "loss": 0.4985, "step": 700 }, { "epoch": 1.6129032258064515, "eval_accuracy": 0.7489986648865153, "eval_loss": 0.7058817744255066, "eval_runtime": 11.9139, "eval_samples_per_second": 125.736, "eval_steps_per_second": 15.78, "step": 700 }, { "epoch": 1.6359447004608296, "grad_norm": 3.918297529220581, "learning_rate": 0.00011820276497695852, "loss": 0.5471, "step": 710 }, { "epoch": 1.6589861751152073, "grad_norm": 2.7170467376708984, "learning_rate": 0.00011705069124423964, "loss": 0.4797, "step": 720 }, { "epoch": 1.6820276497695854, "grad_norm": 1.0436949729919434, "learning_rate": 0.00011589861751152074, "loss": 0.427, "step": 730 }, { "epoch": 1.705069124423963, "grad_norm": 3.6829638481140137, "learning_rate": 0.00011474654377880186, "loss": 0.5121, "step": 740 }, { "epoch": 1.728110599078341, "grad_norm": 1.8748345375061035, "learning_rate": 0.00011359447004608295, "loss": 0.4227, "step": 750 }, { "epoch": 1.7511520737327189, "grad_norm": 4.548758506774902, "learning_rate": 0.00011244239631336406, "loss": 0.3164, "step": 760 }, { "epoch": 1.7741935483870968, "grad_norm": 3.4847280979156494, "learning_rate": 0.00011129032258064515, "loss": 0.5092, "step": 770 }, { "epoch": 1.7972350230414746, "grad_norm": 1.8869714736938477, "learning_rate": 0.00011013824884792627, "loss": 0.4472, "step": 780 }, { "epoch": 1.8202764976958525, "grad_norm": 3.899409770965576, "learning_rate": 0.00010898617511520739, "loss": 0.4708, "step": 790 }, { "epoch": 1.8433179723502304, "grad_norm": 1.543060541152954, "learning_rate": 0.00010783410138248849, "loss": 0.4388, "step": 800 }, { "epoch": 1.8433179723502304, "eval_accuracy": 0.7857142857142857, "eval_loss": 0.5893343091011047, "eval_runtime": 11.4174, "eval_samples_per_second": 131.203, "eval_steps_per_second": 16.466, "step": 800 }, { "epoch": 1.8663594470046083, "grad_norm": 5.587724208831787, "learning_rate": 0.0001066820276497696, "loss": 0.4264, "step": 810 }, { "epoch": 1.8894009216589862, "grad_norm": 7.794037342071533, "learning_rate": 0.0001055299539170507, "loss": 0.4513, "step": 820 }, { "epoch": 1.912442396313364, "grad_norm": 3.597796678543091, "learning_rate": 0.00010437788018433181, "loss": 0.437, "step": 830 }, { "epoch": 1.935483870967742, "grad_norm": 2.825336217880249, "learning_rate": 0.0001032258064516129, "loss": 0.5202, "step": 840 }, { "epoch": 1.9585253456221197, "grad_norm": 1.8002281188964844, "learning_rate": 0.00010207373271889402, "loss": 0.3283, "step": 850 }, { "epoch": 1.9815668202764978, "grad_norm": 6.496976375579834, "learning_rate": 0.00010092165898617512, "loss": 0.2887, "step": 860 }, { "epoch": 2.0046082949308754, "grad_norm": 2.1674392223358154, "learning_rate": 9.976958525345623e-05, "loss": 0.3299, "step": 870 }, { "epoch": 2.0276497695852536, "grad_norm": 0.475057989358902, "learning_rate": 9.861751152073733e-05, "loss": 0.2049, "step": 880 }, { "epoch": 2.0506912442396312, "grad_norm": 2.232353687286377, "learning_rate": 9.746543778801845e-05, "loss": 0.2598, "step": 890 }, { "epoch": 2.0737327188940093, "grad_norm": 3.595874309539795, "learning_rate": 9.631336405529955e-05, "loss": 0.2389, "step": 900 }, { "epoch": 2.0737327188940093, "eval_accuracy": 0.807743658210948, "eval_loss": 0.5928804278373718, "eval_runtime": 11.7831, "eval_samples_per_second": 127.131, "eval_steps_per_second": 15.955, "step": 900 }, { "epoch": 2.096774193548387, "grad_norm": 2.4027860164642334, "learning_rate": 9.516129032258065e-05, "loss": 0.2023, "step": 910 }, { "epoch": 2.119815668202765, "grad_norm": 4.1582560539245605, "learning_rate": 9.400921658986176e-05, "loss": 0.2389, "step": 920 }, { "epoch": 2.142857142857143, "grad_norm": 3.8105199337005615, "learning_rate": 9.285714285714286e-05, "loss": 0.2054, "step": 930 }, { "epoch": 2.165898617511521, "grad_norm": 4.042884826660156, "learning_rate": 9.170506912442398e-05, "loss": 0.2445, "step": 940 }, { "epoch": 2.1889400921658986, "grad_norm": 3.3385071754455566, "learning_rate": 9.055299539170508e-05, "loss": 0.2578, "step": 950 }, { "epoch": 2.2119815668202767, "grad_norm": 2.232977867126465, "learning_rate": 8.940092165898618e-05, "loss": 0.2168, "step": 960 }, { "epoch": 2.2350230414746544, "grad_norm": 4.8774847984313965, "learning_rate": 8.824884792626729e-05, "loss": 0.1978, "step": 970 }, { "epoch": 2.258064516129032, "grad_norm": 2.6131808757781982, "learning_rate": 8.709677419354839e-05, "loss": 0.223, "step": 980 }, { "epoch": 2.28110599078341, "grad_norm": 1.6126481294631958, "learning_rate": 8.594470046082949e-05, "loss": 0.3882, "step": 990 }, { "epoch": 2.3041474654377883, "grad_norm": 1.6977124214172363, "learning_rate": 8.479262672811061e-05, "loss": 0.2767, "step": 1000 }, { "epoch": 2.3041474654377883, "eval_accuracy": 0.8090787716955942, "eval_loss": 0.5795237421989441, "eval_runtime": 11.3869, "eval_samples_per_second": 131.555, "eval_steps_per_second": 16.51, "step": 1000 }, { "epoch": 2.327188940092166, "grad_norm": 5.384529113769531, "learning_rate": 8.364055299539171e-05, "loss": 0.2478, "step": 1010 }, { "epoch": 2.3502304147465436, "grad_norm": 7.527071952819824, "learning_rate": 8.248847926267282e-05, "loss": 0.1614, "step": 1020 }, { "epoch": 2.3732718894009217, "grad_norm": 3.253967523574829, "learning_rate": 8.133640552995392e-05, "loss": 0.1988, "step": 1030 }, { "epoch": 2.3963133640552994, "grad_norm": 2.3061683177948, "learning_rate": 8.018433179723502e-05, "loss": 0.2267, "step": 1040 }, { "epoch": 2.4193548387096775, "grad_norm": 5.240030288696289, "learning_rate": 7.903225806451613e-05, "loss": 0.3522, "step": 1050 }, { "epoch": 2.442396313364055, "grad_norm": 5.367170810699463, "learning_rate": 7.788018433179723e-05, "loss": 0.21, "step": 1060 }, { "epoch": 2.4654377880184333, "grad_norm": 2.52602219581604, "learning_rate": 7.672811059907835e-05, "loss": 0.208, "step": 1070 }, { "epoch": 2.488479262672811, "grad_norm": 3.110276937484741, "learning_rate": 7.557603686635945e-05, "loss": 0.1624, "step": 1080 }, { "epoch": 2.511520737327189, "grad_norm": 3.7577178478240967, "learning_rate": 7.442396313364057e-05, "loss": 0.2187, "step": 1090 }, { "epoch": 2.5345622119815667, "grad_norm": 0.886064887046814, "learning_rate": 7.327188940092167e-05, "loss": 0.2387, "step": 1100 }, { "epoch": 2.5345622119815667, "eval_accuracy": 0.8090787716955942, "eval_loss": 0.6099982857704163, "eval_runtime": 11.7513, "eval_samples_per_second": 127.476, "eval_steps_per_second": 15.998, "step": 1100 }, { "epoch": 2.557603686635945, "grad_norm": 0.9772585034370422, "learning_rate": 7.211981566820277e-05, "loss": 0.2289, "step": 1110 }, { "epoch": 2.5806451612903225, "grad_norm": 5.879600524902344, "learning_rate": 7.096774193548388e-05, "loss": 0.2592, "step": 1120 }, { "epoch": 2.6036866359447006, "grad_norm": 5.125580310821533, "learning_rate": 6.981566820276498e-05, "loss": 0.1801, "step": 1130 }, { "epoch": 2.6267281105990783, "grad_norm": 4.4502692222595215, "learning_rate": 6.86635944700461e-05, "loss": 0.3577, "step": 1140 }, { "epoch": 2.6497695852534564, "grad_norm": 0.543267548084259, "learning_rate": 6.75115207373272e-05, "loss": 0.1313, "step": 1150 }, { "epoch": 2.672811059907834, "grad_norm": 1.4891630411148071, "learning_rate": 6.63594470046083e-05, "loss": 0.1858, "step": 1160 }, { "epoch": 2.6958525345622117, "grad_norm": 2.359645366668701, "learning_rate": 6.52073732718894e-05, "loss": 0.2059, "step": 1170 }, { "epoch": 2.71889400921659, "grad_norm": 2.5760185718536377, "learning_rate": 6.405529953917051e-05, "loss": 0.2378, "step": 1180 }, { "epoch": 2.741935483870968, "grad_norm": 0.24703356623649597, "learning_rate": 6.290322580645161e-05, "loss": 0.1487, "step": 1190 }, { "epoch": 2.7649769585253456, "grad_norm": 0.22307877242565155, "learning_rate": 6.175115207373272e-05, "loss": 0.1691, "step": 1200 }, { "epoch": 2.7649769585253456, "eval_accuracy": 0.8070761014686249, "eval_loss": 0.6174820065498352, "eval_runtime": 11.265, "eval_samples_per_second": 132.978, "eval_steps_per_second": 16.689, "step": 1200 }, { "epoch": 2.7880184331797233, "grad_norm": 2.50034761428833, "learning_rate": 6.0599078341013825e-05, "loss": 0.2148, "step": 1210 }, { "epoch": 2.8110599078341014, "grad_norm": 0.3251860439777374, "learning_rate": 5.944700460829493e-05, "loss": 0.1538, "step": 1220 }, { "epoch": 2.8341013824884795, "grad_norm": 3.687969446182251, "learning_rate": 5.829493087557604e-05, "loss": 0.2445, "step": 1230 }, { "epoch": 2.857142857142857, "grad_norm": 7.214417457580566, "learning_rate": 5.714285714285714e-05, "loss": 0.229, "step": 1240 }, { "epoch": 2.880184331797235, "grad_norm": 2.587062120437622, "learning_rate": 5.5990783410138245e-05, "loss": 0.1999, "step": 1250 }, { "epoch": 2.903225806451613, "grad_norm": 4.365920066833496, "learning_rate": 5.4838709677419355e-05, "loss": 0.1061, "step": 1260 }, { "epoch": 2.9262672811059907, "grad_norm": 3.7295572757720947, "learning_rate": 5.368663594470046e-05, "loss": 0.3093, "step": 1270 }, { "epoch": 2.9493087557603688, "grad_norm": 2.4992685317993164, "learning_rate": 5.253456221198156e-05, "loss": 0.1644, "step": 1280 }, { "epoch": 2.9723502304147464, "grad_norm": 5.495995998382568, "learning_rate": 5.138248847926268e-05, "loss": 0.2393, "step": 1290 }, { "epoch": 2.9953917050691246, "grad_norm": 2.1380579471588135, "learning_rate": 5.023041474654379e-05, "loss": 0.1738, "step": 1300 }, { "epoch": 2.9953917050691246, "eval_accuracy": 0.8197596795727636, "eval_loss": 0.5877332091331482, "eval_runtime": 11.4089, "eval_samples_per_second": 131.301, "eval_steps_per_second": 16.478, "step": 1300 }, { "epoch": 3.0184331797235022, "grad_norm": 6.119831085205078, "learning_rate": 4.9078341013824885e-05, "loss": 0.075, "step": 1310 }, { "epoch": 3.0414746543778803, "grad_norm": 0.25446683168411255, "learning_rate": 4.792626728110599e-05, "loss": 0.0528, "step": 1320 }, { "epoch": 3.064516129032258, "grad_norm": 0.32773900032043457, "learning_rate": 4.67741935483871e-05, "loss": 0.0551, "step": 1330 }, { "epoch": 3.087557603686636, "grad_norm": 0.8912816643714905, "learning_rate": 4.562211981566821e-05, "loss": 0.0799, "step": 1340 }, { "epoch": 3.110599078341014, "grad_norm": 0.6732431054115295, "learning_rate": 4.447004608294931e-05, "loss": 0.0327, "step": 1350 }, { "epoch": 3.133640552995392, "grad_norm": 5.909882545471191, "learning_rate": 4.3317972350230415e-05, "loss": 0.108, "step": 1360 }, { "epoch": 3.1566820276497696, "grad_norm": 1.3546661138534546, "learning_rate": 4.2165898617511525e-05, "loss": 0.1057, "step": 1370 }, { "epoch": 3.1797235023041477, "grad_norm": 0.09205944836139679, "learning_rate": 4.101382488479263e-05, "loss": 0.045, "step": 1380 }, { "epoch": 3.2027649769585254, "grad_norm": 0.12445586174726486, "learning_rate": 3.986175115207373e-05, "loss": 0.0391, "step": 1390 }, { "epoch": 3.225806451612903, "grad_norm": 0.49267128109931946, "learning_rate": 3.870967741935484e-05, "loss": 0.0397, "step": 1400 }, { "epoch": 3.225806451612903, "eval_accuracy": 0.835781041388518, "eval_loss": 0.576629102230072, "eval_runtime": 11.5658, "eval_samples_per_second": 129.52, "eval_steps_per_second": 16.255, "step": 1400 }, { "epoch": 3.248847926267281, "grad_norm": 0.24710910022258759, "learning_rate": 3.7557603686635945e-05, "loss": 0.0982, "step": 1410 }, { "epoch": 3.271889400921659, "grad_norm": 1.3541345596313477, "learning_rate": 3.640552995391705e-05, "loss": 0.1062, "step": 1420 }, { "epoch": 3.294930875576037, "grad_norm": 0.07805185765028, "learning_rate": 3.525345622119816e-05, "loss": 0.0367, "step": 1430 }, { "epoch": 3.3179723502304146, "grad_norm": 0.704824686050415, "learning_rate": 3.410138248847927e-05, "loss": 0.0576, "step": 1440 }, { "epoch": 3.3410138248847927, "grad_norm": 3.216744899749756, "learning_rate": 3.294930875576037e-05, "loss": 0.123, "step": 1450 }, { "epoch": 3.3640552995391704, "grad_norm": 3.2812583446502686, "learning_rate": 3.1797235023041475e-05, "loss": 0.0535, "step": 1460 }, { "epoch": 3.3870967741935485, "grad_norm": 0.09345371276140213, "learning_rate": 3.0645161290322585e-05, "loss": 0.0363, "step": 1470 }, { "epoch": 3.410138248847926, "grad_norm": 0.5610162019729614, "learning_rate": 2.9493087557603688e-05, "loss": 0.0903, "step": 1480 }, { "epoch": 3.4331797235023043, "grad_norm": 1.413180947303772, "learning_rate": 2.8341013824884795e-05, "loss": 0.0792, "step": 1490 }, { "epoch": 3.456221198156682, "grad_norm": 6.735473155975342, "learning_rate": 2.7188940092165898e-05, "loss": 0.03, "step": 1500 }, { "epoch": 3.456221198156682, "eval_accuracy": 0.8371161548731643, "eval_loss": 0.5680701732635498, "eval_runtime": 11.6369, "eval_samples_per_second": 128.728, "eval_steps_per_second": 16.155, "step": 1500 }, { "epoch": 3.47926267281106, "grad_norm": 1.4329415559768677, "learning_rate": 2.6036866359447005e-05, "loss": 0.0206, "step": 1510 }, { "epoch": 3.5023041474654377, "grad_norm": 0.0513407364487648, "learning_rate": 2.488479262672811e-05, "loss": 0.0637, "step": 1520 }, { "epoch": 3.525345622119816, "grad_norm": 0.09985367208719254, "learning_rate": 2.3732718894009218e-05, "loss": 0.0829, "step": 1530 }, { "epoch": 3.5483870967741935, "grad_norm": 0.0632900595664978, "learning_rate": 2.258064516129032e-05, "loss": 0.0329, "step": 1540 }, { "epoch": 3.571428571428571, "grad_norm": 0.23229588568210602, "learning_rate": 2.1428571428571428e-05, "loss": 0.0709, "step": 1550 }, { "epoch": 3.5944700460829493, "grad_norm": 0.15025608241558075, "learning_rate": 2.0276497695852538e-05, "loss": 0.1135, "step": 1560 }, { "epoch": 3.6175115207373274, "grad_norm": 5.933778285980225, "learning_rate": 1.912442396313364e-05, "loss": 0.1093, "step": 1570 }, { "epoch": 3.640552995391705, "grad_norm": 0.06949874013662338, "learning_rate": 1.7972350230414748e-05, "loss": 0.0498, "step": 1580 }, { "epoch": 3.6635944700460827, "grad_norm": 0.09838402271270752, "learning_rate": 1.682027649769585e-05, "loss": 0.0598, "step": 1590 }, { "epoch": 3.686635944700461, "grad_norm": 0.9366612434387207, "learning_rate": 1.5668202764976958e-05, "loss": 0.092, "step": 1600 }, { "epoch": 3.686635944700461, "eval_accuracy": 0.8451268357810414, "eval_loss": 0.5304816365242004, "eval_runtime": 11.6024, "eval_samples_per_second": 129.111, "eval_steps_per_second": 16.203, "step": 1600 }, { "epoch": 3.709677419354839, "grad_norm": 0.04733530059456825, "learning_rate": 1.4516129032258066e-05, "loss": 0.0276, "step": 1610 }, { "epoch": 3.7327188940092166, "grad_norm": 0.08603022992610931, "learning_rate": 1.3364055299539171e-05, "loss": 0.0347, "step": 1620 }, { "epoch": 3.7557603686635943, "grad_norm": 0.041543856263160706, "learning_rate": 1.2211981566820276e-05, "loss": 0.026, "step": 1630 }, { "epoch": 3.7788018433179724, "grad_norm": 0.24026305973529816, "learning_rate": 1.1059907834101383e-05, "loss": 0.0496, "step": 1640 }, { "epoch": 3.80184331797235, "grad_norm": 0.03894612938165665, "learning_rate": 9.90783410138249e-06, "loss": 0.0365, "step": 1650 }, { "epoch": 3.824884792626728, "grad_norm": 4.442405700683594, "learning_rate": 8.755760368663595e-06, "loss": 0.0402, "step": 1660 }, { "epoch": 3.847926267281106, "grad_norm": 0.032657474279403687, "learning_rate": 7.603686635944701e-06, "loss": 0.0596, "step": 1670 }, { "epoch": 3.870967741935484, "grad_norm": 2.9635491371154785, "learning_rate": 6.451612903225806e-06, "loss": 0.0835, "step": 1680 }, { "epoch": 3.8940092165898617, "grad_norm": 0.06601913273334503, "learning_rate": 5.299539170506913e-06, "loss": 0.0277, "step": 1690 }, { "epoch": 3.9170506912442398, "grad_norm": 0.22990980744361877, "learning_rate": 4.147465437788019e-06, "loss": 0.0416, "step": 1700 }, { "epoch": 3.9170506912442398, "eval_accuracy": 0.8471295060080107, "eval_loss": 0.5442608594894409, "eval_runtime": 11.486, "eval_samples_per_second": 130.42, "eval_steps_per_second": 16.368, "step": 1700 }, { "epoch": 3.9400921658986174, "grad_norm": 0.06300857663154602, "learning_rate": 2.9953917050691243e-06, "loss": 0.0331, "step": 1710 }, { "epoch": 3.9631336405529956, "grad_norm": 1.9846687316894531, "learning_rate": 1.8433179723502305e-06, "loss": 0.04, "step": 1720 }, { "epoch": 3.986175115207373, "grad_norm": 0.4808693826198578, "learning_rate": 6.912442396313364e-07, "loss": 0.0494, "step": 1730 }, { "epoch": 4.0, "step": 1736, "total_flos": 2.1525139607212524e+18, "train_loss": 0.4232822818690181, "train_runtime": 559.5799, "train_samples_per_second": 49.637, "train_steps_per_second": 3.102 } ], "logging_steps": 10, "max_steps": 1736, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1525139607212524e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }