{ "best_metric": 0.7013603448867798, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_t5/t5_small_amazon/checkpoint-1100", "epoch": 3.0, "eval_steps": 50, "global_step": 1140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 2.4166171550750732, "learning_rate": 0.0004956140350877193, "loss": 3.1083, "step": 10 }, { "epoch": 0.05, "grad_norm": 2.1570193767547607, "learning_rate": 0.0004912280701754386, "loss": 2.9816, "step": 20 }, { "epoch": 0.08, "grad_norm": 1.6915223598480225, "learning_rate": 0.0004868421052631579, "loss": 2.9485, "step": 30 }, { "epoch": 0.11, "grad_norm": 1.6796109676361084, "learning_rate": 0.0004824561403508772, "loss": 2.5652, "step": 40 }, { "epoch": 0.13, "grad_norm": 2.0288569927215576, "learning_rate": 0.00047807017543859647, "loss": 2.1202, "step": 50 }, { "epoch": 0.13, "eval_accuracy": 0.5329380764163373, "eval_f1_macro": 0.384104867500989, "eval_f1_micro": 0.5329380764163373, "eval_loss": 1.808777093887329, "eval_runtime": 1.2462, "eval_samples_per_second": 1218.095, "eval_steps_per_second": 38.517, "step": 50 }, { "epoch": 0.16, "grad_norm": 2.4368162155151367, "learning_rate": 0.00047368421052631577, "loss": 1.8285, "step": 60 }, { "epoch": 0.18, "grad_norm": 2.4492692947387695, "learning_rate": 0.0004692982456140351, "loss": 1.5762, "step": 70 }, { "epoch": 0.21, "grad_norm": 4.4139299392700195, "learning_rate": 0.00046491228070175437, "loss": 1.3461, "step": 80 }, { "epoch": 0.24, "grad_norm": 2.3908071517944336, "learning_rate": 0.0004605263157894737, "loss": 1.3544, "step": 90 }, { "epoch": 0.26, "grad_norm": 2.259780168533325, "learning_rate": 0.000456140350877193, "loss": 1.1993, "step": 100 }, { "epoch": 0.26, "eval_accuracy": 0.6613965744400527, "eval_f1_macro": 0.5438670494720591, "eval_f1_micro": 0.6613965744400527, "eval_loss": 1.1745669841766357, "eval_runtime": 1.2956, "eval_samples_per_second": 1171.684, "eval_steps_per_second": 37.049, "step": 100 }, { "epoch": 0.29, "grad_norm": 2.785118341445923, "learning_rate": 0.00045175438596491233, "loss": 1.278, "step": 110 }, { "epoch": 0.32, "grad_norm": 3.2425591945648193, "learning_rate": 0.0004473684210526316, "loss": 1.0901, "step": 120 }, { "epoch": 0.34, "grad_norm": 3.0471508502960205, "learning_rate": 0.0004429824561403509, "loss": 1.1888, "step": 130 }, { "epoch": 0.37, "grad_norm": 2.5289652347564697, "learning_rate": 0.0004385964912280702, "loss": 0.9788, "step": 140 }, { "epoch": 0.39, "grad_norm": 2.7128379344940186, "learning_rate": 0.0004342105263157895, "loss": 1.0698, "step": 150 }, { "epoch": 0.39, "eval_accuracy": 0.6884057971014492, "eval_f1_macro": 0.6003720421134128, "eval_f1_micro": 0.6884057971014492, "eval_loss": 1.0115997791290283, "eval_runtime": 1.2952, "eval_samples_per_second": 1172.016, "eval_steps_per_second": 37.06, "step": 150 }, { "epoch": 0.42, "grad_norm": 3.8709490299224854, "learning_rate": 0.0004298245614035088, "loss": 1.0113, "step": 160 }, { "epoch": 0.45, "grad_norm": 3.408216714859009, "learning_rate": 0.0004254385964912281, "loss": 1.0532, "step": 170 }, { "epoch": 0.47, "grad_norm": 2.8890082836151123, "learning_rate": 0.00042105263157894734, "loss": 1.0167, "step": 180 }, { "epoch": 0.5, "grad_norm": 4.663267612457275, "learning_rate": 0.0004166666666666667, "loss": 1.0561, "step": 190 }, { "epoch": 0.53, "grad_norm": 2.8700387477874756, "learning_rate": 0.000412280701754386, "loss": 0.8999, "step": 200 }, { "epoch": 0.53, "eval_accuracy": 0.717391304347826, "eval_f1_macro": 0.6538801770527956, "eval_f1_micro": 0.717391304347826, "eval_loss": 0.9428409337997437, "eval_runtime": 1.2446, "eval_samples_per_second": 1219.646, "eval_steps_per_second": 38.566, "step": 200 }, { "epoch": 0.55, "grad_norm": 3.236957550048828, "learning_rate": 0.00040789473684210524, "loss": 0.8635, "step": 210 }, { "epoch": 0.58, "grad_norm": 5.146123886108398, "learning_rate": 0.00040350877192982455, "loss": 1.0005, "step": 220 }, { "epoch": 0.61, "grad_norm": 3.9536075592041016, "learning_rate": 0.0003991228070175439, "loss": 0.9122, "step": 230 }, { "epoch": 0.63, "grad_norm": 2.807563304901123, "learning_rate": 0.00039473684210526315, "loss": 0.9735, "step": 240 }, { "epoch": 0.66, "grad_norm": 7.1934919357299805, "learning_rate": 0.00039035087719298245, "loss": 1.1022, "step": 250 }, { "epoch": 0.66, "eval_accuracy": 0.7246376811594203, "eval_f1_macro": 0.6715689496561603, "eval_f1_micro": 0.7246376811594203, "eval_loss": 0.8931711316108704, "eval_runtime": 1.296, "eval_samples_per_second": 1171.291, "eval_steps_per_second": 37.037, "step": 250 }, { "epoch": 0.68, "grad_norm": 2.2508842945098877, "learning_rate": 0.00038596491228070175, "loss": 0.9077, "step": 260 }, { "epoch": 0.71, "grad_norm": 3.2862801551818848, "learning_rate": 0.00038157894736842105, "loss": 0.9932, "step": 270 }, { "epoch": 0.74, "grad_norm": 2.539895534515381, "learning_rate": 0.00037719298245614036, "loss": 0.8718, "step": 280 }, { "epoch": 0.76, "grad_norm": 4.113556385040283, "learning_rate": 0.00037280701754385966, "loss": 0.856, "step": 290 }, { "epoch": 0.79, "grad_norm": 2.2022409439086914, "learning_rate": 0.00036842105263157896, "loss": 0.8337, "step": 300 }, { "epoch": 0.79, "eval_accuracy": 0.7285902503293807, "eval_f1_macro": 0.6788777576959576, "eval_f1_micro": 0.7285902503293807, "eval_loss": 0.8664311766624451, "eval_runtime": 1.2586, "eval_samples_per_second": 1206.118, "eval_steps_per_second": 38.138, "step": 300 }, { "epoch": 0.82, "grad_norm": 3.421111583709717, "learning_rate": 0.00036403508771929826, "loss": 0.836, "step": 310 }, { "epoch": 0.84, "grad_norm": 4.356611251831055, "learning_rate": 0.00035964912280701756, "loss": 0.911, "step": 320 }, { "epoch": 0.87, "grad_norm": 4.17406702041626, "learning_rate": 0.00035526315789473687, "loss": 0.9708, "step": 330 }, { "epoch": 0.89, "grad_norm": 3.6954843997955322, "learning_rate": 0.0003508771929824561, "loss": 0.8783, "step": 340 }, { "epoch": 0.92, "grad_norm": 3.14982533454895, "learning_rate": 0.00034649122807017547, "loss": 0.9594, "step": 350 }, { "epoch": 0.92, "eval_accuracy": 0.7503293807641633, "eval_f1_macro": 0.6993639076591665, "eval_f1_micro": 0.7503293807641633, "eval_loss": 0.8445017337799072, "eval_runtime": 1.2645, "eval_samples_per_second": 1200.522, "eval_steps_per_second": 37.961, "step": 350 }, { "epoch": 0.95, "grad_norm": 3.1979424953460693, "learning_rate": 0.00034210526315789477, "loss": 0.7379, "step": 360 }, { "epoch": 0.97, "grad_norm": 2.852599620819092, "learning_rate": 0.000337719298245614, "loss": 0.931, "step": 370 }, { "epoch": 1.0, "grad_norm": 2.8022282123565674, "learning_rate": 0.0003333333333333333, "loss": 0.7579, "step": 380 }, { "epoch": 1.03, "grad_norm": 3.0786988735198975, "learning_rate": 0.0003289473684210527, "loss": 0.7347, "step": 390 }, { "epoch": 1.05, "grad_norm": 2.600757360458374, "learning_rate": 0.0003245614035087719, "loss": 0.803, "step": 400 }, { "epoch": 1.05, "eval_accuracy": 0.7529644268774703, "eval_f1_macro": 0.6995912826384776, "eval_f1_micro": 0.7529644268774703, "eval_loss": 0.8047745227813721, "eval_runtime": 1.3204, "eval_samples_per_second": 1149.687, "eval_steps_per_second": 36.354, "step": 400 }, { "epoch": 1.08, "grad_norm": 2.7453322410583496, "learning_rate": 0.00032017543859649123, "loss": 0.7006, "step": 410 }, { "epoch": 1.11, "grad_norm": 2.7500882148742676, "learning_rate": 0.00031578947368421053, "loss": 0.7501, "step": 420 }, { "epoch": 1.13, "grad_norm": 4.403367519378662, "learning_rate": 0.00031140350877192983, "loss": 0.8383, "step": 430 }, { "epoch": 1.16, "grad_norm": 2.546445369720459, "learning_rate": 0.00030701754385964913, "loss": 0.6768, "step": 440 }, { "epoch": 1.18, "grad_norm": 1.9148272275924683, "learning_rate": 0.00030263157894736844, "loss": 0.7271, "step": 450 }, { "epoch": 1.18, "eval_accuracy": 0.7602108036890646, "eval_f1_macro": 0.7019487651738747, "eval_f1_micro": 0.7602108036890646, "eval_loss": 0.7776466608047485, "eval_runtime": 1.3229, "eval_samples_per_second": 1147.482, "eval_steps_per_second": 36.284, "step": 450 }, { "epoch": 1.21, "grad_norm": 3.360862970352173, "learning_rate": 0.0002982456140350877, "loss": 0.8197, "step": 460 }, { "epoch": 1.24, "grad_norm": 2.3219194412231445, "learning_rate": 0.00029385964912280704, "loss": 0.6231, "step": 470 }, { "epoch": 1.26, "grad_norm": 4.6816229820251465, "learning_rate": 0.00028947368421052634, "loss": 0.7613, "step": 480 }, { "epoch": 1.29, "grad_norm": 3.6067404747009277, "learning_rate": 0.00028508771929824564, "loss": 0.6973, "step": 490 }, { "epoch": 1.32, "grad_norm": 2.7148826122283936, "learning_rate": 0.0002807017543859649, "loss": 0.6694, "step": 500 }, { "epoch": 1.32, "eval_accuracy": 0.7608695652173914, "eval_f1_macro": 0.7083962744871815, "eval_f1_micro": 0.7608695652173914, "eval_loss": 0.7674228549003601, "eval_runtime": 1.272, "eval_samples_per_second": 1193.385, "eval_steps_per_second": 37.735, "step": 500 }, { "epoch": 1.34, "grad_norm": 2.637134552001953, "learning_rate": 0.00027631578947368425, "loss": 0.6469, "step": 510 }, { "epoch": 1.37, "grad_norm": 2.232243776321411, "learning_rate": 0.00027192982456140355, "loss": 0.7557, "step": 520 }, { "epoch": 1.39, "grad_norm": 3.86612606048584, "learning_rate": 0.0002675438596491228, "loss": 0.7325, "step": 530 }, { "epoch": 1.42, "grad_norm": 2.2842824459075928, "learning_rate": 0.0002631578947368421, "loss": 0.5918, "step": 540 }, { "epoch": 1.45, "grad_norm": 2.8408637046813965, "learning_rate": 0.00025877192982456146, "loss": 0.6109, "step": 550 }, { "epoch": 1.45, "eval_accuracy": 0.7608695652173914, "eval_f1_macro": 0.7081134075783231, "eval_f1_micro": 0.7608695652173914, "eval_loss": 0.7647947669029236, "eval_runtime": 1.2692, "eval_samples_per_second": 1196.015, "eval_steps_per_second": 37.819, "step": 550 }, { "epoch": 1.47, "grad_norm": 2.67587947845459, "learning_rate": 0.0002543859649122807, "loss": 0.6937, "step": 560 }, { "epoch": 1.5, "grad_norm": 2.2099223136901855, "learning_rate": 0.00025, "loss": 0.6696, "step": 570 }, { "epoch": 1.53, "grad_norm": 2.383314609527588, "learning_rate": 0.0002456140350877193, "loss": 0.6252, "step": 580 }, { "epoch": 1.55, "grad_norm": 3.261552572250366, "learning_rate": 0.0002412280701754386, "loss": 0.5798, "step": 590 }, { "epoch": 1.58, "grad_norm": 2.702723741531372, "learning_rate": 0.00023684210526315788, "loss": 0.6575, "step": 600 }, { "epoch": 1.58, "eval_accuracy": 0.7628458498023716, "eval_f1_macro": 0.71173444087918, "eval_f1_micro": 0.7628458498023716, "eval_loss": 0.7526513934135437, "eval_runtime": 1.4987, "eval_samples_per_second": 1012.867, "eval_steps_per_second": 32.027, "step": 600 }, { "epoch": 1.61, "grad_norm": 2.8729443550109863, "learning_rate": 0.00023245614035087719, "loss": 0.6817, "step": 610 }, { "epoch": 1.63, "grad_norm": 2.480459213256836, "learning_rate": 0.0002280701754385965, "loss": 0.6792, "step": 620 }, { "epoch": 1.66, "grad_norm": 3.253549575805664, "learning_rate": 0.0002236842105263158, "loss": 0.6368, "step": 630 }, { "epoch": 1.68, "grad_norm": 3.7337493896484375, "learning_rate": 0.0002192982456140351, "loss": 0.7799, "step": 640 }, { "epoch": 1.71, "grad_norm": 3.4292104244232178, "learning_rate": 0.0002149122807017544, "loss": 0.777, "step": 650 }, { "epoch": 1.71, "eval_accuracy": 0.769433465085639, "eval_f1_macro": 0.7217630318046797, "eval_f1_micro": 0.769433465085639, "eval_loss": 0.7418988943099976, "eval_runtime": 1.2675, "eval_samples_per_second": 1197.616, "eval_steps_per_second": 37.869, "step": 650 }, { "epoch": 1.74, "grad_norm": 2.52197003364563, "learning_rate": 0.00021052631578947367, "loss": 0.5507, "step": 660 }, { "epoch": 1.76, "grad_norm": 3.5961380004882812, "learning_rate": 0.000206140350877193, "loss": 0.6555, "step": 670 }, { "epoch": 1.79, "grad_norm": 5.47299337387085, "learning_rate": 0.00020175438596491227, "loss": 0.748, "step": 680 }, { "epoch": 1.82, "grad_norm": 3.6607069969177246, "learning_rate": 0.00019736842105263157, "loss": 0.6491, "step": 690 }, { "epoch": 1.84, "grad_norm": 3.5043647289276123, "learning_rate": 0.00019298245614035088, "loss": 0.6362, "step": 700 }, { "epoch": 1.84, "eval_accuracy": 0.7799736495388669, "eval_f1_macro": 0.7301450284031649, "eval_f1_micro": 0.7799736495388669, "eval_loss": 0.7271929383277893, "eval_runtime": 1.2681, "eval_samples_per_second": 1197.064, "eval_steps_per_second": 37.852, "step": 700 }, { "epoch": 1.87, "grad_norm": 2.762425661087036, "learning_rate": 0.00018859649122807018, "loss": 0.7247, "step": 710 }, { "epoch": 1.89, "grad_norm": 2.687269926071167, "learning_rate": 0.00018421052631578948, "loss": 0.6264, "step": 720 }, { "epoch": 1.92, "grad_norm": 2.9003891944885254, "learning_rate": 0.00017982456140350878, "loss": 0.6243, "step": 730 }, { "epoch": 1.95, "grad_norm": 3.443100929260254, "learning_rate": 0.00017543859649122806, "loss": 0.7026, "step": 740 }, { "epoch": 1.97, "grad_norm": 2.562163829803467, "learning_rate": 0.00017105263157894739, "loss": 0.648, "step": 750 }, { "epoch": 1.97, "eval_accuracy": 0.7812911725955204, "eval_f1_macro": 0.7356498242584253, "eval_f1_micro": 0.7812911725955204, "eval_loss": 0.7136700749397278, "eval_runtime": 1.268, "eval_samples_per_second": 1197.205, "eval_steps_per_second": 37.856, "step": 750 }, { "epoch": 2.0, "grad_norm": 2.538322925567627, "learning_rate": 0.00016666666666666666, "loss": 0.6682, "step": 760 }, { "epoch": 2.03, "grad_norm": 2.9608592987060547, "learning_rate": 0.00016228070175438596, "loss": 0.554, "step": 770 }, { "epoch": 2.05, "grad_norm": 3.1620254516601562, "learning_rate": 0.00015789473684210527, "loss": 0.5446, "step": 780 }, { "epoch": 2.08, "grad_norm": 2.561002016067505, "learning_rate": 0.00015350877192982457, "loss": 0.5153, "step": 790 }, { "epoch": 2.11, "grad_norm": 2.196606397628784, "learning_rate": 0.00014912280701754384, "loss": 0.4981, "step": 800 }, { "epoch": 2.11, "eval_accuracy": 0.7766798418972332, "eval_f1_macro": 0.7258055584715567, "eval_f1_micro": 0.7766798418972332, "eval_loss": 0.7154478430747986, "eval_runtime": 1.3162, "eval_samples_per_second": 1153.362, "eval_steps_per_second": 36.47, "step": 800 }, { "epoch": 2.13, "grad_norm": 2.3084490299224854, "learning_rate": 0.00014473684210526317, "loss": 0.4972, "step": 810 }, { "epoch": 2.16, "grad_norm": 2.283141613006592, "learning_rate": 0.00014035087719298245, "loss": 0.4606, "step": 820 }, { "epoch": 2.18, "grad_norm": 4.426635265350342, "learning_rate": 0.00013596491228070177, "loss": 0.5108, "step": 830 }, { "epoch": 2.21, "grad_norm": 3.8946399688720703, "learning_rate": 0.00013157894736842105, "loss": 0.5072, "step": 840 }, { "epoch": 2.24, "grad_norm": 2.5790274143218994, "learning_rate": 0.00012719298245614035, "loss": 0.4955, "step": 850 }, { "epoch": 2.24, "eval_accuracy": 0.7799736495388669, "eval_f1_macro": 0.731846151928883, "eval_f1_micro": 0.7799736495388669, "eval_loss": 0.7233121395111084, "eval_runtime": 1.2667, "eval_samples_per_second": 1198.367, "eval_steps_per_second": 37.893, "step": 850 }, { "epoch": 2.26, "grad_norm": 3.8150198459625244, "learning_rate": 0.00012280701754385965, "loss": 0.4035, "step": 860 }, { "epoch": 2.29, "grad_norm": 2.8765792846679688, "learning_rate": 0.00011842105263157894, "loss": 0.5247, "step": 870 }, { "epoch": 2.32, "grad_norm": 3.2408721446990967, "learning_rate": 0.00011403508771929824, "loss": 0.5062, "step": 880 }, { "epoch": 2.34, "grad_norm": 2.066289186477661, "learning_rate": 0.00010964912280701755, "loss": 0.3746, "step": 890 }, { "epoch": 2.37, "grad_norm": 4.370830535888672, "learning_rate": 0.00010526315789473683, "loss": 0.4451, "step": 900 }, { "epoch": 2.37, "eval_accuracy": 0.7779973649538867, "eval_f1_macro": 0.7279848994870372, "eval_f1_micro": 0.7779973649538867, "eval_loss": 0.7181668877601624, "eval_runtime": 1.3175, "eval_samples_per_second": 1152.141, "eval_steps_per_second": 36.431, "step": 900 }, { "epoch": 2.39, "grad_norm": 2.3764281272888184, "learning_rate": 0.00010087719298245614, "loss": 0.5377, "step": 910 }, { "epoch": 2.42, "grad_norm": 3.0116217136383057, "learning_rate": 9.649122807017544e-05, "loss": 0.5044, "step": 920 }, { "epoch": 2.45, "grad_norm": 2.8177366256713867, "learning_rate": 9.210526315789474e-05, "loss": 0.5144, "step": 930 }, { "epoch": 2.47, "grad_norm": 3.101243257522583, "learning_rate": 8.771929824561403e-05, "loss": 0.6122, "step": 940 }, { "epoch": 2.5, "grad_norm": 2.797656297683716, "learning_rate": 8.333333333333333e-05, "loss": 0.421, "step": 950 }, { "epoch": 2.5, "eval_accuracy": 0.7747035573122529, "eval_f1_macro": 0.7261997754426944, "eval_f1_micro": 0.7747035573122529, "eval_loss": 0.7116832733154297, "eval_runtime": 1.2665, "eval_samples_per_second": 1198.612, "eval_steps_per_second": 37.901, "step": 950 }, { "epoch": 2.53, "grad_norm": 1.7224795818328857, "learning_rate": 7.894736842105263e-05, "loss": 0.5545, "step": 960 }, { "epoch": 2.55, "grad_norm": 1.7975401878356934, "learning_rate": 7.456140350877192e-05, "loss": 0.5856, "step": 970 }, { "epoch": 2.58, "grad_norm": 3.113168478012085, "learning_rate": 7.017543859649122e-05, "loss": 0.4921, "step": 980 }, { "epoch": 2.61, "grad_norm": 2.885319709777832, "learning_rate": 6.578947368421052e-05, "loss": 0.4638, "step": 990 }, { "epoch": 2.63, "grad_norm": 3.5493428707122803, "learning_rate": 6.140350877192983e-05, "loss": 0.4853, "step": 1000 }, { "epoch": 2.63, "eval_accuracy": 0.7760210803689065, "eval_f1_macro": 0.7271826503982889, "eval_f1_micro": 0.7760210803689065, "eval_loss": 0.7091566324234009, "eval_runtime": 1.3164, "eval_samples_per_second": 1153.153, "eval_steps_per_second": 36.463, "step": 1000 }, { "epoch": 2.66, "grad_norm": 3.597778081893921, "learning_rate": 5.701754385964912e-05, "loss": 0.4753, "step": 1010 }, { "epoch": 2.68, "grad_norm": 2.5180883407592773, "learning_rate": 5.263157894736842e-05, "loss": 0.5155, "step": 1020 }, { "epoch": 2.71, "grad_norm": 2.6854727268218994, "learning_rate": 4.824561403508772e-05, "loss": 0.4774, "step": 1030 }, { "epoch": 2.74, "grad_norm": 3.311980962753296, "learning_rate": 4.3859649122807014e-05, "loss": 0.524, "step": 1040 }, { "epoch": 2.76, "grad_norm": 4.007936477661133, "learning_rate": 3.9473684210526316e-05, "loss": 0.5442, "step": 1050 }, { "epoch": 2.76, "eval_accuracy": 0.7740447957839263, "eval_f1_macro": 0.7272104730436151, "eval_f1_micro": 0.7740447957839263, "eval_loss": 0.7113538384437561, "eval_runtime": 1.3148, "eval_samples_per_second": 1154.575, "eval_steps_per_second": 36.508, "step": 1050 }, { "epoch": 2.79, "grad_norm": 2.6298959255218506, "learning_rate": 3.508771929824561e-05, "loss": 0.5173, "step": 1060 }, { "epoch": 2.82, "grad_norm": 2.9783828258514404, "learning_rate": 3.0701754385964913e-05, "loss": 0.6384, "step": 1070 }, { "epoch": 2.84, "grad_norm": 3.03838849067688, "learning_rate": 2.631578947368421e-05, "loss": 0.5627, "step": 1080 }, { "epoch": 2.87, "grad_norm": 2.553374767303467, "learning_rate": 2.1929824561403507e-05, "loss": 0.4947, "step": 1090 }, { "epoch": 2.89, "grad_norm": 2.0844762325286865, "learning_rate": 1.7543859649122806e-05, "loss": 0.4863, "step": 1100 }, { "epoch": 2.89, "eval_accuracy": 0.7766798418972332, "eval_f1_macro": 0.7273078975523218, "eval_f1_micro": 0.7766798418972332, "eval_loss": 0.7013603448867798, "eval_runtime": 1.3157, "eval_samples_per_second": 1153.75, "eval_steps_per_second": 36.482, "step": 1100 }, { "epoch": 2.92, "grad_norm": 1.7584428787231445, "learning_rate": 1.3157894736842104e-05, "loss": 0.516, "step": 1110 }, { "epoch": 2.95, "grad_norm": 2.012805700302124, "learning_rate": 8.771929824561403e-06, "loss": 0.6011, "step": 1120 }, { "epoch": 2.97, "grad_norm": 3.4424493312835693, "learning_rate": 4.3859649122807014e-06, "loss": 0.4124, "step": 1130 }, { "epoch": 3.0, "grad_norm": 2.8425490856170654, "learning_rate": 0.0, "loss": 0.4705, "step": 1140 }, { "epoch": 3.0, "step": 1140, "total_flos": 1242006489661440.0, "train_loss": 0.8189868109268055, "train_runtime": 138.2392, "train_samples_per_second": 263.543, "train_steps_per_second": 8.247 } ], "logging_steps": 10, "max_steps": 1140, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 1242006489661440.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }