{ "best_metric": 1.0, "best_model_checkpoint": "resnet-50-finetuned-student_kaggle/checkpoint-423", "epoch": 20.0, "eval_steps": 500, "global_step": 940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2127659574468085, "grad_norm": 54.35947799682617, "learning_rate": 5.319148936170213e-06, "loss": 0.9341, "step": 10 }, { "epoch": 0.425531914893617, "grad_norm": 34.59556579589844, "learning_rate": 1.0638297872340426e-05, "loss": 0.9157, "step": 20 }, { "epoch": 0.6382978723404256, "grad_norm": 42.179847717285156, "learning_rate": 1.595744680851064e-05, "loss": 0.7801, "step": 30 }, { "epoch": 0.851063829787234, "grad_norm": 30.099029541015625, "learning_rate": 2.1276595744680852e-05, "loss": 0.7142, "step": 40 }, { "epoch": 1.0, "eval_accuracy": 0.610062893081761, "eval_loss": 0.6418222188949585, "eval_runtime": 7.6299, "eval_samples_per_second": 83.356, "eval_steps_per_second": 2.621, "step": 47 }, { "epoch": 1.0638297872340425, "grad_norm": 48.046546936035156, "learning_rate": 2.6595744680851064e-05, "loss": 0.7114, "step": 50 }, { "epoch": 1.2765957446808511, "grad_norm": 45.94879913330078, "learning_rate": 3.191489361702128e-05, "loss": 0.6014, "step": 60 }, { "epoch": 1.4893617021276595, "grad_norm": 17.69209861755371, "learning_rate": 3.723404255319149e-05, "loss": 0.4815, "step": 70 }, { "epoch": 1.702127659574468, "grad_norm": 18.821670532226562, "learning_rate": 4.2553191489361704e-05, "loss": 0.463, "step": 80 }, { "epoch": 1.9148936170212765, "grad_norm": 23.751588821411133, "learning_rate": 4.787234042553192e-05, "loss": 0.3351, "step": 90 }, { "epoch": 2.0, "eval_accuracy": 0.8946540880503144, "eval_loss": 0.25965991616249084, "eval_runtime": 7.6659, "eval_samples_per_second": 82.964, "eval_steps_per_second": 2.609, "step": 94 }, { "epoch": 2.127659574468085, "grad_norm": 18.11069679260254, "learning_rate": 4.964539007092199e-05, "loss": 0.3193, "step": 100 }, { "epoch": 2.3404255319148937, "grad_norm": 12.397391319274902, "learning_rate": 4.905437352245863e-05, "loss": 0.2768, "step": 110 }, { "epoch": 2.5531914893617023, "grad_norm": 16.857635498046875, "learning_rate": 4.846335697399527e-05, "loss": 0.2594, "step": 120 }, { "epoch": 2.7659574468085104, "grad_norm": 12.635449409484863, "learning_rate": 4.787234042553192e-05, "loss": 0.2063, "step": 130 }, { "epoch": 2.978723404255319, "grad_norm": 15.277303695678711, "learning_rate": 4.728132387706856e-05, "loss": 0.2574, "step": 140 }, { "epoch": 3.0, "eval_accuracy": 0.9779874213836478, "eval_loss": 0.10460298508405685, "eval_runtime": 8.3391, "eval_samples_per_second": 76.267, "eval_steps_per_second": 2.398, "step": 141 }, { "epoch": 3.1914893617021276, "grad_norm": 14.497098922729492, "learning_rate": 4.669030732860521e-05, "loss": 0.2349, "step": 150 }, { "epoch": 3.404255319148936, "grad_norm": 17.647092819213867, "learning_rate": 4.609929078014185e-05, "loss": 0.1631, "step": 160 }, { "epoch": 3.617021276595745, "grad_norm": 12.856146812438965, "learning_rate": 4.550827423167849e-05, "loss": 0.1675, "step": 170 }, { "epoch": 3.829787234042553, "grad_norm": 7.248583793640137, "learning_rate": 4.491725768321513e-05, "loss": 0.1479, "step": 180 }, { "epoch": 4.0, "eval_accuracy": 0.9874213836477987, "eval_loss": 0.061614990234375, "eval_runtime": 8.4097, "eval_samples_per_second": 75.627, "eval_steps_per_second": 2.378, "step": 188 }, { "epoch": 4.042553191489362, "grad_norm": 25.721847534179688, "learning_rate": 4.432624113475177e-05, "loss": 0.1528, "step": 190 }, { "epoch": 4.25531914893617, "grad_norm": 6.252942085266113, "learning_rate": 4.373522458628842e-05, "loss": 0.145, "step": 200 }, { "epoch": 4.468085106382979, "grad_norm": 6.672601222991943, "learning_rate": 4.3144208037825064e-05, "loss": 0.1247, "step": 210 }, { "epoch": 4.680851063829787, "grad_norm": 20.4866886138916, "learning_rate": 4.2553191489361704e-05, "loss": 0.1405, "step": 220 }, { "epoch": 4.8936170212765955, "grad_norm": 19.644893646240234, "learning_rate": 4.1962174940898345e-05, "loss": 0.1284, "step": 230 }, { "epoch": 5.0, "eval_accuracy": 0.9952830188679245, "eval_loss": 0.02317511849105358, "eval_runtime": 10.3441, "eval_samples_per_second": 61.485, "eval_steps_per_second": 1.933, "step": 235 }, { "epoch": 5.1063829787234045, "grad_norm": 10.905556678771973, "learning_rate": 4.1371158392434986e-05, "loss": 0.1178, "step": 240 }, { "epoch": 5.319148936170213, "grad_norm": 11.02078628540039, "learning_rate": 4.078014184397163e-05, "loss": 0.1176, "step": 250 }, { "epoch": 5.531914893617021, "grad_norm": 7.510810375213623, "learning_rate": 4.018912529550828e-05, "loss": 0.0881, "step": 260 }, { "epoch": 5.74468085106383, "grad_norm": 2.0541610717773438, "learning_rate": 3.959810874704492e-05, "loss": 0.1274, "step": 270 }, { "epoch": 5.957446808510638, "grad_norm": 7.680713653564453, "learning_rate": 3.900709219858156e-05, "loss": 0.077, "step": 280 }, { "epoch": 6.0, "eval_accuracy": 0.9952830188679245, "eval_loss": 0.015012426301836967, "eval_runtime": 8.4118, "eval_samples_per_second": 75.608, "eval_steps_per_second": 2.378, "step": 282 }, { "epoch": 6.170212765957447, "grad_norm": 8.117420196533203, "learning_rate": 3.84160756501182e-05, "loss": 0.172, "step": 290 }, { "epoch": 6.382978723404255, "grad_norm": 23.871868133544922, "learning_rate": 3.782505910165485e-05, "loss": 0.0613, "step": 300 }, { "epoch": 6.595744680851064, "grad_norm": 15.407998085021973, "learning_rate": 3.723404255319149e-05, "loss": 0.1287, "step": 310 }, { "epoch": 6.808510638297872, "grad_norm": 6.940992832183838, "learning_rate": 3.664302600472813e-05, "loss": 0.103, "step": 320 }, { "epoch": 7.0, "eval_accuracy": 0.9984276729559748, "eval_loss": 0.010532047599554062, "eval_runtime": 7.9689, "eval_samples_per_second": 79.81, "eval_steps_per_second": 2.51, "step": 329 }, { "epoch": 7.0212765957446805, "grad_norm": 4.598968029022217, "learning_rate": 3.605200945626478e-05, "loss": 0.0906, "step": 330 }, { "epoch": 7.23404255319149, "grad_norm": 7.53684663772583, "learning_rate": 3.546099290780142e-05, "loss": 0.0792, "step": 340 }, { "epoch": 7.446808510638298, "grad_norm": 2.750072479248047, "learning_rate": 3.4869976359338065e-05, "loss": 0.091, "step": 350 }, { "epoch": 7.659574468085106, "grad_norm": 4.067008018493652, "learning_rate": 3.4278959810874706e-05, "loss": 0.0777, "step": 360 }, { "epoch": 7.872340425531915, "grad_norm": 15.093037605285645, "learning_rate": 3.3687943262411347e-05, "loss": 0.0922, "step": 370 }, { "epoch": 8.0, "eval_accuracy": 0.9984276729559748, "eval_loss": 0.009353628382086754, "eval_runtime": 7.7264, "eval_samples_per_second": 82.315, "eval_steps_per_second": 2.589, "step": 376 }, { "epoch": 8.085106382978724, "grad_norm": 8.675619125366211, "learning_rate": 3.309692671394799e-05, "loss": 0.1211, "step": 380 }, { "epoch": 8.297872340425531, "grad_norm": 5.723608493804932, "learning_rate": 3.2505910165484634e-05, "loss": 0.0645, "step": 390 }, { "epoch": 8.51063829787234, "grad_norm": 8.031245231628418, "learning_rate": 3.191489361702128e-05, "loss": 0.0787, "step": 400 }, { "epoch": 8.72340425531915, "grad_norm": 2.483238935470581, "learning_rate": 3.132387706855792e-05, "loss": 0.0683, "step": 410 }, { "epoch": 8.936170212765958, "grad_norm": 7.612273216247559, "learning_rate": 3.073286052009456e-05, "loss": 0.08, "step": 420 }, { "epoch": 9.0, "eval_accuracy": 1.0, "eval_loss": 0.00555912172421813, "eval_runtime": 8.4427, "eval_samples_per_second": 75.331, "eval_steps_per_second": 2.369, "step": 423 }, { "epoch": 9.148936170212766, "grad_norm": 2.0842654705047607, "learning_rate": 3.0141843971631207e-05, "loss": 0.0812, "step": 430 }, { "epoch": 9.361702127659575, "grad_norm": 17.500883102416992, "learning_rate": 2.9550827423167847e-05, "loss": 0.0688, "step": 440 }, { "epoch": 9.574468085106384, "grad_norm": 9.671988487243652, "learning_rate": 2.895981087470449e-05, "loss": 0.0837, "step": 450 }, { "epoch": 9.787234042553191, "grad_norm": 13.061412811279297, "learning_rate": 2.836879432624114e-05, "loss": 0.1024, "step": 460 }, { "epoch": 10.0, "grad_norm": 11.679757118225098, "learning_rate": 2.777777777777778e-05, "loss": 0.0492, "step": 470 }, { "epoch": 10.0, "eval_accuracy": 1.0, "eval_loss": 0.004496446345001459, "eval_runtime": 9.0183, "eval_samples_per_second": 70.523, "eval_steps_per_second": 2.218, "step": 470 }, { "epoch": 10.212765957446809, "grad_norm": 3.253716230392456, "learning_rate": 2.7186761229314423e-05, "loss": 0.0495, "step": 480 }, { "epoch": 10.425531914893616, "grad_norm": 5.826026916503906, "learning_rate": 2.6595744680851064e-05, "loss": 0.0266, "step": 490 }, { "epoch": 10.638297872340425, "grad_norm": 9.738965034484863, "learning_rate": 2.6004728132387708e-05, "loss": 0.0739, "step": 500 }, { "epoch": 10.851063829787234, "grad_norm": 9.04916763305664, "learning_rate": 2.5413711583924348e-05, "loss": 0.0574, "step": 510 }, { "epoch": 11.0, "eval_accuracy": 1.0, "eval_loss": 0.004308629781007767, "eval_runtime": 8.4135, "eval_samples_per_second": 75.593, "eval_steps_per_second": 2.377, "step": 517 }, { "epoch": 11.063829787234043, "grad_norm": 11.041916847229004, "learning_rate": 2.4822695035460995e-05, "loss": 0.0977, "step": 520 }, { "epoch": 11.27659574468085, "grad_norm": 31.96723175048828, "learning_rate": 2.4231678486997636e-05, "loss": 0.0541, "step": 530 }, { "epoch": 11.48936170212766, "grad_norm": 2.877957344055176, "learning_rate": 2.364066193853428e-05, "loss": 0.0401, "step": 540 }, { "epoch": 11.702127659574469, "grad_norm": 3.020596742630005, "learning_rate": 2.3049645390070924e-05, "loss": 0.0284, "step": 550 }, { "epoch": 11.914893617021276, "grad_norm": 7.811126232147217, "learning_rate": 2.2458628841607564e-05, "loss": 0.0382, "step": 560 }, { "epoch": 12.0, "eval_accuracy": 1.0, "eval_loss": 0.0022806336637586355, "eval_runtime": 7.7232, "eval_samples_per_second": 82.35, "eval_steps_per_second": 2.59, "step": 564 }, { "epoch": 12.127659574468085, "grad_norm": 11.633199691772461, "learning_rate": 2.186761229314421e-05, "loss": 0.0589, "step": 570 }, { "epoch": 12.340425531914894, "grad_norm": 0.3253759443759918, "learning_rate": 2.1276595744680852e-05, "loss": 0.041, "step": 580 }, { "epoch": 12.553191489361701, "grad_norm": 2.2859044075012207, "learning_rate": 2.0685579196217493e-05, "loss": 0.0578, "step": 590 }, { "epoch": 12.76595744680851, "grad_norm": 1.5324259996414185, "learning_rate": 2.009456264775414e-05, "loss": 0.0312, "step": 600 }, { "epoch": 12.97872340425532, "grad_norm": 6.985143661499023, "learning_rate": 1.950354609929078e-05, "loss": 0.0666, "step": 610 }, { "epoch": 13.0, "eval_accuracy": 1.0, "eval_loss": 0.0022491966374218464, "eval_runtime": 8.1081, "eval_samples_per_second": 78.44, "eval_steps_per_second": 2.467, "step": 611 }, { "epoch": 13.191489361702128, "grad_norm": 8.847366333007812, "learning_rate": 1.8912529550827425e-05, "loss": 0.0539, "step": 620 }, { "epoch": 13.404255319148936, "grad_norm": 10.476814270019531, "learning_rate": 1.8321513002364065e-05, "loss": 0.0369, "step": 630 }, { "epoch": 13.617021276595745, "grad_norm": 5.339621067047119, "learning_rate": 1.773049645390071e-05, "loss": 0.0308, "step": 640 }, { "epoch": 13.829787234042554, "grad_norm": 14.648975372314453, "learning_rate": 1.7139479905437353e-05, "loss": 0.0477, "step": 650 }, { "epoch": 14.0, "eval_accuracy": 1.0, "eval_loss": 0.0021932125091552734, "eval_runtime": 8.4493, "eval_samples_per_second": 75.272, "eval_steps_per_second": 2.367, "step": 658 }, { "epoch": 14.042553191489361, "grad_norm": 5.510159969329834, "learning_rate": 1.6548463356973994e-05, "loss": 0.028, "step": 660 }, { "epoch": 14.25531914893617, "grad_norm": 5.803068161010742, "learning_rate": 1.595744680851064e-05, "loss": 0.0522, "step": 670 }, { "epoch": 14.46808510638298, "grad_norm": 1.1623107194900513, "learning_rate": 1.536643026004728e-05, "loss": 0.0481, "step": 680 }, { "epoch": 14.680851063829786, "grad_norm": 12.495600700378418, "learning_rate": 1.4775413711583924e-05, "loss": 0.0588, "step": 690 }, { "epoch": 14.893617021276595, "grad_norm": 3.4236888885498047, "learning_rate": 1.418439716312057e-05, "loss": 0.0614, "step": 700 }, { "epoch": 15.0, "eval_accuracy": 1.0, "eval_loss": 0.002270177938044071, "eval_runtime": 8.5563, "eval_samples_per_second": 74.331, "eval_steps_per_second": 2.337, "step": 705 }, { "epoch": 15.106382978723405, "grad_norm": 11.681058883666992, "learning_rate": 1.3593380614657212e-05, "loss": 0.0674, "step": 710 }, { "epoch": 15.319148936170214, "grad_norm": 1.846946120262146, "learning_rate": 1.3002364066193854e-05, "loss": 0.0415, "step": 720 }, { "epoch": 15.53191489361702, "grad_norm": 8.939858436584473, "learning_rate": 1.2411347517730498e-05, "loss": 0.0189, "step": 730 }, { "epoch": 15.74468085106383, "grad_norm": 3.521784782409668, "learning_rate": 1.182033096926714e-05, "loss": 0.0585, "step": 740 }, { "epoch": 15.957446808510639, "grad_norm": 1.9891993999481201, "learning_rate": 1.1229314420803782e-05, "loss": 0.0282, "step": 750 }, { "epoch": 16.0, "eval_accuracy": 1.0, "eval_loss": 0.0013930280692875385, "eval_runtime": 8.1789, "eval_samples_per_second": 77.761, "eval_steps_per_second": 2.445, "step": 752 }, { "epoch": 16.170212765957448, "grad_norm": 7.0705246925354, "learning_rate": 1.0638297872340426e-05, "loss": 0.0508, "step": 760 }, { "epoch": 16.382978723404257, "grad_norm": 11.365514755249023, "learning_rate": 1.004728132387707e-05, "loss": 0.0393, "step": 770 }, { "epoch": 16.595744680851062, "grad_norm": 8.82397747039795, "learning_rate": 9.456264775413712e-06, "loss": 0.0509, "step": 780 }, { "epoch": 16.80851063829787, "grad_norm": 5.013731002807617, "learning_rate": 8.865248226950355e-06, "loss": 0.0659, "step": 790 }, { "epoch": 17.0, "eval_accuracy": 1.0, "eval_loss": 0.0016287014586851, "eval_runtime": 7.6703, "eval_samples_per_second": 82.917, "eval_steps_per_second": 2.607, "step": 799 }, { "epoch": 17.02127659574468, "grad_norm": 2.8596644401550293, "learning_rate": 8.274231678486997e-06, "loss": 0.0285, "step": 800 }, { "epoch": 17.23404255319149, "grad_norm": 10.184608459472656, "learning_rate": 7.68321513002364e-06, "loss": 0.062, "step": 810 }, { "epoch": 17.4468085106383, "grad_norm": 6.029819011688232, "learning_rate": 7.092198581560285e-06, "loss": 0.0672, "step": 820 }, { "epoch": 17.659574468085108, "grad_norm": 0.9212875366210938, "learning_rate": 6.501182033096927e-06, "loss": 0.0404, "step": 830 }, { "epoch": 17.872340425531917, "grad_norm": 0.5147794485092163, "learning_rate": 5.91016548463357e-06, "loss": 0.0586, "step": 840 }, { "epoch": 18.0, "eval_accuracy": 1.0, "eval_loss": 0.0009691208251751959, "eval_runtime": 8.4381, "eval_samples_per_second": 75.373, "eval_steps_per_second": 2.37, "step": 846 }, { "epoch": 18.085106382978722, "grad_norm": 3.351142406463623, "learning_rate": 5.319148936170213e-06, "loss": 0.0333, "step": 850 }, { "epoch": 18.29787234042553, "grad_norm": 7.570976257324219, "learning_rate": 4.728132387706856e-06, "loss": 0.0401, "step": 860 }, { "epoch": 18.51063829787234, "grad_norm": 6.660007953643799, "learning_rate": 4.137115839243498e-06, "loss": 0.0329, "step": 870 }, { "epoch": 18.72340425531915, "grad_norm": 11.373592376708984, "learning_rate": 3.5460992907801423e-06, "loss": 0.0523, "step": 880 }, { "epoch": 18.93617021276596, "grad_norm": 4.553757667541504, "learning_rate": 2.955082742316785e-06, "loss": 0.0557, "step": 890 }, { "epoch": 19.0, "eval_accuracy": 1.0, "eval_loss": 0.0012750416062772274, "eval_runtime": 8.3957, "eval_samples_per_second": 75.753, "eval_steps_per_second": 2.382, "step": 893 }, { "epoch": 19.148936170212767, "grad_norm": 1.8293001651763916, "learning_rate": 2.364066193853428e-06, "loss": 0.0248, "step": 900 }, { "epoch": 19.361702127659573, "grad_norm": 3.5974695682525635, "learning_rate": 1.7730496453900712e-06, "loss": 0.0298, "step": 910 }, { "epoch": 19.574468085106382, "grad_norm": 4.631837844848633, "learning_rate": 1.182033096926714e-06, "loss": 0.0272, "step": 920 }, { "epoch": 19.78723404255319, "grad_norm": 1.5552431344985962, "learning_rate": 5.91016548463357e-07, "loss": 0.0281, "step": 930 }, { "epoch": 20.0, "grad_norm": 5.388515949249268, "learning_rate": 0.0, "loss": 0.07, "step": 940 }, { "epoch": 20.0, "eval_accuracy": 1.0, "eval_loss": 0.001178326434455812, "eval_runtime": 8.4555, "eval_samples_per_second": 75.217, "eval_steps_per_second": 2.365, "step": 940 }, { "epoch": 20.0, "step": 940, "total_flos": 6.302667737382912e+17, "train_loss": 0.13929352825309368, "train_runtime": 703.6937, "train_samples_per_second": 42.177, "train_steps_per_second": 1.336 } ], "logging_steps": 10, "max_steps": 940, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 6.302667737382912e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }