{
  "best_metric": 0.9064,
  "best_model_checkpoint": "resnet-18-finetuned-cifar10/checkpoint-819",
  "epoch": 2.9945155393053016,
  "eval_steps": 500,
  "global_step": 819,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 16.46918296813965,
      "learning_rate": 6.0975609756097564e-06,
      "loss": 3.2109,
      "step": 10
    },
    {
      "epoch": 0.07,
      "grad_norm": 22.162967681884766,
      "learning_rate": 1.2195121951219513e-05,
      "loss": 2.929,
      "step": 20
    },
    {
      "epoch": 0.11,
      "grad_norm": 18.4061336517334,
      "learning_rate": 1.8292682926829268e-05,
      "loss": 2.6339,
      "step": 30
    },
    {
      "epoch": 0.15,
      "grad_norm": 14.292740821838379,
      "learning_rate": 2.4390243902439026e-05,
      "loss": 2.2668,
      "step": 40
    },
    {
      "epoch": 0.18,
      "grad_norm": 15.263444900512695,
      "learning_rate": 3.048780487804878e-05,
      "loss": 1.947,
      "step": 50
    },
    {
      "epoch": 0.22,
      "grad_norm": 18.372751235961914,
      "learning_rate": 3.6585365853658535e-05,
      "loss": 1.7001,
      "step": 60
    },
    {
      "epoch": 0.26,
      "grad_norm": 14.107364654541016,
      "learning_rate": 4.26829268292683e-05,
      "loss": 1.4365,
      "step": 70
    },
    {
      "epoch": 0.29,
      "grad_norm": 11.996474266052246,
      "learning_rate": 4.878048780487805e-05,
      "loss": 1.3653,
      "step": 80
    },
    {
      "epoch": 0.33,
      "grad_norm": 13.601606369018555,
      "learning_rate": 4.94572591587517e-05,
      "loss": 1.2311,
      "step": 90
    },
    {
      "epoch": 0.37,
      "grad_norm": 12.406356811523438,
      "learning_rate": 4.877883310719132e-05,
      "loss": 1.2409,
      "step": 100
    },
    {
      "epoch": 0.4,
      "grad_norm": 10.42392349243164,
      "learning_rate": 4.810040705563094e-05,
      "loss": 1.1009,
      "step": 110
    },
    {
      "epoch": 0.44,
      "grad_norm": 11.649271011352539,
      "learning_rate": 4.742198100407056e-05,
      "loss": 1.1132,
      "step": 120
    },
    {
      "epoch": 0.48,
      "grad_norm": 10.590466499328613,
      "learning_rate": 4.674355495251018e-05,
      "loss": 1.0677,
      "step": 130
    },
    {
      "epoch": 0.51,
      "grad_norm": 10.409879684448242,
      "learning_rate": 4.60651289009498e-05,
      "loss": 1.0262,
      "step": 140
    },
    {
      "epoch": 0.55,
      "grad_norm": 11.766679763793945,
      "learning_rate": 4.5386702849389416e-05,
      "loss": 1.0399,
      "step": 150
    },
    {
      "epoch": 0.59,
      "grad_norm": 10.652596473693848,
      "learning_rate": 4.470827679782904e-05,
      "loss": 1.0203,
      "step": 160
    },
    {
      "epoch": 0.62,
      "grad_norm": 10.809771537780762,
      "learning_rate": 4.402985074626866e-05,
      "loss": 0.9633,
      "step": 170
    },
    {
      "epoch": 0.66,
      "grad_norm": 12.764290809631348,
      "learning_rate": 4.335142469470828e-05,
      "loss": 0.9355,
      "step": 180
    },
    {
      "epoch": 0.69,
      "grad_norm": 10.757213592529297,
      "learning_rate": 4.26729986431479e-05,
      "loss": 0.9337,
      "step": 190
    },
    {
      "epoch": 0.73,
      "grad_norm": 11.284212112426758,
      "learning_rate": 4.199457259158752e-05,
      "loss": 0.9314,
      "step": 200
    },
    {
      "epoch": 0.77,
      "grad_norm": 11.635641098022461,
      "learning_rate": 4.131614654002714e-05,
      "loss": 0.8976,
      "step": 210
    },
    {
      "epoch": 0.8,
      "grad_norm": 10.869036674499512,
      "learning_rate": 4.063772048846676e-05,
      "loss": 0.9065,
      "step": 220
    },
    {
      "epoch": 0.84,
      "grad_norm": 11.703560829162598,
      "learning_rate": 3.995929443690638e-05,
      "loss": 0.8444,
      "step": 230
    },
    {
      "epoch": 0.88,
      "grad_norm": 10.33545207977295,
      "learning_rate": 3.9280868385345995e-05,
      "loss": 0.825,
      "step": 240
    },
    {
      "epoch": 0.91,
      "grad_norm": 7.844155311584473,
      "learning_rate": 3.860244233378562e-05,
      "loss": 0.8369,
      "step": 250
    },
    {
      "epoch": 0.95,
      "grad_norm": 9.158990859985352,
      "learning_rate": 3.792401628222524e-05,
      "loss": 0.8422,
      "step": 260
    },
    {
      "epoch": 0.99,
      "grad_norm": 9.740525245666504,
      "learning_rate": 3.724559023066486e-05,
      "loss": 0.8502,
      "step": 270
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.8632,
      "eval_loss": 0.4063829481601715,
      "eval_runtime": 123.7138,
      "eval_samples_per_second": 121.248,
      "eval_steps_per_second": 3.791,
      "step": 273
    },
    {
      "epoch": 1.02,
      "grad_norm": 7.927298069000244,
      "learning_rate": 3.656716417910448e-05,
      "loss": 0.847,
      "step": 280
    },
    {
      "epoch": 1.06,
      "grad_norm": 9.951263427734375,
      "learning_rate": 3.58887381275441e-05,
      "loss": 0.8189,
      "step": 290
    },
    {
      "epoch": 1.1,
      "grad_norm": 7.741189479827881,
      "learning_rate": 3.521031207598372e-05,
      "loss": 0.804,
      "step": 300
    },
    {
      "epoch": 1.13,
      "grad_norm": 8.110347747802734,
      "learning_rate": 3.453188602442334e-05,
      "loss": 0.7829,
      "step": 310
    },
    {
      "epoch": 1.17,
      "grad_norm": 8.17768669128418,
      "learning_rate": 3.385345997286296e-05,
      "loss": 0.8124,
      "step": 320
    },
    {
      "epoch": 1.21,
      "grad_norm": 8.95181655883789,
      "learning_rate": 3.3175033921302575e-05,
      "loss": 0.7392,
      "step": 330
    },
    {
      "epoch": 1.24,
      "grad_norm": 9.440756797790527,
      "learning_rate": 3.24966078697422e-05,
      "loss": 0.8488,
      "step": 340
    },
    {
      "epoch": 1.28,
      "grad_norm": 8.526937484741211,
      "learning_rate": 3.181818181818182e-05,
      "loss": 0.7525,
      "step": 350
    },
    {
      "epoch": 1.32,
      "grad_norm": 8.15047836303711,
      "learning_rate": 3.113975576662144e-05,
      "loss": 0.7966,
      "step": 360
    },
    {
      "epoch": 1.35,
      "grad_norm": 8.909910202026367,
      "learning_rate": 3.046132971506106e-05,
      "loss": 0.7682,
      "step": 370
    },
    {
      "epoch": 1.39,
      "grad_norm": 9.869742393493652,
      "learning_rate": 2.9782903663500678e-05,
      "loss": 0.7143,
      "step": 380
    },
    {
      "epoch": 1.43,
      "grad_norm": 9.056320190429688,
      "learning_rate": 2.91044776119403e-05,
      "loss": 0.7467,
      "step": 390
    },
    {
      "epoch": 1.46,
      "grad_norm": 8.215534210205078,
      "learning_rate": 2.842605156037992e-05,
      "loss": 0.7589,
      "step": 400
    },
    {
      "epoch": 1.5,
      "grad_norm": 8.56048583984375,
      "learning_rate": 2.7747625508819542e-05,
      "loss": 0.786,
      "step": 410
    },
    {
      "epoch": 1.54,
      "grad_norm": 7.839065074920654,
      "learning_rate": 2.7069199457259158e-05,
      "loss": 0.7033,
      "step": 420
    },
    {
      "epoch": 1.57,
      "grad_norm": 8.194784164428711,
      "learning_rate": 2.639077340569878e-05,
      "loss": 0.6867,
      "step": 430
    },
    {
      "epoch": 1.61,
      "grad_norm": 7.5709547996521,
      "learning_rate": 2.57123473541384e-05,
      "loss": 0.7555,
      "step": 440
    },
    {
      "epoch": 1.65,
      "grad_norm": 8.456563949584961,
      "learning_rate": 2.5033921302578023e-05,
      "loss": 0.6939,
      "step": 450
    },
    {
      "epoch": 1.68,
      "grad_norm": 8.453351020812988,
      "learning_rate": 2.4355495251017642e-05,
      "loss": 0.6965,
      "step": 460
    },
    {
      "epoch": 1.72,
      "grad_norm": 7.400697231292725,
      "learning_rate": 2.367706919945726e-05,
      "loss": 0.7067,
      "step": 470
    },
    {
      "epoch": 1.76,
      "grad_norm": 8.554298400878906,
      "learning_rate": 2.299864314789688e-05,
      "loss": 0.712,
      "step": 480
    },
    {
      "epoch": 1.79,
      "grad_norm": 8.703104972839355,
      "learning_rate": 2.2320217096336503e-05,
      "loss": 0.7176,
      "step": 490
    },
    {
      "epoch": 1.83,
      "grad_norm": 8.274211883544922,
      "learning_rate": 2.164179104477612e-05,
      "loss": 0.7214,
      "step": 500
    },
    {
      "epoch": 1.86,
      "grad_norm": 10.009110450744629,
      "learning_rate": 2.0963364993215738e-05,
      "loss": 0.7518,
      "step": 510
    },
    {
      "epoch": 1.9,
      "grad_norm": 9.217292785644531,
      "learning_rate": 2.028493894165536e-05,
      "loss": 0.7125,
      "step": 520
    },
    {
      "epoch": 1.94,
      "grad_norm": 7.580327987670898,
      "learning_rate": 1.960651289009498e-05,
      "loss": 0.6731,
      "step": 530
    },
    {
      "epoch": 1.97,
      "grad_norm": 7.33704137802124,
      "learning_rate": 1.89280868385346e-05,
      "loss": 0.6924,
      "step": 540
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.8956,
      "eval_loss": 0.3149263858795166,
      "eval_runtime": 128.5203,
      "eval_samples_per_second": 116.713,
      "eval_steps_per_second": 3.649,
      "step": 547
    },
    {
      "epoch": 2.01,
      "grad_norm": 8.008723258972168,
      "learning_rate": 1.824966078697422e-05,
      "loss": 0.7104,
      "step": 550
    },
    {
      "epoch": 2.05,
      "grad_norm": 6.283710479736328,
      "learning_rate": 1.757123473541384e-05,
      "loss": 0.6671,
      "step": 560
    },
    {
      "epoch": 2.08,
      "grad_norm": 7.36352014541626,
      "learning_rate": 1.689280868385346e-05,
      "loss": 0.6464,
      "step": 570
    },
    {
      "epoch": 2.12,
      "grad_norm": 8.875202178955078,
      "learning_rate": 1.6214382632293083e-05,
      "loss": 0.6495,
      "step": 580
    },
    {
      "epoch": 2.16,
      "grad_norm": 8.145440101623535,
      "learning_rate": 1.55359565807327e-05,
      "loss": 0.6609,
      "step": 590
    },
    {
      "epoch": 2.19,
      "grad_norm": 8.108186721801758,
      "learning_rate": 1.485753052917232e-05,
      "loss": 0.7062,
      "step": 600
    },
    {
      "epoch": 2.23,
      "grad_norm": 7.363587856292725,
      "learning_rate": 1.417910447761194e-05,
      "loss": 0.6395,
      "step": 610
    },
    {
      "epoch": 2.27,
      "grad_norm": 7.521917343139648,
      "learning_rate": 1.3500678426051561e-05,
      "loss": 0.657,
      "step": 620
    },
    {
      "epoch": 2.3,
      "grad_norm": 7.513711452484131,
      "learning_rate": 1.282225237449118e-05,
      "loss": 0.6638,
      "step": 630
    },
    {
      "epoch": 2.34,
      "grad_norm": 8.112990379333496,
      "learning_rate": 1.2143826322930801e-05,
      "loss": 0.6968,
      "step": 640
    },
    {
      "epoch": 2.38,
      "grad_norm": 7.811177730560303,
      "learning_rate": 1.1465400271370422e-05,
      "loss": 0.7007,
      "step": 650
    },
    {
      "epoch": 2.41,
      "grad_norm": 7.438766956329346,
      "learning_rate": 1.0786974219810041e-05,
      "loss": 0.6458,
      "step": 660
    },
    {
      "epoch": 2.45,
      "grad_norm": 9.478456497192383,
      "learning_rate": 1.010854816824966e-05,
      "loss": 0.6534,
      "step": 670
    },
    {
      "epoch": 2.49,
      "grad_norm": 7.470586776733398,
      "learning_rate": 9.430122116689281e-06,
      "loss": 0.6625,
      "step": 680
    },
    {
      "epoch": 2.52,
      "grad_norm": 7.3036956787109375,
      "learning_rate": 8.751696065128902e-06,
      "loss": 0.6581,
      "step": 690
    },
    {
      "epoch": 2.56,
      "grad_norm": 7.530010223388672,
      "learning_rate": 8.073270013568522e-06,
      "loss": 0.6214,
      "step": 700
    },
    {
      "epoch": 2.6,
      "grad_norm": 6.2767333984375,
      "learning_rate": 7.394843962008141e-06,
      "loss": 0.6136,
      "step": 710
    },
    {
      "epoch": 2.63,
      "grad_norm": 8.302584648132324,
      "learning_rate": 6.716417910447762e-06,
      "loss": 0.6215,
      "step": 720
    },
    {
      "epoch": 2.67,
      "grad_norm": 7.744215965270996,
      "learning_rate": 6.037991858887382e-06,
      "loss": 0.6382,
      "step": 730
    },
    {
      "epoch": 2.71,
      "grad_norm": 7.538488864898682,
      "learning_rate": 5.359565807327002e-06,
      "loss": 0.6239,
      "step": 740
    },
    {
      "epoch": 2.74,
      "grad_norm": 7.9057488441467285,
      "learning_rate": 4.681139755766622e-06,
      "loss": 0.6413,
      "step": 750
    },
    {
      "epoch": 2.78,
      "grad_norm": 6.213993549346924,
      "learning_rate": 4.002713704206242e-06,
      "loss": 0.6153,
      "step": 760
    },
    {
      "epoch": 2.82,
      "grad_norm": 6.704429626464844,
      "learning_rate": 3.324287652645862e-06,
      "loss": 0.6502,
      "step": 770
    },
    {
      "epoch": 2.85,
      "grad_norm": 10.968541145324707,
      "learning_rate": 2.645861601085482e-06,
      "loss": 0.6439,
      "step": 780
    },
    {
      "epoch": 2.89,
      "grad_norm": 8.878872871398926,
      "learning_rate": 1.967435549525102e-06,
      "loss": 0.6751,
      "step": 790
    },
    {
      "epoch": 2.93,
      "grad_norm": 8.374876022338867,
      "learning_rate": 1.289009497964722e-06,
      "loss": 0.6586,
      "step": 800
    },
    {
      "epoch": 2.96,
      "grad_norm": 8.15294075012207,
      "learning_rate": 6.10583446404342e-07,
      "loss": 0.6714,
      "step": 810
    },
    {
      "epoch": 2.99,
      "eval_accuracy": 0.9064,
      "eval_loss": 0.2777732014656067,
      "eval_runtime": 127.5667,
      "eval_samples_per_second": 117.586,
      "eval_steps_per_second": 3.677,
      "step": 819
    },
    {
      "epoch": 2.99,
      "step": 819,
      "total_flos": 1.0585264325663785e+18,
      "train_loss": 0.9103804560371371,
      "train_runtime": 753.233,
      "train_samples_per_second": 139.399,
      "train_steps_per_second": 1.087
    }
  ],
  "logging_steps": 10,
  "max_steps": 819,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 1.0585264325663785e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}