|
{ |
|
"best_metric": 0.2552177309989929, |
|
"best_model_checkpoint": "./ryan_model3272024/checkpoint-600", |
|
"epoch": 3.8338658146964857, |
|
"eval_steps": 100, |
|
"global_step": 1200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4023665189743042, |
|
"learning_rate": 0.00019840255591054313, |
|
"loss": 0.5486, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.2863692045211792, |
|
"learning_rate": 0.00019680511182108628, |
|
"loss": 0.4543, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8842328190803528, |
|
"learning_rate": 0.0001952076677316294, |
|
"loss": 0.4222, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8728455901145935, |
|
"learning_rate": 0.00019361022364217253, |
|
"loss": 0.3764, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6641435027122498, |
|
"learning_rate": 0.00019201277955271565, |
|
"loss": 0.3214, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.4344050884246826, |
|
"learning_rate": 0.0001904153354632588, |
|
"loss": 0.3286, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8919397592544556, |
|
"learning_rate": 0.00018881789137380192, |
|
"loss": 0.33, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.7052876949310303, |
|
"learning_rate": 0.00018722044728434505, |
|
"loss": 0.3337, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.4728272259235382, |
|
"learning_rate": 0.0001856230031948882, |
|
"loss": 0.3784, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1663854122161865, |
|
"learning_rate": 0.00018402555910543132, |
|
"loss": 0.3853, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.3272034823894501, |
|
"eval_na_accuracy": 0.924, |
|
"eval_ordinal_accuracy": 0.52, |
|
"eval_ordinal_mae": 1.210578082634343, |
|
"eval_runtime": 52.9914, |
|
"eval_samples_per_second": 9.435, |
|
"eval_steps_per_second": 1.189, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8579528331756592, |
|
"learning_rate": 0.00018242811501597444, |
|
"loss": 0.3585, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.02351975440979, |
|
"learning_rate": 0.00018083067092651756, |
|
"loss": 0.3621, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.3286011219024658, |
|
"learning_rate": 0.00017923322683706071, |
|
"loss": 0.3714, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6290095448493958, |
|
"learning_rate": 0.00017763578274760384, |
|
"loss": 0.3275, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.269338846206665, |
|
"learning_rate": 0.000176038338658147, |
|
"loss": 0.4287, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6244733333587646, |
|
"learning_rate": 0.0001744408945686901, |
|
"loss": 0.3067, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1287596225738525, |
|
"learning_rate": 0.00017284345047923323, |
|
"loss": 0.2982, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.436303734779358, |
|
"learning_rate": 0.00017124600638977638, |
|
"loss": 0.2946, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8159350752830505, |
|
"learning_rate": 0.00016964856230031948, |
|
"loss": 0.3514, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7363901138305664, |
|
"learning_rate": 0.00016805111821086263, |
|
"loss": 0.3396, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.27412503957748413, |
|
"eval_na_accuracy": 0.94, |
|
"eval_ordinal_accuracy": 0.5644444444444444, |
|
"eval_ordinal_mae": 1.1640199238227473, |
|
"eval_runtime": 21.3186, |
|
"eval_samples_per_second": 23.454, |
|
"eval_steps_per_second": 2.955, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.6321592330932617, |
|
"learning_rate": 0.00016645367412140575, |
|
"loss": 0.3952, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6153714656829834, |
|
"learning_rate": 0.0001648562300319489, |
|
"loss": 0.2947, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.3031296730041504, |
|
"learning_rate": 0.00016325878594249202, |
|
"loss": 0.3556, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.058060646057129, |
|
"learning_rate": 0.00016166134185303515, |
|
"loss": 0.3432, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.957135796546936, |
|
"learning_rate": 0.0001600638977635783, |
|
"loss": 0.3675, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.6347941160202026, |
|
"learning_rate": 0.00015846645367412142, |
|
"loss": 0.3008, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.1190528869628906, |
|
"learning_rate": 0.00015686900958466454, |
|
"loss": 0.2944, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.8016924858093262, |
|
"learning_rate": 0.00015527156549520767, |
|
"loss": 0.2361, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.3622130155563354, |
|
"learning_rate": 0.00015367412140575082, |
|
"loss": 0.3569, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6603774428367615, |
|
"learning_rate": 0.00015207667731629394, |
|
"loss": 0.2075, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.2772314250469208, |
|
"eval_na_accuracy": 0.946, |
|
"eval_ordinal_accuracy": 0.5933333333333334, |
|
"eval_ordinal_mae": 1.194209214001894, |
|
"eval_runtime": 20.7347, |
|
"eval_samples_per_second": 24.114, |
|
"eval_steps_per_second": 3.038, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.3968242406845093, |
|
"learning_rate": 0.00015047923322683706, |
|
"loss": 0.2232, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.7815521359443665, |
|
"learning_rate": 0.0001488817891373802, |
|
"loss": 0.3132, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 1.1288195848464966, |
|
"learning_rate": 0.00014728434504792333, |
|
"loss": 0.255, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.7704196572303772, |
|
"learning_rate": 0.00014568690095846646, |
|
"loss": 0.2415, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.9226877689361572, |
|
"learning_rate": 0.00014408945686900958, |
|
"loss": 0.1975, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.5694310069084167, |
|
"learning_rate": 0.00014249201277955273, |
|
"loss": 0.1722, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.719147801399231, |
|
"learning_rate": 0.00014089456869009585, |
|
"loss": 0.2175, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.9247463941574097, |
|
"learning_rate": 0.000139297124600639, |
|
"loss": 0.2088, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.0941154956817627, |
|
"learning_rate": 0.00013769968051118212, |
|
"loss": 0.2854, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.0274015665054321, |
|
"learning_rate": 0.00013610223642172525, |
|
"loss": 0.196, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_loss": 0.273777574300766, |
|
"eval_na_accuracy": 0.95, |
|
"eval_ordinal_accuracy": 0.6133333333333333, |
|
"eval_ordinal_mae": 1.198390154937903, |
|
"eval_runtime": 20.9145, |
|
"eval_samples_per_second": 23.907, |
|
"eval_steps_per_second": 3.012, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.912687063217163, |
|
"learning_rate": 0.00013450479233226837, |
|
"loss": 0.2156, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.6906268000602722, |
|
"learning_rate": 0.0001329073482428115, |
|
"loss": 0.1366, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.43070048093795776, |
|
"learning_rate": 0.00013130990415335464, |
|
"loss": 0.2174, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.5173763632774353, |
|
"learning_rate": 0.00012971246006389777, |
|
"loss": 0.2016, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.04314386844635, |
|
"learning_rate": 0.00012811501597444092, |
|
"loss": 0.2233, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.523073673248291, |
|
"learning_rate": 0.00012651757188498404, |
|
"loss": 0.2231, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 3.259795904159546, |
|
"learning_rate": 0.00012492012779552716, |
|
"loss": 0.2366, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.6846562027931213, |
|
"learning_rate": 0.00012332268370607028, |
|
"loss": 0.2144, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.2122007608413696, |
|
"learning_rate": 0.00012172523961661342, |
|
"loss": 0.2938, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.3790067434310913, |
|
"learning_rate": 0.00012012779552715656, |
|
"loss": 0.2228, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.26852139830589294, |
|
"eval_na_accuracy": 0.956, |
|
"eval_ordinal_accuracy": 0.62, |
|
"eval_ordinal_mae": 1.1989026491012837, |
|
"eval_runtime": 20.0158, |
|
"eval_samples_per_second": 24.98, |
|
"eval_steps_per_second": 3.148, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.7108421921730042, |
|
"learning_rate": 0.00011853035143769968, |
|
"loss": 0.1916, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.42910462617874146, |
|
"learning_rate": 0.00011693290734824283, |
|
"loss": 0.2478, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.9730465412139893, |
|
"learning_rate": 0.00011533546325878595, |
|
"loss": 0.189, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.9566612243652344, |
|
"learning_rate": 0.00011373801916932908, |
|
"loss": 0.1768, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.5167070627212524, |
|
"learning_rate": 0.00011214057507987221, |
|
"loss": 0.1385, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.5880122780799866, |
|
"learning_rate": 0.00011054313099041533, |
|
"loss": 0.1262, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.202286720275879, |
|
"learning_rate": 0.00010894568690095847, |
|
"loss": 0.1721, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.6997601985931396, |
|
"learning_rate": 0.0001073482428115016, |
|
"loss": 0.2128, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.1591830253601074, |
|
"learning_rate": 0.00010575079872204474, |
|
"loss": 0.2402, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.5840221643447876, |
|
"learning_rate": 0.00010415335463258787, |
|
"loss": 0.1816, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 0.2552177309989929, |
|
"eval_na_accuracy": 0.95, |
|
"eval_ordinal_accuracy": 0.6266666666666667, |
|
"eval_ordinal_mae": 1.158560517811113, |
|
"eval_runtime": 19.5011, |
|
"eval_samples_per_second": 25.64, |
|
"eval_steps_per_second": 3.231, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.7560299634933472, |
|
"learning_rate": 0.000102555910543131, |
|
"loss": 0.2021, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.8860361576080322, |
|
"learning_rate": 0.00010095846645367413, |
|
"loss": 0.2092, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.7235255837440491, |
|
"learning_rate": 9.936102236421726e-05, |
|
"loss": 0.1131, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.3656529486179352, |
|
"learning_rate": 9.77635782747604e-05, |
|
"loss": 0.0867, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.3450271785259247, |
|
"learning_rate": 9.616613418530351e-05, |
|
"loss": 0.0903, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 1.0603750944137573, |
|
"learning_rate": 9.456869009584664e-05, |
|
"loss": 0.1234, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.6790297031402588, |
|
"learning_rate": 9.297124600638978e-05, |
|
"loss": 0.0936, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.5596363544464111, |
|
"learning_rate": 9.137380191693292e-05, |
|
"loss": 0.0651, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.5989049673080444, |
|
"learning_rate": 8.977635782747604e-05, |
|
"loss": 0.1218, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.9003208875656128, |
|
"learning_rate": 8.817891373801918e-05, |
|
"loss": 0.0682, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_loss": 0.27212005853652954, |
|
"eval_na_accuracy": 0.952, |
|
"eval_ordinal_accuracy": 0.6577777777777778, |
|
"eval_ordinal_mae": 1.1557789803379113, |
|
"eval_runtime": 19.5966, |
|
"eval_samples_per_second": 25.515, |
|
"eval_steps_per_second": 3.215, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.6663013100624084, |
|
"learning_rate": 8.658146964856231e-05, |
|
"loss": 0.0714, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.0458776950836182, |
|
"learning_rate": 8.498402555910544e-05, |
|
"loss": 0.102, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.9246501922607422, |
|
"learning_rate": 8.338658146964856e-05, |
|
"loss": 0.1623, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.0837684869766235, |
|
"learning_rate": 8.17891373801917e-05, |
|
"loss": 0.0934, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.564241349697113, |
|
"learning_rate": 8.019169329073483e-05, |
|
"loss": 0.0853, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 4.335838794708252, |
|
"learning_rate": 7.859424920127795e-05, |
|
"loss": 0.1246, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.957082211971283, |
|
"learning_rate": 7.699680511182109e-05, |
|
"loss": 0.1292, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.9633702039718628, |
|
"learning_rate": 7.539936102236423e-05, |
|
"loss": 0.1916, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.7254676222801208, |
|
"learning_rate": 7.380191693290735e-05, |
|
"loss": 0.1054, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.5885197520256042, |
|
"learning_rate": 7.220447284345049e-05, |
|
"loss": 0.0795, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_loss": 0.2753521502017975, |
|
"eval_na_accuracy": 0.948, |
|
"eval_ordinal_accuracy": 0.6333333333333333, |
|
"eval_ordinal_mae": 1.1599188842872779, |
|
"eval_runtime": 20.0506, |
|
"eval_samples_per_second": 24.937, |
|
"eval_steps_per_second": 3.142, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.5671622157096863, |
|
"learning_rate": 7.060702875399361e-05, |
|
"loss": 0.0948, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.9914100766181946, |
|
"learning_rate": 6.900958466453674e-05, |
|
"loss": 0.0715, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.4819205105304718, |
|
"learning_rate": 6.741214057507987e-05, |
|
"loss": 0.0839, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.3811684250831604, |
|
"learning_rate": 6.5814696485623e-05, |
|
"loss": 0.0825, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.9750994443893433, |
|
"learning_rate": 6.421725239616614e-05, |
|
"loss": 0.0968, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.35765138268470764, |
|
"learning_rate": 6.261980830670928e-05, |
|
"loss": 0.1605, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.3497343361377716, |
|
"learning_rate": 6.1022364217252406e-05, |
|
"loss": 0.0933, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.4838835299015045, |
|
"learning_rate": 5.942492012779552e-05, |
|
"loss": 0.0859, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.7002846002578735, |
|
"learning_rate": 5.782747603833866e-05, |
|
"loss": 0.1021, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.312203884124756, |
|
"learning_rate": 5.623003194888179e-05, |
|
"loss": 0.1367, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_loss": 0.29526129364967346, |
|
"eval_na_accuracy": 0.946, |
|
"eval_ordinal_accuracy": 0.64, |
|
"eval_ordinal_mae": 1.166716830432415, |
|
"eval_runtime": 20.0091, |
|
"eval_samples_per_second": 24.989, |
|
"eval_steps_per_second": 3.149, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.44126951694488525, |
|
"learning_rate": 5.4632587859424925e-05, |
|
"loss": 0.0854, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.0075191259384155, |
|
"learning_rate": 5.3035143769968054e-05, |
|
"loss": 0.0823, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.9991279244422913, |
|
"learning_rate": 5.1437699680511184e-05, |
|
"loss": 0.1156, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.8888081312179565, |
|
"learning_rate": 4.984025559105431e-05, |
|
"loss": 0.0876, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.3761376738548279, |
|
"learning_rate": 4.824281150159744e-05, |
|
"loss": 0.0452, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.365622341632843, |
|
"learning_rate": 4.664536741214058e-05, |
|
"loss": 0.0428, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.35657036304473877, |
|
"learning_rate": 4.504792332268371e-05, |
|
"loss": 0.033, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.5636401176452637, |
|
"learning_rate": 4.345047923322684e-05, |
|
"loss": 0.0356, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.431383341550827, |
|
"learning_rate": 4.185303514376997e-05, |
|
"loss": 0.0463, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.583328127861023, |
|
"learning_rate": 4.0255591054313104e-05, |
|
"loss": 0.0387, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"eval_loss": 0.2923290431499481, |
|
"eval_na_accuracy": 0.944, |
|
"eval_ordinal_accuracy": 0.6377777777777778, |
|
"eval_ordinal_mae": 1.2024743282463815, |
|
"eval_runtime": 19.3226, |
|
"eval_samples_per_second": 25.876, |
|
"eval_steps_per_second": 3.26, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 2.440162420272827, |
|
"learning_rate": 3.8658146964856234e-05, |
|
"loss": 0.0607, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.29546236991882324, |
|
"learning_rate": 3.7060702875399364e-05, |
|
"loss": 0.0515, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.44689303636550903, |
|
"learning_rate": 3.546325878594249e-05, |
|
"loss": 0.0273, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.3288978040218353, |
|
"learning_rate": 3.386581469648562e-05, |
|
"loss": 0.0352, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.41706767678260803, |
|
"learning_rate": 3.226837060702875e-05, |
|
"loss": 0.0345, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.31060507893562317, |
|
"learning_rate": 3.067092651757188e-05, |
|
"loss": 0.0294, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.2541821599006653, |
|
"learning_rate": 2.907348242811502e-05, |
|
"loss": 0.0354, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.574343740940094, |
|
"learning_rate": 2.747603833865815e-05, |
|
"loss": 0.0443, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.47532182931900024, |
|
"learning_rate": 2.5878594249201278e-05, |
|
"loss": 0.0605, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.45276594161987305, |
|
"learning_rate": 2.428115015974441e-05, |
|
"loss": 0.0293, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"eval_loss": 0.2884800434112549, |
|
"eval_na_accuracy": 0.948, |
|
"eval_ordinal_accuracy": 0.6644444444444444, |
|
"eval_ordinal_mae": 1.1666180535654227, |
|
"eval_runtime": 19.9365, |
|
"eval_samples_per_second": 25.08, |
|
"eval_steps_per_second": 3.16, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.655549168586731, |
|
"learning_rate": 2.268370607028754e-05, |
|
"loss": 0.034, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.16610193252563477, |
|
"learning_rate": 2.108626198083067e-05, |
|
"loss": 0.0319, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.26889652013778687, |
|
"learning_rate": 1.9488817891373803e-05, |
|
"loss": 0.0479, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.2418793886899948, |
|
"learning_rate": 1.7891373801916932e-05, |
|
"loss": 0.0322, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.5379694104194641, |
|
"learning_rate": 1.6293929712460065e-05, |
|
"loss": 0.0393, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.19815516471862793, |
|
"learning_rate": 1.4696485623003195e-05, |
|
"loss": 0.0217, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.889312207698822, |
|
"learning_rate": 1.3099041533546328e-05, |
|
"loss": 0.0332, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.2865816652774811, |
|
"learning_rate": 1.1501597444089457e-05, |
|
"loss": 0.0313, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.5947129726409912, |
|
"learning_rate": 9.904153354632589e-06, |
|
"loss": 0.034, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.44885268807411194, |
|
"learning_rate": 8.306709265175718e-06, |
|
"loss": 0.0286, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"eval_loss": 0.28681233525276184, |
|
"eval_na_accuracy": 0.95, |
|
"eval_ordinal_accuracy": 0.6711111111111111, |
|
"eval_ordinal_mae": 1.1625636271304554, |
|
"eval_runtime": 19.7259, |
|
"eval_samples_per_second": 25.347, |
|
"eval_steps_per_second": 3.194, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"step": 1200, |
|
"total_flos": 1.4860396665534874e+18, |
|
"train_loss": 0.17935538868109385, |
|
"train_runtime": 1702.5744, |
|
"train_samples_per_second": 11.747, |
|
"train_steps_per_second": 0.735 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1252, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"total_flos": 1.4860396665534874e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|