{ "best_metric": 0.2552177309989929, "best_model_checkpoint": "./ryan_model3272024/checkpoint-600", "epoch": 3.8338658146964857, "eval_steps": 100, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 1.4023665189743042, "learning_rate": 0.00019840255591054313, "loss": 0.5486, "step": 10 }, { "epoch": 0.06, "grad_norm": 1.2863692045211792, "learning_rate": 0.00019680511182108628, "loss": 0.4543, "step": 20 }, { "epoch": 0.1, "grad_norm": 0.8842328190803528, "learning_rate": 0.0001952076677316294, "loss": 0.4222, "step": 30 }, { "epoch": 0.13, "grad_norm": 0.8728455901145935, "learning_rate": 0.00019361022364217253, "loss": 0.3764, "step": 40 }, { "epoch": 0.16, "grad_norm": 0.6641435027122498, "learning_rate": 0.00019201277955271565, "loss": 0.3214, "step": 50 }, { "epoch": 0.19, "grad_norm": 1.4344050884246826, "learning_rate": 0.0001904153354632588, "loss": 0.3286, "step": 60 }, { "epoch": 0.22, "grad_norm": 0.8919397592544556, "learning_rate": 0.00018881789137380192, "loss": 0.33, "step": 70 }, { "epoch": 0.26, "grad_norm": 1.7052876949310303, "learning_rate": 0.00018722044728434505, "loss": 0.3337, "step": 80 }, { "epoch": 0.29, "grad_norm": 0.4728272259235382, "learning_rate": 0.0001856230031948882, "loss": 0.3784, "step": 90 }, { "epoch": 0.32, "grad_norm": 1.1663854122161865, "learning_rate": 0.00018402555910543132, "loss": 0.3853, "step": 100 }, { "epoch": 0.32, "eval_loss": 0.3272034823894501, "eval_na_accuracy": 0.924, "eval_ordinal_accuracy": 0.52, "eval_ordinal_mae": 1.210578082634343, "eval_runtime": 52.9914, "eval_samples_per_second": 9.435, "eval_steps_per_second": 1.189, "step": 100 }, { "epoch": 0.35, "grad_norm": 0.8579528331756592, "learning_rate": 0.00018242811501597444, "loss": 0.3585, "step": 110 }, { "epoch": 0.38, "grad_norm": 1.02351975440979, "learning_rate": 0.00018083067092651756, "loss": 0.3621, "step": 120 }, { "epoch": 0.42, "grad_norm": 1.3286011219024658, "learning_rate": 0.00017923322683706071, "loss": 0.3714, "step": 130 }, { "epoch": 0.45, "grad_norm": 0.6290095448493958, "learning_rate": 0.00017763578274760384, "loss": 0.3275, "step": 140 }, { "epoch": 0.48, "grad_norm": 1.269338846206665, "learning_rate": 0.000176038338658147, "loss": 0.4287, "step": 150 }, { "epoch": 0.51, "grad_norm": 0.6244733333587646, "learning_rate": 0.0001744408945686901, "loss": 0.3067, "step": 160 }, { "epoch": 0.54, "grad_norm": 1.1287596225738525, "learning_rate": 0.00017284345047923323, "loss": 0.2982, "step": 170 }, { "epoch": 0.58, "grad_norm": 1.436303734779358, "learning_rate": 0.00017124600638977638, "loss": 0.2946, "step": 180 }, { "epoch": 0.61, "grad_norm": 0.8159350752830505, "learning_rate": 0.00016964856230031948, "loss": 0.3514, "step": 190 }, { "epoch": 0.64, "grad_norm": 0.7363901138305664, "learning_rate": 0.00016805111821086263, "loss": 0.3396, "step": 200 }, { "epoch": 0.64, "eval_loss": 0.27412503957748413, "eval_na_accuracy": 0.94, "eval_ordinal_accuracy": 0.5644444444444444, "eval_ordinal_mae": 1.1640199238227473, "eval_runtime": 21.3186, "eval_samples_per_second": 23.454, "eval_steps_per_second": 2.955, "step": 200 }, { "epoch": 0.67, "grad_norm": 0.6321592330932617, "learning_rate": 0.00016645367412140575, "loss": 0.3952, "step": 210 }, { "epoch": 0.7, "grad_norm": 0.6153714656829834, "learning_rate": 0.0001648562300319489, "loss": 0.2947, "step": 220 }, { "epoch": 0.73, "grad_norm": 1.3031296730041504, "learning_rate": 0.00016325878594249202, "loss": 0.3556, "step": 230 }, { "epoch": 0.77, "grad_norm": 1.058060646057129, "learning_rate": 0.00016166134185303515, "loss": 0.3432, "step": 240 }, { "epoch": 0.8, "grad_norm": 0.957135796546936, "learning_rate": 0.0001600638977635783, "loss": 0.3675, "step": 250 }, { "epoch": 0.83, "grad_norm": 1.6347941160202026, "learning_rate": 0.00015846645367412142, "loss": 0.3008, "step": 260 }, { "epoch": 0.86, "grad_norm": 1.1190528869628906, "learning_rate": 0.00015686900958466454, "loss": 0.2944, "step": 270 }, { "epoch": 0.89, "grad_norm": 0.8016924858093262, "learning_rate": 0.00015527156549520767, "loss": 0.2361, "step": 280 }, { "epoch": 0.93, "grad_norm": 1.3622130155563354, "learning_rate": 0.00015367412140575082, "loss": 0.3569, "step": 290 }, { "epoch": 0.96, "grad_norm": 0.6603774428367615, "learning_rate": 0.00015207667731629394, "loss": 0.2075, "step": 300 }, { "epoch": 0.96, "eval_loss": 0.2772314250469208, "eval_na_accuracy": 0.946, "eval_ordinal_accuracy": 0.5933333333333334, "eval_ordinal_mae": 1.194209214001894, "eval_runtime": 20.7347, "eval_samples_per_second": 24.114, "eval_steps_per_second": 3.038, "step": 300 }, { "epoch": 0.99, "grad_norm": 1.3968242406845093, "learning_rate": 0.00015047923322683706, "loss": 0.2232, "step": 310 }, { "epoch": 1.02, "grad_norm": 0.7815521359443665, "learning_rate": 0.0001488817891373802, "loss": 0.3132, "step": 320 }, { "epoch": 1.05, "grad_norm": 1.1288195848464966, "learning_rate": 0.00014728434504792333, "loss": 0.255, "step": 330 }, { "epoch": 1.09, "grad_norm": 0.7704196572303772, "learning_rate": 0.00014568690095846646, "loss": 0.2415, "step": 340 }, { "epoch": 1.12, "grad_norm": 1.9226877689361572, "learning_rate": 0.00014408945686900958, "loss": 0.1975, "step": 350 }, { "epoch": 1.15, "grad_norm": 0.5694310069084167, "learning_rate": 0.00014249201277955273, "loss": 0.1722, "step": 360 }, { "epoch": 1.18, "grad_norm": 1.719147801399231, "learning_rate": 0.00014089456869009585, "loss": 0.2175, "step": 370 }, { "epoch": 1.21, "grad_norm": 0.9247463941574097, "learning_rate": 0.000139297124600639, "loss": 0.2088, "step": 380 }, { "epoch": 1.25, "grad_norm": 1.0941154956817627, "learning_rate": 0.00013769968051118212, "loss": 0.2854, "step": 390 }, { "epoch": 1.28, "grad_norm": 1.0274015665054321, "learning_rate": 0.00013610223642172525, "loss": 0.196, "step": 400 }, { "epoch": 1.28, "eval_loss": 0.273777574300766, "eval_na_accuracy": 0.95, "eval_ordinal_accuracy": 0.6133333333333333, "eval_ordinal_mae": 1.198390154937903, "eval_runtime": 20.9145, "eval_samples_per_second": 23.907, "eval_steps_per_second": 3.012, "step": 400 }, { "epoch": 1.31, "grad_norm": 2.912687063217163, "learning_rate": 0.00013450479233226837, "loss": 0.2156, "step": 410 }, { "epoch": 1.34, "grad_norm": 0.6906268000602722, "learning_rate": 0.0001329073482428115, "loss": 0.1366, "step": 420 }, { "epoch": 1.37, "grad_norm": 0.43070048093795776, "learning_rate": 0.00013130990415335464, "loss": 0.2174, "step": 430 }, { "epoch": 1.41, "grad_norm": 0.5173763632774353, "learning_rate": 0.00012971246006389777, "loss": 0.2016, "step": 440 }, { "epoch": 1.44, "grad_norm": 1.04314386844635, "learning_rate": 0.00012811501597444092, "loss": 0.2233, "step": 450 }, { "epoch": 1.47, "grad_norm": 0.523073673248291, "learning_rate": 0.00012651757188498404, "loss": 0.2231, "step": 460 }, { "epoch": 1.5, "grad_norm": 3.259795904159546, "learning_rate": 0.00012492012779552716, "loss": 0.2366, "step": 470 }, { "epoch": 1.53, "grad_norm": 0.6846562027931213, "learning_rate": 0.00012332268370607028, "loss": 0.2144, "step": 480 }, { "epoch": 1.57, "grad_norm": 1.2122007608413696, "learning_rate": 0.00012172523961661342, "loss": 0.2938, "step": 490 }, { "epoch": 1.6, "grad_norm": 1.3790067434310913, "learning_rate": 0.00012012779552715656, "loss": 0.2228, "step": 500 }, { "epoch": 1.6, "eval_loss": 0.26852139830589294, "eval_na_accuracy": 0.956, "eval_ordinal_accuracy": 0.62, "eval_ordinal_mae": 1.1989026491012837, "eval_runtime": 20.0158, "eval_samples_per_second": 24.98, "eval_steps_per_second": 3.148, "step": 500 }, { "epoch": 1.63, "grad_norm": 0.7108421921730042, "learning_rate": 0.00011853035143769968, "loss": 0.1916, "step": 510 }, { "epoch": 1.66, "grad_norm": 0.42910462617874146, "learning_rate": 0.00011693290734824283, "loss": 0.2478, "step": 520 }, { "epoch": 1.69, "grad_norm": 0.9730465412139893, "learning_rate": 0.00011533546325878595, "loss": 0.189, "step": 530 }, { "epoch": 1.73, "grad_norm": 0.9566612243652344, "learning_rate": 0.00011373801916932908, "loss": 0.1768, "step": 540 }, { "epoch": 1.76, "grad_norm": 0.5167070627212524, "learning_rate": 0.00011214057507987221, "loss": 0.1385, "step": 550 }, { "epoch": 1.79, "grad_norm": 0.5880122780799866, "learning_rate": 0.00011054313099041533, "loss": 0.1262, "step": 560 }, { "epoch": 1.82, "grad_norm": 1.202286720275879, "learning_rate": 0.00010894568690095847, "loss": 0.1721, "step": 570 }, { "epoch": 1.85, "grad_norm": 2.6997601985931396, "learning_rate": 0.0001073482428115016, "loss": 0.2128, "step": 580 }, { "epoch": 1.88, "grad_norm": 1.1591830253601074, "learning_rate": 0.00010575079872204474, "loss": 0.2402, "step": 590 }, { "epoch": 1.92, "grad_norm": 0.5840221643447876, "learning_rate": 0.00010415335463258787, "loss": 0.1816, "step": 600 }, { "epoch": 1.92, "eval_loss": 0.2552177309989929, "eval_na_accuracy": 0.95, "eval_ordinal_accuracy": 0.6266666666666667, "eval_ordinal_mae": 1.158560517811113, "eval_runtime": 19.5011, "eval_samples_per_second": 25.64, "eval_steps_per_second": 3.231, "step": 600 }, { "epoch": 1.95, "grad_norm": 0.7560299634933472, "learning_rate": 0.000102555910543131, "loss": 0.2021, "step": 610 }, { "epoch": 1.98, "grad_norm": 1.8860361576080322, "learning_rate": 0.00010095846645367413, "loss": 0.2092, "step": 620 }, { "epoch": 2.01, "grad_norm": 0.7235255837440491, "learning_rate": 9.936102236421726e-05, "loss": 0.1131, "step": 630 }, { "epoch": 2.04, "grad_norm": 0.3656529486179352, "learning_rate": 9.77635782747604e-05, "loss": 0.0867, "step": 640 }, { "epoch": 2.08, "grad_norm": 0.3450271785259247, "learning_rate": 9.616613418530351e-05, "loss": 0.0903, "step": 650 }, { "epoch": 2.11, "grad_norm": 1.0603750944137573, "learning_rate": 9.456869009584664e-05, "loss": 0.1234, "step": 660 }, { "epoch": 2.14, "grad_norm": 0.6790297031402588, "learning_rate": 9.297124600638978e-05, "loss": 0.0936, "step": 670 }, { "epoch": 2.17, "grad_norm": 0.5596363544464111, "learning_rate": 9.137380191693292e-05, "loss": 0.0651, "step": 680 }, { "epoch": 2.2, "grad_norm": 0.5989049673080444, "learning_rate": 8.977635782747604e-05, "loss": 0.1218, "step": 690 }, { "epoch": 2.24, "grad_norm": 0.9003208875656128, "learning_rate": 8.817891373801918e-05, "loss": 0.0682, "step": 700 }, { "epoch": 2.24, "eval_loss": 0.27212005853652954, "eval_na_accuracy": 0.952, "eval_ordinal_accuracy": 0.6577777777777778, "eval_ordinal_mae": 1.1557789803379113, "eval_runtime": 19.5966, "eval_samples_per_second": 25.515, "eval_steps_per_second": 3.215, "step": 700 }, { "epoch": 2.27, "grad_norm": 0.6663013100624084, "learning_rate": 8.658146964856231e-05, "loss": 0.0714, "step": 710 }, { "epoch": 2.3, "grad_norm": 1.0458776950836182, "learning_rate": 8.498402555910544e-05, "loss": 0.102, "step": 720 }, { "epoch": 2.33, "grad_norm": 0.9246501922607422, "learning_rate": 8.338658146964856e-05, "loss": 0.1623, "step": 730 }, { "epoch": 2.36, "grad_norm": 1.0837684869766235, "learning_rate": 8.17891373801917e-05, "loss": 0.0934, "step": 740 }, { "epoch": 2.4, "grad_norm": 0.564241349697113, "learning_rate": 8.019169329073483e-05, "loss": 0.0853, "step": 750 }, { "epoch": 2.43, "grad_norm": 4.335838794708252, "learning_rate": 7.859424920127795e-05, "loss": 0.1246, "step": 760 }, { "epoch": 2.46, "grad_norm": 0.957082211971283, "learning_rate": 7.699680511182109e-05, "loss": 0.1292, "step": 770 }, { "epoch": 2.49, "grad_norm": 0.9633702039718628, "learning_rate": 7.539936102236423e-05, "loss": 0.1916, "step": 780 }, { "epoch": 2.52, "grad_norm": 0.7254676222801208, "learning_rate": 7.380191693290735e-05, "loss": 0.1054, "step": 790 }, { "epoch": 2.56, "grad_norm": 0.5885197520256042, "learning_rate": 7.220447284345049e-05, "loss": 0.0795, "step": 800 }, { "epoch": 2.56, "eval_loss": 0.2753521502017975, "eval_na_accuracy": 0.948, "eval_ordinal_accuracy": 0.6333333333333333, "eval_ordinal_mae": 1.1599188842872779, "eval_runtime": 20.0506, "eval_samples_per_second": 24.937, "eval_steps_per_second": 3.142, "step": 800 }, { "epoch": 2.59, "grad_norm": 0.5671622157096863, "learning_rate": 7.060702875399361e-05, "loss": 0.0948, "step": 810 }, { "epoch": 2.62, "grad_norm": 0.9914100766181946, "learning_rate": 6.900958466453674e-05, "loss": 0.0715, "step": 820 }, { "epoch": 2.65, "grad_norm": 0.4819205105304718, "learning_rate": 6.741214057507987e-05, "loss": 0.0839, "step": 830 }, { "epoch": 2.68, "grad_norm": 0.3811684250831604, "learning_rate": 6.5814696485623e-05, "loss": 0.0825, "step": 840 }, { "epoch": 2.72, "grad_norm": 0.9750994443893433, "learning_rate": 6.421725239616614e-05, "loss": 0.0968, "step": 850 }, { "epoch": 2.75, "grad_norm": 0.35765138268470764, "learning_rate": 6.261980830670928e-05, "loss": 0.1605, "step": 860 }, { "epoch": 2.78, "grad_norm": 0.3497343361377716, "learning_rate": 6.1022364217252406e-05, "loss": 0.0933, "step": 870 }, { "epoch": 2.81, "grad_norm": 0.4838835299015045, "learning_rate": 5.942492012779552e-05, "loss": 0.0859, "step": 880 }, { "epoch": 2.84, "grad_norm": 0.7002846002578735, "learning_rate": 5.782747603833866e-05, "loss": 0.1021, "step": 890 }, { "epoch": 2.88, "grad_norm": 2.312203884124756, "learning_rate": 5.623003194888179e-05, "loss": 0.1367, "step": 900 }, { "epoch": 2.88, "eval_loss": 0.29526129364967346, "eval_na_accuracy": 0.946, "eval_ordinal_accuracy": 0.64, "eval_ordinal_mae": 1.166716830432415, "eval_runtime": 20.0091, "eval_samples_per_second": 24.989, "eval_steps_per_second": 3.149, "step": 900 }, { "epoch": 2.91, "grad_norm": 0.44126951694488525, "learning_rate": 5.4632587859424925e-05, "loss": 0.0854, "step": 910 }, { "epoch": 2.94, "grad_norm": 1.0075191259384155, "learning_rate": 5.3035143769968054e-05, "loss": 0.0823, "step": 920 }, { "epoch": 2.97, "grad_norm": 0.9991279244422913, "learning_rate": 5.1437699680511184e-05, "loss": 0.1156, "step": 930 }, { "epoch": 3.0, "grad_norm": 0.8888081312179565, "learning_rate": 4.984025559105431e-05, "loss": 0.0876, "step": 940 }, { "epoch": 3.04, "grad_norm": 0.3761376738548279, "learning_rate": 4.824281150159744e-05, "loss": 0.0452, "step": 950 }, { "epoch": 3.07, "grad_norm": 0.365622341632843, "learning_rate": 4.664536741214058e-05, "loss": 0.0428, "step": 960 }, { "epoch": 3.1, "grad_norm": 0.35657036304473877, "learning_rate": 4.504792332268371e-05, "loss": 0.033, "step": 970 }, { "epoch": 3.13, "grad_norm": 0.5636401176452637, "learning_rate": 4.345047923322684e-05, "loss": 0.0356, "step": 980 }, { "epoch": 3.16, "grad_norm": 0.431383341550827, "learning_rate": 4.185303514376997e-05, "loss": 0.0463, "step": 990 }, { "epoch": 3.19, "grad_norm": 0.583328127861023, "learning_rate": 4.0255591054313104e-05, "loss": 0.0387, "step": 1000 }, { "epoch": 3.19, "eval_loss": 0.2923290431499481, "eval_na_accuracy": 0.944, "eval_ordinal_accuracy": 0.6377777777777778, "eval_ordinal_mae": 1.2024743282463815, "eval_runtime": 19.3226, "eval_samples_per_second": 25.876, "eval_steps_per_second": 3.26, "step": 1000 }, { "epoch": 3.23, "grad_norm": 2.440162420272827, "learning_rate": 3.8658146964856234e-05, "loss": 0.0607, "step": 1010 }, { "epoch": 3.26, "grad_norm": 0.29546236991882324, "learning_rate": 3.7060702875399364e-05, "loss": 0.0515, "step": 1020 }, { "epoch": 3.29, "grad_norm": 0.44689303636550903, "learning_rate": 3.546325878594249e-05, "loss": 0.0273, "step": 1030 }, { "epoch": 3.32, "grad_norm": 0.3288978040218353, "learning_rate": 3.386581469648562e-05, "loss": 0.0352, "step": 1040 }, { "epoch": 3.35, "grad_norm": 0.41706767678260803, "learning_rate": 3.226837060702875e-05, "loss": 0.0345, "step": 1050 }, { "epoch": 3.39, "grad_norm": 0.31060507893562317, "learning_rate": 3.067092651757188e-05, "loss": 0.0294, "step": 1060 }, { "epoch": 3.42, "grad_norm": 0.2541821599006653, "learning_rate": 2.907348242811502e-05, "loss": 0.0354, "step": 1070 }, { "epoch": 3.45, "grad_norm": 0.574343740940094, "learning_rate": 2.747603833865815e-05, "loss": 0.0443, "step": 1080 }, { "epoch": 3.48, "grad_norm": 0.47532182931900024, "learning_rate": 2.5878594249201278e-05, "loss": 0.0605, "step": 1090 }, { "epoch": 3.51, "grad_norm": 0.45276594161987305, "learning_rate": 2.428115015974441e-05, "loss": 0.0293, "step": 1100 }, { "epoch": 3.51, "eval_loss": 0.2884800434112549, "eval_na_accuracy": 0.948, "eval_ordinal_accuracy": 0.6644444444444444, "eval_ordinal_mae": 1.1666180535654227, "eval_runtime": 19.9365, "eval_samples_per_second": 25.08, "eval_steps_per_second": 3.16, "step": 1100 }, { "epoch": 3.55, "grad_norm": 0.655549168586731, "learning_rate": 2.268370607028754e-05, "loss": 0.034, "step": 1110 }, { "epoch": 3.58, "grad_norm": 0.16610193252563477, "learning_rate": 2.108626198083067e-05, "loss": 0.0319, "step": 1120 }, { "epoch": 3.61, "grad_norm": 0.26889652013778687, "learning_rate": 1.9488817891373803e-05, "loss": 0.0479, "step": 1130 }, { "epoch": 3.64, "grad_norm": 0.2418793886899948, "learning_rate": 1.7891373801916932e-05, "loss": 0.0322, "step": 1140 }, { "epoch": 3.67, "grad_norm": 0.5379694104194641, "learning_rate": 1.6293929712460065e-05, "loss": 0.0393, "step": 1150 }, { "epoch": 3.71, "grad_norm": 0.19815516471862793, "learning_rate": 1.4696485623003195e-05, "loss": 0.0217, "step": 1160 }, { "epoch": 3.74, "grad_norm": 0.889312207698822, "learning_rate": 1.3099041533546328e-05, "loss": 0.0332, "step": 1170 }, { "epoch": 3.77, "grad_norm": 0.2865816652774811, "learning_rate": 1.1501597444089457e-05, "loss": 0.0313, "step": 1180 }, { "epoch": 3.8, "grad_norm": 0.5947129726409912, "learning_rate": 9.904153354632589e-06, "loss": 0.034, "step": 1190 }, { "epoch": 3.83, "grad_norm": 0.44885268807411194, "learning_rate": 8.306709265175718e-06, "loss": 0.0286, "step": 1200 }, { "epoch": 3.83, "eval_loss": 0.28681233525276184, "eval_na_accuracy": 0.95, "eval_ordinal_accuracy": 0.6711111111111111, "eval_ordinal_mae": 1.1625636271304554, "eval_runtime": 19.7259, "eval_samples_per_second": 25.347, "eval_steps_per_second": 3.194, "step": 1200 }, { "epoch": 3.83, "step": 1200, "total_flos": 1.4860396665534874e+18, "train_loss": 0.17935538868109385, "train_runtime": 1702.5744, "train_samples_per_second": 11.747, "train_steps_per_second": 0.735 } ], "logging_steps": 10, "max_steps": 1252, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 1.4860396665534874e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }