{ "best_metric": 0.6129801869392395, "best_model_checkpoint": "Action_model/checkpoint-300", "epoch": 10.0, "eval_steps": 100, "global_step": 2680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 2.570383071899414, "learning_rate": 9.96268656716418e-05, "loss": 0.1841, "step": 10 }, { "epoch": 0.07, "grad_norm": 6.266295433044434, "learning_rate": 9.925373134328359e-05, "loss": 0.2301, "step": 20 }, { "epoch": 0.11, "grad_norm": 8.001986503601074, "learning_rate": 9.888059701492539e-05, "loss": 0.2533, "step": 30 }, { "epoch": 0.15, "grad_norm": 5.319194316864014, "learning_rate": 9.850746268656717e-05, "loss": 0.2436, "step": 40 }, { "epoch": 0.19, "grad_norm": 0.9653372764587402, "learning_rate": 9.813432835820896e-05, "loss": 0.3712, "step": 50 }, { "epoch": 0.22, "grad_norm": 7.348043441772461, "learning_rate": 9.776119402985075e-05, "loss": 0.3645, "step": 60 }, { "epoch": 0.26, "grad_norm": 2.1969542503356934, "learning_rate": 9.738805970149254e-05, "loss": 0.4609, "step": 70 }, { "epoch": 0.3, "grad_norm": 6.397550106048584, "learning_rate": 9.701492537313434e-05, "loss": 0.4755, "step": 80 }, { "epoch": 0.34, "grad_norm": 6.923007488250732, "learning_rate": 9.664179104477612e-05, "loss": 0.3901, "step": 90 }, { "epoch": 0.37, "grad_norm": 4.786198616027832, "learning_rate": 9.626865671641792e-05, "loss": 0.255, "step": 100 }, { "epoch": 0.37, "eval_accuracy": 0.7926186291739895, "eval_loss": 0.7616190314292908, "eval_runtime": 8.7209, "eval_samples_per_second": 65.245, "eval_steps_per_second": 8.256, "step": 100 }, { "epoch": 0.41, "grad_norm": 8.368223190307617, "learning_rate": 9.58955223880597e-05, "loss": 0.3784, "step": 110 }, { "epoch": 0.45, "grad_norm": 4.078306198120117, "learning_rate": 9.552238805970149e-05, "loss": 0.4148, "step": 120 }, { "epoch": 0.49, "grad_norm": 7.815361022949219, "learning_rate": 9.514925373134329e-05, "loss": 0.3621, "step": 130 }, { "epoch": 0.52, "grad_norm": 11.498431205749512, "learning_rate": 9.477611940298507e-05, "loss": 0.3974, "step": 140 }, { "epoch": 0.56, "grad_norm": 7.946558952331543, "learning_rate": 9.440298507462687e-05, "loss": 0.3856, "step": 150 }, { "epoch": 0.6, "grad_norm": 0.3486919403076172, "learning_rate": 9.402985074626867e-05, "loss": 0.2435, "step": 160 }, { "epoch": 0.63, "grad_norm": 4.267444133758545, "learning_rate": 9.365671641791045e-05, "loss": 0.3736, "step": 170 }, { "epoch": 0.67, "grad_norm": 3.022345542907715, "learning_rate": 9.328358208955224e-05, "loss": 0.439, "step": 180 }, { "epoch": 0.71, "grad_norm": 5.57196044921875, "learning_rate": 9.291044776119402e-05, "loss": 0.2996, "step": 190 }, { "epoch": 0.75, "grad_norm": 2.636216640472412, "learning_rate": 9.253731343283582e-05, "loss": 0.2048, "step": 200 }, { "epoch": 0.75, "eval_accuracy": 0.8084358523725835, "eval_loss": 0.724670946598053, "eval_runtime": 8.4461, "eval_samples_per_second": 67.368, "eval_steps_per_second": 8.525, "step": 200 }, { "epoch": 0.78, "grad_norm": 1.615098237991333, "learning_rate": 9.216417910447762e-05, "loss": 0.3594, "step": 210 }, { "epoch": 0.82, "grad_norm": 9.315821647644043, "learning_rate": 9.17910447761194e-05, "loss": 0.3046, "step": 220 }, { "epoch": 0.86, "grad_norm": 3.669430732727051, "learning_rate": 9.14179104477612e-05, "loss": 0.4158, "step": 230 }, { "epoch": 0.9, "grad_norm": 7.0882978439331055, "learning_rate": 9.104477611940299e-05, "loss": 0.3477, "step": 240 }, { "epoch": 0.93, "grad_norm": 1.1667325496673584, "learning_rate": 9.067164179104479e-05, "loss": 0.316, "step": 250 }, { "epoch": 0.97, "grad_norm": 1.482625961303711, "learning_rate": 9.029850746268657e-05, "loss": 0.3922, "step": 260 }, { "epoch": 1.01, "grad_norm": 0.20793116092681885, "learning_rate": 8.992537313432836e-05, "loss": 0.3751, "step": 270 }, { "epoch": 1.04, "grad_norm": 6.772298812866211, "learning_rate": 8.955223880597016e-05, "loss": 0.3269, "step": 280 }, { "epoch": 1.08, "grad_norm": 5.833349227905273, "learning_rate": 8.917910447761194e-05, "loss": 0.3026, "step": 290 }, { "epoch": 1.12, "grad_norm": 6.349458694458008, "learning_rate": 8.880597014925374e-05, "loss": 0.3763, "step": 300 }, { "epoch": 1.12, "eval_accuracy": 0.8330404217926186, "eval_loss": 0.6129801869392395, "eval_runtime": 8.4095, "eval_samples_per_second": 67.661, "eval_steps_per_second": 8.562, "step": 300 }, { "epoch": 1.16, "grad_norm": 4.767229080200195, "learning_rate": 8.843283582089554e-05, "loss": 0.3808, "step": 310 }, { "epoch": 1.19, "grad_norm": 12.675297737121582, "learning_rate": 8.805970149253732e-05, "loss": 0.3766, "step": 320 }, { "epoch": 1.23, "grad_norm": 3.8118245601654053, "learning_rate": 8.76865671641791e-05, "loss": 0.2642, "step": 330 }, { "epoch": 1.27, "grad_norm": 8.736045837402344, "learning_rate": 8.731343283582089e-05, "loss": 0.3041, "step": 340 }, { "epoch": 1.31, "grad_norm": 6.683359146118164, "learning_rate": 8.694029850746269e-05, "loss": 0.1352, "step": 350 }, { "epoch": 1.34, "grad_norm": 4.780521392822266, "learning_rate": 8.656716417910447e-05, "loss": 0.4005, "step": 360 }, { "epoch": 1.38, "grad_norm": 9.654714584350586, "learning_rate": 8.619402985074627e-05, "loss": 0.3646, "step": 370 }, { "epoch": 1.42, "grad_norm": 4.174666881561279, "learning_rate": 8.582089552238807e-05, "loss": 0.2353, "step": 380 }, { "epoch": 1.46, "grad_norm": 7.596667289733887, "learning_rate": 8.548507462686568e-05, "loss": 0.3991, "step": 390 }, { "epoch": 1.49, "grad_norm": 5.592709064483643, "learning_rate": 8.511194029850747e-05, "loss": 0.307, "step": 400 }, { "epoch": 1.49, "eval_accuracy": 0.789103690685413, "eval_loss": 0.8137023448944092, "eval_runtime": 8.3292, "eval_samples_per_second": 68.314, "eval_steps_per_second": 8.644, "step": 400 }, { "epoch": 1.53, "grad_norm": 2.232590675354004, "learning_rate": 8.473880597014926e-05, "loss": 0.4669, "step": 410 }, { "epoch": 1.57, "grad_norm": 4.276609897613525, "learning_rate": 8.436567164179105e-05, "loss": 0.3831, "step": 420 }, { "epoch": 1.6, "grad_norm": 7.262507915496826, "learning_rate": 8.399253731343283e-05, "loss": 0.3472, "step": 430 }, { "epoch": 1.64, "grad_norm": 7.258556365966797, "learning_rate": 8.361940298507463e-05, "loss": 0.2396, "step": 440 }, { "epoch": 1.68, "grad_norm": 4.945961952209473, "learning_rate": 8.324626865671642e-05, "loss": 0.2433, "step": 450 }, { "epoch": 1.72, "grad_norm": 5.138702392578125, "learning_rate": 8.287313432835821e-05, "loss": 0.2947, "step": 460 }, { "epoch": 1.75, "grad_norm": 1.1640909910202026, "learning_rate": 8.25e-05, "loss": 0.4791, "step": 470 }, { "epoch": 1.79, "grad_norm": 4.626485824584961, "learning_rate": 8.21268656716418e-05, "loss": 0.286, "step": 480 }, { "epoch": 1.83, "grad_norm": 5.178492069244385, "learning_rate": 8.17537313432836e-05, "loss": 0.3202, "step": 490 }, { "epoch": 1.87, "grad_norm": 7.854339122772217, "learning_rate": 8.138059701492538e-05, "loss": 0.3542, "step": 500 }, { "epoch": 1.87, "eval_accuracy": 0.8014059753954306, "eval_loss": 0.6611581444740295, "eval_runtime": 8.5853, "eval_samples_per_second": 66.276, "eval_steps_per_second": 8.386, "step": 500 }, { "epoch": 1.9, "grad_norm": 1.429740071296692, "learning_rate": 8.100746268656717e-05, "loss": 0.3039, "step": 510 }, { "epoch": 1.94, "grad_norm": 2.9776551723480225, "learning_rate": 8.063432835820895e-05, "loss": 0.3825, "step": 520 }, { "epoch": 1.98, "grad_norm": 10.557899475097656, "learning_rate": 8.026119402985075e-05, "loss": 0.5109, "step": 530 }, { "epoch": 2.01, "grad_norm": 1.448002815246582, "learning_rate": 7.988805970149255e-05, "loss": 0.3421, "step": 540 }, { "epoch": 2.05, "grad_norm": 4.500860691070557, "learning_rate": 7.951492537313433e-05, "loss": 0.3008, "step": 550 }, { "epoch": 2.09, "grad_norm": 8.077374458312988, "learning_rate": 7.914179104477613e-05, "loss": 0.27, "step": 560 }, { "epoch": 2.13, "grad_norm": 0.16809479892253876, "learning_rate": 7.876865671641792e-05, "loss": 0.2184, "step": 570 }, { "epoch": 2.16, "grad_norm": 4.892763137817383, "learning_rate": 7.83955223880597e-05, "loss": 0.1479, "step": 580 }, { "epoch": 2.2, "grad_norm": 8.35221004486084, "learning_rate": 7.80223880597015e-05, "loss": 0.3498, "step": 590 }, { "epoch": 2.24, "grad_norm": 12.043429374694824, "learning_rate": 7.764925373134328e-05, "loss": 0.3518, "step": 600 }, { "epoch": 2.24, "eval_accuracy": 0.8189806678383128, "eval_loss": 0.6964564919471741, "eval_runtime": 8.3878, "eval_samples_per_second": 67.837, "eval_steps_per_second": 8.584, "step": 600 }, { "epoch": 2.28, "grad_norm": 3.7737715244293213, "learning_rate": 7.727611940298508e-05, "loss": 0.3532, "step": 610 }, { "epoch": 2.31, "grad_norm": 4.282881736755371, "learning_rate": 7.690298507462687e-05, "loss": 0.2214, "step": 620 }, { "epoch": 2.35, "grad_norm": 6.733531475067139, "learning_rate": 7.652985074626866e-05, "loss": 0.2709, "step": 630 }, { "epoch": 2.39, "grad_norm": 2.567267417907715, "learning_rate": 7.615671641791045e-05, "loss": 0.3725, "step": 640 }, { "epoch": 2.43, "grad_norm": 3.120966911315918, "learning_rate": 7.578358208955223e-05, "loss": 0.3036, "step": 650 }, { "epoch": 2.46, "grad_norm": 6.505622386932373, "learning_rate": 7.541044776119403e-05, "loss": 0.2426, "step": 660 }, { "epoch": 2.5, "grad_norm": 4.887637615203857, "learning_rate": 7.503731343283582e-05, "loss": 0.281, "step": 670 }, { "epoch": 2.54, "grad_norm": 9.790969848632812, "learning_rate": 7.466417910447762e-05, "loss": 0.4504, "step": 680 }, { "epoch": 2.57, "grad_norm": 4.354789733886719, "learning_rate": 7.429104477611941e-05, "loss": 0.4094, "step": 690 }, { "epoch": 2.61, "grad_norm": 5.015912055969238, "learning_rate": 7.39179104477612e-05, "loss": 0.3706, "step": 700 }, { "epoch": 2.61, "eval_accuracy": 0.804920913884007, "eval_loss": 0.7254143357276917, "eval_runtime": 8.3242, "eval_samples_per_second": 68.355, "eval_steps_per_second": 8.649, "step": 700 }, { "epoch": 2.65, "grad_norm": 5.382541656494141, "learning_rate": 7.3544776119403e-05, "loss": 0.1722, "step": 710 }, { "epoch": 2.69, "grad_norm": 5.573971748352051, "learning_rate": 7.317164179104478e-05, "loss": 0.327, "step": 720 }, { "epoch": 2.72, "grad_norm": 3.5606117248535156, "learning_rate": 7.279850746268657e-05, "loss": 0.2702, "step": 730 }, { "epoch": 2.76, "grad_norm": 1.7398028373718262, "learning_rate": 7.242537313432837e-05, "loss": 0.238, "step": 740 }, { "epoch": 2.8, "grad_norm": 2.7511751651763916, "learning_rate": 7.205223880597015e-05, "loss": 0.1848, "step": 750 }, { "epoch": 2.84, "grad_norm": 3.381510019302368, "learning_rate": 7.167910447761195e-05, "loss": 0.2261, "step": 760 }, { "epoch": 2.87, "grad_norm": 4.65634298324585, "learning_rate": 7.130597014925373e-05, "loss": 0.237, "step": 770 }, { "epoch": 2.91, "grad_norm": 10.35020923614502, "learning_rate": 7.093283582089553e-05, "loss": 0.3012, "step": 780 }, { "epoch": 2.95, "grad_norm": 8.878485679626465, "learning_rate": 7.055970149253732e-05, "loss": 0.4094, "step": 790 }, { "epoch": 2.99, "grad_norm": 2.9728074073791504, "learning_rate": 7.01865671641791e-05, "loss": 0.4084, "step": 800 }, { "epoch": 2.99, "eval_accuracy": 0.8101933216168717, "eval_loss": 0.6746156811714172, "eval_runtime": 8.2718, "eval_samples_per_second": 68.788, "eval_steps_per_second": 8.704, "step": 800 }, { "epoch": 3.02, "grad_norm": 4.835368633270264, "learning_rate": 6.98134328358209e-05, "loss": 0.3152, "step": 810 }, { "epoch": 3.06, "grad_norm": 2.9197049140930176, "learning_rate": 6.944029850746268e-05, "loss": 0.3433, "step": 820 }, { "epoch": 3.1, "grad_norm": 5.646128177642822, "learning_rate": 6.906716417910448e-05, "loss": 0.2604, "step": 830 }, { "epoch": 3.13, "grad_norm": 3.860607862472534, "learning_rate": 6.869402985074627e-05, "loss": 0.2831, "step": 840 }, { "epoch": 3.17, "grad_norm": 0.1358175426721573, "learning_rate": 6.832089552238807e-05, "loss": 0.242, "step": 850 }, { "epoch": 3.21, "grad_norm": 1.1011104583740234, "learning_rate": 6.794776119402985e-05, "loss": 0.2621, "step": 860 }, { "epoch": 3.25, "grad_norm": 7.837879180908203, "learning_rate": 6.757462686567164e-05, "loss": 0.249, "step": 870 }, { "epoch": 3.28, "grad_norm": 6.8647613525390625, "learning_rate": 6.720149253731343e-05, "loss": 0.3398, "step": 880 }, { "epoch": 3.32, "grad_norm": 2.8186678886413574, "learning_rate": 6.682835820895522e-05, "loss": 0.3092, "step": 890 }, { "epoch": 3.36, "grad_norm": 4.623282432556152, "learning_rate": 6.645522388059702e-05, "loss": 0.2533, "step": 900 }, { "epoch": 3.36, "eval_accuracy": 0.8189806678383128, "eval_loss": 0.6866591572761536, "eval_runtime": 8.3143, "eval_samples_per_second": 68.436, "eval_steps_per_second": 8.66, "step": 900 }, { "epoch": 3.4, "grad_norm": 4.85120964050293, "learning_rate": 6.608208955223882e-05, "loss": 0.2279, "step": 910 }, { "epoch": 3.43, "grad_norm": 0.7263774275779724, "learning_rate": 6.57089552238806e-05, "loss": 0.1725, "step": 920 }, { "epoch": 3.47, "grad_norm": 6.813180923461914, "learning_rate": 6.53358208955224e-05, "loss": 0.3304, "step": 930 }, { "epoch": 3.51, "grad_norm": 8.58501148223877, "learning_rate": 6.496268656716418e-05, "loss": 0.1864, "step": 940 }, { "epoch": 3.54, "grad_norm": 2.814436435699463, "learning_rate": 6.458955223880597e-05, "loss": 0.1496, "step": 950 }, { "epoch": 3.58, "grad_norm": 8.36603832244873, "learning_rate": 6.421641791044777e-05, "loss": 0.208, "step": 960 }, { "epoch": 3.62, "grad_norm": 3.5715956687927246, "learning_rate": 6.384328358208955e-05, "loss": 0.2429, "step": 970 }, { "epoch": 3.66, "grad_norm": 4.983556270599365, "learning_rate": 6.347014925373135e-05, "loss": 0.4053, "step": 980 }, { "epoch": 3.69, "grad_norm": 4.936723232269287, "learning_rate": 6.309701492537313e-05, "loss": 0.1545, "step": 990 }, { "epoch": 3.73, "grad_norm": 6.59185791015625, "learning_rate": 6.272388059701493e-05, "loss": 0.3147, "step": 1000 }, { "epoch": 3.73, "eval_accuracy": 0.8189806678383128, "eval_loss": 0.7077136635780334, "eval_runtime": 8.3117, "eval_samples_per_second": 68.457, "eval_steps_per_second": 8.662, "step": 1000 }, { "epoch": 3.77, "grad_norm": 9.348366737365723, "learning_rate": 6.235074626865672e-05, "loss": 0.3634, "step": 1010 }, { "epoch": 3.81, "grad_norm": 9.918521881103516, "learning_rate": 6.19776119402985e-05, "loss": 0.3151, "step": 1020 }, { "epoch": 3.84, "grad_norm": 5.687044143676758, "learning_rate": 6.16044776119403e-05, "loss": 0.3088, "step": 1030 }, { "epoch": 3.88, "grad_norm": 3.8347887992858887, "learning_rate": 6.123134328358209e-05, "loss": 0.2128, "step": 1040 }, { "epoch": 3.92, "grad_norm": 5.380050182342529, "learning_rate": 6.0858208955223884e-05, "loss": 0.255, "step": 1050 }, { "epoch": 3.96, "grad_norm": 8.848828315734863, "learning_rate": 6.0485074626865676e-05, "loss": 0.2794, "step": 1060 }, { "epoch": 3.99, "grad_norm": 3.9666404724121094, "learning_rate": 6.011194029850746e-05, "loss": 0.1954, "step": 1070 }, { "epoch": 4.03, "grad_norm": 0.3369455635547638, "learning_rate": 5.973880597014926e-05, "loss": 0.2298, "step": 1080 }, { "epoch": 4.07, "grad_norm": 16.327823638916016, "learning_rate": 5.9365671641791044e-05, "loss": 0.2504, "step": 1090 }, { "epoch": 4.1, "grad_norm": 7.070168495178223, "learning_rate": 5.8992537313432835e-05, "loss": 0.3182, "step": 1100 }, { "epoch": 4.1, "eval_accuracy": 0.8189806678383128, "eval_loss": 0.6661401987075806, "eval_runtime": 8.2263, "eval_samples_per_second": 69.169, "eval_steps_per_second": 8.752, "step": 1100 }, { "epoch": 4.14, "grad_norm": 2.853975534439087, "learning_rate": 5.8619402985074634e-05, "loss": 0.201, "step": 1110 }, { "epoch": 4.18, "grad_norm": 0.958690881729126, "learning_rate": 5.824626865671642e-05, "loss": 0.1833, "step": 1120 }, { "epoch": 4.22, "grad_norm": 3.4794461727142334, "learning_rate": 5.787313432835822e-05, "loss": 0.2796, "step": 1130 }, { "epoch": 4.25, "grad_norm": 4.793296813964844, "learning_rate": 5.7499999999999995e-05, "loss": 0.2281, "step": 1140 }, { "epoch": 4.29, "grad_norm": 6.200154781341553, "learning_rate": 5.712686567164179e-05, "loss": 0.2814, "step": 1150 }, { "epoch": 4.33, "grad_norm": 5.616389274597168, "learning_rate": 5.675373134328359e-05, "loss": 0.1656, "step": 1160 }, { "epoch": 4.37, "grad_norm": 9.382554054260254, "learning_rate": 5.6380597014925376e-05, "loss": 0.19, "step": 1170 }, { "epoch": 4.4, "grad_norm": 3.526240587234497, "learning_rate": 5.600746268656717e-05, "loss": 0.2063, "step": 1180 }, { "epoch": 4.44, "grad_norm": 3.494896650314331, "learning_rate": 5.563432835820895e-05, "loss": 0.1681, "step": 1190 }, { "epoch": 4.48, "grad_norm": 5.764057636260986, "learning_rate": 5.526119402985075e-05, "loss": 0.2248, "step": 1200 }, { "epoch": 4.48, "eval_accuracy": 0.8418277680140598, "eval_loss": 0.6632041335105896, "eval_runtime": 8.1661, "eval_samples_per_second": 69.679, "eval_steps_per_second": 8.817, "step": 1200 }, { "epoch": 4.51, "grad_norm": 4.680635452270508, "learning_rate": 5.488805970149254e-05, "loss": 0.2179, "step": 1210 }, { "epoch": 4.55, "grad_norm": 10.24306869506836, "learning_rate": 5.451492537313433e-05, "loss": 0.2187, "step": 1220 }, { "epoch": 4.59, "grad_norm": 3.054690361022949, "learning_rate": 5.4141791044776126e-05, "loss": 0.1729, "step": 1230 }, { "epoch": 4.63, "grad_norm": 4.907272815704346, "learning_rate": 5.376865671641791e-05, "loss": 0.2762, "step": 1240 }, { "epoch": 4.66, "grad_norm": 4.774748802185059, "learning_rate": 5.33955223880597e-05, "loss": 0.1965, "step": 1250 }, { "epoch": 4.7, "grad_norm": 5.757875919342041, "learning_rate": 5.30223880597015e-05, "loss": 0.1564, "step": 1260 }, { "epoch": 4.74, "grad_norm": 0.3608088791370392, "learning_rate": 5.2649253731343286e-05, "loss": 0.0946, "step": 1270 }, { "epoch": 4.78, "grad_norm": 3.6289939880371094, "learning_rate": 5.227611940298508e-05, "loss": 0.3364, "step": 1280 }, { "epoch": 4.81, "grad_norm": 5.132009029388428, "learning_rate": 5.190298507462686e-05, "loss": 0.231, "step": 1290 }, { "epoch": 4.85, "grad_norm": 1.0347099304199219, "learning_rate": 5.152985074626866e-05, "loss": 0.1617, "step": 1300 }, { "epoch": 4.85, "eval_accuracy": 0.8172231985940246, "eval_loss": 0.7277125716209412, "eval_runtime": 8.4693, "eval_samples_per_second": 67.184, "eval_steps_per_second": 8.501, "step": 1300 }, { "epoch": 4.89, "grad_norm": 2.5996298789978027, "learning_rate": 5.115671641791045e-05, "loss": 0.385, "step": 1310 }, { "epoch": 4.93, "grad_norm": 3.724181890487671, "learning_rate": 5.078358208955224e-05, "loss": 0.1786, "step": 1320 }, { "epoch": 4.96, "grad_norm": 2.150557518005371, "learning_rate": 5.0410447761194035e-05, "loss": 0.2122, "step": 1330 }, { "epoch": 5.0, "grad_norm": 3.8813323974609375, "learning_rate": 5.003731343283582e-05, "loss": 0.2425, "step": 1340 }, { "epoch": 5.04, "grad_norm": 0.896369457244873, "learning_rate": 4.966417910447762e-05, "loss": 0.2208, "step": 1350 }, { "epoch": 5.07, "grad_norm": 9.002110481262207, "learning_rate": 4.92910447761194e-05, "loss": 0.1432, "step": 1360 }, { "epoch": 5.11, "grad_norm": 9.619662284851074, "learning_rate": 4.8917910447761195e-05, "loss": 0.1347, "step": 1370 }, { "epoch": 5.15, "grad_norm": 3.5148773193359375, "learning_rate": 4.8544776119402986e-05, "loss": 0.2837, "step": 1380 }, { "epoch": 5.19, "grad_norm": 7.631669044494629, "learning_rate": 4.817164179104478e-05, "loss": 0.1887, "step": 1390 }, { "epoch": 5.22, "grad_norm": 11.738872528076172, "learning_rate": 4.779850746268657e-05, "loss": 0.2578, "step": 1400 }, { "epoch": 5.22, "eval_accuracy": 0.8189806678383128, "eval_loss": 0.7114442586898804, "eval_runtime": 8.2672, "eval_samples_per_second": 68.826, "eval_steps_per_second": 8.709, "step": 1400 }, { "epoch": 5.26, "grad_norm": 6.67802095413208, "learning_rate": 4.742537313432836e-05, "loss": 0.2527, "step": 1410 }, { "epoch": 5.3, "grad_norm": 4.491325378417969, "learning_rate": 4.705223880597015e-05, "loss": 0.2386, "step": 1420 }, { "epoch": 5.34, "grad_norm": 1.1810379028320312, "learning_rate": 4.667910447761194e-05, "loss": 0.1693, "step": 1430 }, { "epoch": 5.37, "grad_norm": 6.075868129730225, "learning_rate": 4.6305970149253736e-05, "loss": 0.167, "step": 1440 }, { "epoch": 5.41, "grad_norm": 2.315635919570923, "learning_rate": 4.593283582089553e-05, "loss": 0.2243, "step": 1450 }, { "epoch": 5.45, "grad_norm": 10.839255332946777, "learning_rate": 4.555970149253732e-05, "loss": 0.2414, "step": 1460 }, { "epoch": 5.49, "grad_norm": 4.562304496765137, "learning_rate": 4.5186567164179104e-05, "loss": 0.264, "step": 1470 }, { "epoch": 5.52, "grad_norm": 1.8821789026260376, "learning_rate": 4.4813432835820895e-05, "loss": 0.1407, "step": 1480 }, { "epoch": 5.56, "grad_norm": 8.406396865844727, "learning_rate": 4.4440298507462694e-05, "loss": 0.1454, "step": 1490 }, { "epoch": 5.6, "grad_norm": 0.2816010117530823, "learning_rate": 4.406716417910448e-05, "loss": 0.1864, "step": 1500 }, { "epoch": 5.6, "eval_accuracy": 0.8172231985940246, "eval_loss": 0.755394458770752, "eval_runtime": 8.2598, "eval_samples_per_second": 68.888, "eval_steps_per_second": 8.717, "step": 1500 }, { "epoch": 5.63, "grad_norm": 6.619854927062988, "learning_rate": 4.369402985074627e-05, "loss": 0.2806, "step": 1510 }, { "epoch": 5.67, "grad_norm": 2.056018829345703, "learning_rate": 4.332089552238806e-05, "loss": 0.2583, "step": 1520 }, { "epoch": 5.71, "grad_norm": 0.966521680355072, "learning_rate": 4.294776119402985e-05, "loss": 0.0997, "step": 1530 }, { "epoch": 5.75, "grad_norm": 2.8261241912841797, "learning_rate": 4.2574626865671645e-05, "loss": 0.1604, "step": 1540 }, { "epoch": 5.78, "grad_norm": 3.089912176132202, "learning_rate": 4.2201492537313436e-05, "loss": 0.2775, "step": 1550 }, { "epoch": 5.82, "grad_norm": 7.935690879821777, "learning_rate": 4.182835820895523e-05, "loss": 0.2522, "step": 1560 }, { "epoch": 5.86, "grad_norm": 0.7999266982078552, "learning_rate": 4.145522388059702e-05, "loss": 0.0752, "step": 1570 }, { "epoch": 5.9, "grad_norm": 6.0712480545043945, "learning_rate": 4.1082089552238804e-05, "loss": 0.1933, "step": 1580 }, { "epoch": 5.93, "grad_norm": 10.768308639526367, "learning_rate": 4.07089552238806e-05, "loss": 0.1664, "step": 1590 }, { "epoch": 5.97, "grad_norm": 9.641716003417969, "learning_rate": 4.0335820895522394e-05, "loss": 0.3134, "step": 1600 }, { "epoch": 5.97, "eval_accuracy": 0.8154657293497364, "eval_loss": 0.7593356966972351, "eval_runtime": 8.4455, "eval_samples_per_second": 67.373, "eval_steps_per_second": 8.525, "step": 1600 }, { "epoch": 6.01, "grad_norm": 6.7538838386535645, "learning_rate": 3.996268656716418e-05, "loss": 0.1747, "step": 1610 }, { "epoch": 6.04, "grad_norm": 6.237377166748047, "learning_rate": 3.958955223880597e-05, "loss": 0.2406, "step": 1620 }, { "epoch": 6.08, "grad_norm": 7.950930118560791, "learning_rate": 3.921641791044776e-05, "loss": 0.1884, "step": 1630 }, { "epoch": 6.12, "grad_norm": 4.41484260559082, "learning_rate": 3.8843283582089554e-05, "loss": 0.1445, "step": 1640 }, { "epoch": 6.16, "grad_norm": 6.339887619018555, "learning_rate": 3.8470149253731345e-05, "loss": 0.2906, "step": 1650 }, { "epoch": 6.19, "grad_norm": 7.597599983215332, "learning_rate": 3.809701492537314e-05, "loss": 0.1576, "step": 1660 }, { "epoch": 6.23, "grad_norm": 2.379629373550415, "learning_rate": 3.772388059701493e-05, "loss": 0.2016, "step": 1670 }, { "epoch": 6.27, "grad_norm": 2.7694478034973145, "learning_rate": 3.735074626865671e-05, "loss": 0.1188, "step": 1680 }, { "epoch": 6.31, "grad_norm": 2.1837210655212402, "learning_rate": 3.6977611940298505e-05, "loss": 0.1908, "step": 1690 }, { "epoch": 6.34, "grad_norm": 4.4665350914001465, "learning_rate": 3.66044776119403e-05, "loss": 0.24, "step": 1700 }, { "epoch": 6.34, "eval_accuracy": 0.8260105448154658, "eval_loss": 0.7510848641395569, "eval_runtime": 8.2044, "eval_samples_per_second": 69.353, "eval_steps_per_second": 8.776, "step": 1700 }, { "epoch": 6.38, "grad_norm": 4.6551995277404785, "learning_rate": 3.6231343283582095e-05, "loss": 0.1631, "step": 1710 }, { "epoch": 6.42, "grad_norm": 1.098407506942749, "learning_rate": 3.585820895522388e-05, "loss": 0.0912, "step": 1720 }, { "epoch": 6.46, "grad_norm": 0.37138649821281433, "learning_rate": 3.548507462686567e-05, "loss": 0.2621, "step": 1730 }, { "epoch": 6.49, "grad_norm": 7.4571757316589355, "learning_rate": 3.511194029850746e-05, "loss": 0.268, "step": 1740 }, { "epoch": 6.53, "grad_norm": 0.5180323123931885, "learning_rate": 3.4738805970149254e-05, "loss": 0.2135, "step": 1750 }, { "epoch": 6.57, "grad_norm": 1.0866820812225342, "learning_rate": 3.4365671641791046e-05, "loss": 0.1489, "step": 1760 }, { "epoch": 6.6, "grad_norm": 8.90451717376709, "learning_rate": 3.399253731343284e-05, "loss": 0.288, "step": 1770 }, { "epoch": 6.64, "grad_norm": 1.1608803272247314, "learning_rate": 3.361940298507463e-05, "loss": 0.18, "step": 1780 }, { "epoch": 6.68, "grad_norm": 2.9207170009613037, "learning_rate": 3.3246268656716414e-05, "loss": 0.2414, "step": 1790 }, { "epoch": 6.72, "grad_norm": 0.2674783170223236, "learning_rate": 3.287313432835821e-05, "loss": 0.2359, "step": 1800 }, { "epoch": 6.72, "eval_accuracy": 0.8137082601054482, "eval_loss": 0.7501537203788757, "eval_runtime": 8.1528, "eval_samples_per_second": 69.792, "eval_steps_per_second": 8.831, "step": 1800 }, { "epoch": 6.75, "grad_norm": 8.241676330566406, "learning_rate": 3.2500000000000004e-05, "loss": 0.1975, "step": 1810 }, { "epoch": 6.79, "grad_norm": 2.0347325801849365, "learning_rate": 3.2126865671641796e-05, "loss": 0.218, "step": 1820 }, { "epoch": 6.83, "grad_norm": 1.0338706970214844, "learning_rate": 3.175373134328358e-05, "loss": 0.1437, "step": 1830 }, { "epoch": 6.87, "grad_norm": 0.34902578592300415, "learning_rate": 3.138059701492537e-05, "loss": 0.1883, "step": 1840 }, { "epoch": 6.9, "grad_norm": 6.642534255981445, "learning_rate": 3.100746268656717e-05, "loss": 0.2513, "step": 1850 }, { "epoch": 6.94, "grad_norm": 4.432920455932617, "learning_rate": 3.0634328358208955e-05, "loss": 0.1058, "step": 1860 }, { "epoch": 6.98, "grad_norm": 4.381640434265137, "learning_rate": 3.0261194029850747e-05, "loss": 0.2114, "step": 1870 }, { "epoch": 7.01, "grad_norm": 7.730411529541016, "learning_rate": 2.9888059701492538e-05, "loss": 0.2542, "step": 1880 }, { "epoch": 7.05, "grad_norm": 7.122923851013184, "learning_rate": 2.9514925373134326e-05, "loss": 0.2594, "step": 1890 }, { "epoch": 7.09, "grad_norm": 1.411278486251831, "learning_rate": 2.9141791044776125e-05, "loss": 0.2322, "step": 1900 }, { "epoch": 7.09, "eval_accuracy": 0.8347978910369068, "eval_loss": 0.6952534317970276, "eval_runtime": 8.3769, "eval_samples_per_second": 67.925, "eval_steps_per_second": 8.595, "step": 1900 }, { "epoch": 7.13, "grad_norm": 2.219285011291504, "learning_rate": 2.8768656716417913e-05, "loss": 0.1344, "step": 1910 }, { "epoch": 7.16, "grad_norm": 6.302455902099609, "learning_rate": 2.8395522388059705e-05, "loss": 0.2098, "step": 1920 }, { "epoch": 7.2, "grad_norm": 1.2837783098220825, "learning_rate": 2.8022388059701493e-05, "loss": 0.0906, "step": 1930 }, { "epoch": 7.24, "grad_norm": 6.604355335235596, "learning_rate": 2.7649253731343284e-05, "loss": 0.2352, "step": 1940 }, { "epoch": 7.28, "grad_norm": 9.916419982910156, "learning_rate": 2.727611940298508e-05, "loss": 0.1422, "step": 1950 }, { "epoch": 7.31, "grad_norm": 2.7665014266967773, "learning_rate": 2.6902985074626868e-05, "loss": 0.1722, "step": 1960 }, { "epoch": 7.35, "grad_norm": 0.24231348931789398, "learning_rate": 2.652985074626866e-05, "loss": 0.2935, "step": 1970 }, { "epoch": 7.39, "grad_norm": 0.8025885224342346, "learning_rate": 2.6156716417910447e-05, "loss": 0.157, "step": 1980 }, { "epoch": 7.43, "grad_norm": 1.6752264499664307, "learning_rate": 2.578358208955224e-05, "loss": 0.1256, "step": 1990 }, { "epoch": 7.46, "grad_norm": 2.404883861541748, "learning_rate": 2.5410447761194027e-05, "loss": 0.1514, "step": 2000 }, { "epoch": 7.46, "eval_accuracy": 0.8260105448154658, "eval_loss": 0.7120960354804993, "eval_runtime": 8.1425, "eval_samples_per_second": 69.88, "eval_steps_per_second": 8.842, "step": 2000 }, { "epoch": 7.5, "grad_norm": 5.409728050231934, "learning_rate": 2.5037313432835825e-05, "loss": 0.222, "step": 2010 }, { "epoch": 7.54, "grad_norm": 3.949014663696289, "learning_rate": 2.4664179104477614e-05, "loss": 0.245, "step": 2020 }, { "epoch": 7.57, "grad_norm": 8.40086555480957, "learning_rate": 2.4291044776119405e-05, "loss": 0.1408, "step": 2030 }, { "epoch": 7.61, "grad_norm": 7.694955348968506, "learning_rate": 2.3917910447761197e-05, "loss": 0.2072, "step": 2040 }, { "epoch": 7.65, "grad_norm": 1.9109055995941162, "learning_rate": 2.3544776119402985e-05, "loss": 0.145, "step": 2050 }, { "epoch": 7.69, "grad_norm": 12.803776741027832, "learning_rate": 2.3171641791044777e-05, "loss": 0.1274, "step": 2060 }, { "epoch": 7.72, "grad_norm": 3.3325235843658447, "learning_rate": 2.2798507462686568e-05, "loss": 0.1564, "step": 2070 }, { "epoch": 7.76, "grad_norm": 1.105327844619751, "learning_rate": 2.242537313432836e-05, "loss": 0.2008, "step": 2080 }, { "epoch": 7.8, "grad_norm": 1.7592620849609375, "learning_rate": 2.2052238805970148e-05, "loss": 0.203, "step": 2090 }, { "epoch": 7.84, "grad_norm": 0.13264060020446777, "learning_rate": 2.1679104477611943e-05, "loss": 0.2089, "step": 2100 }, { "epoch": 7.84, "eval_accuracy": 0.827768014059754, "eval_loss": 0.693087637424469, "eval_runtime": 8.2375, "eval_samples_per_second": 69.074, "eval_steps_per_second": 8.741, "step": 2100 }, { "epoch": 7.87, "grad_norm": 5.904381275177002, "learning_rate": 2.130597014925373e-05, "loss": 0.1754, "step": 2110 }, { "epoch": 7.91, "grad_norm": 1.7469266653060913, "learning_rate": 2.0932835820895526e-05, "loss": 0.1322, "step": 2120 }, { "epoch": 7.95, "grad_norm": 4.313326835632324, "learning_rate": 2.0559701492537314e-05, "loss": 0.1418, "step": 2130 }, { "epoch": 7.99, "grad_norm": 0.14211903512477875, "learning_rate": 2.0186567164179106e-05, "loss": 0.1534, "step": 2140 }, { "epoch": 8.02, "grad_norm": 5.527184009552002, "learning_rate": 1.9813432835820897e-05, "loss": 0.2122, "step": 2150 }, { "epoch": 8.06, "grad_norm": 0.2312430739402771, "learning_rate": 1.9440298507462686e-05, "loss": 0.1617, "step": 2160 }, { "epoch": 8.1, "grad_norm": 0.23949085175991058, "learning_rate": 1.906716417910448e-05, "loss": 0.1286, "step": 2170 }, { "epoch": 8.13, "grad_norm": 0.1903185099363327, "learning_rate": 1.869402985074627e-05, "loss": 0.0846, "step": 2180 }, { "epoch": 8.17, "grad_norm": 0.08518023788928986, "learning_rate": 1.832089552238806e-05, "loss": 0.0801, "step": 2190 }, { "epoch": 8.21, "grad_norm": 4.424215793609619, "learning_rate": 1.7947761194029852e-05, "loss": 0.2245, "step": 2200 }, { "epoch": 8.21, "eval_accuracy": 0.8330404217926186, "eval_loss": 0.7087014317512512, "eval_runtime": 8.1117, "eval_samples_per_second": 70.145, "eval_steps_per_second": 8.876, "step": 2200 }, { "epoch": 8.25, "grad_norm": 7.247931480407715, "learning_rate": 1.7574626865671644e-05, "loss": 0.0722, "step": 2210 }, { "epoch": 8.28, "grad_norm": 4.80264949798584, "learning_rate": 1.7201492537313435e-05, "loss": 0.0844, "step": 2220 }, { "epoch": 8.32, "grad_norm": 8.001790046691895, "learning_rate": 1.6828358208955223e-05, "loss": 0.1077, "step": 2230 }, { "epoch": 8.36, "grad_norm": 5.419641017913818, "learning_rate": 1.6455223880597015e-05, "loss": 0.1627, "step": 2240 }, { "epoch": 8.4, "grad_norm": 0.031686268746852875, "learning_rate": 1.6082089552238806e-05, "loss": 0.0984, "step": 2250 }, { "epoch": 8.43, "grad_norm": 6.095193862915039, "learning_rate": 1.5708955223880598e-05, "loss": 0.1756, "step": 2260 }, { "epoch": 8.47, "grad_norm": 5.179446220397949, "learning_rate": 1.5335820895522386e-05, "loss": 0.1708, "step": 2270 }, { "epoch": 8.51, "grad_norm": 4.06497049331665, "learning_rate": 1.496268656716418e-05, "loss": 0.1493, "step": 2280 }, { "epoch": 8.54, "grad_norm": 1.4721342325210571, "learning_rate": 1.458955223880597e-05, "loss": 0.2587, "step": 2290 }, { "epoch": 8.58, "grad_norm": 4.418783664703369, "learning_rate": 1.4216417910447763e-05, "loss": 0.1328, "step": 2300 }, { "epoch": 8.58, "eval_accuracy": 0.8312829525483304, "eval_loss": 0.700339674949646, "eval_runtime": 8.481, "eval_samples_per_second": 67.091, "eval_steps_per_second": 8.49, "step": 2300 }, { "epoch": 8.62, "grad_norm": 1.5734038352966309, "learning_rate": 1.3843283582089553e-05, "loss": 0.165, "step": 2310 }, { "epoch": 8.66, "grad_norm": 2.624784231185913, "learning_rate": 1.3470149253731342e-05, "loss": 0.0837, "step": 2320 }, { "epoch": 8.69, "grad_norm": 2.7039573192596436, "learning_rate": 1.3097014925373136e-05, "loss": 0.2098, "step": 2330 }, { "epoch": 8.73, "grad_norm": 6.542816638946533, "learning_rate": 1.2723880597014926e-05, "loss": 0.129, "step": 2340 }, { "epoch": 8.77, "grad_norm": 2.9511120319366455, "learning_rate": 1.2350746268656717e-05, "loss": 0.1762, "step": 2350 }, { "epoch": 8.81, "grad_norm": 3.435502529144287, "learning_rate": 1.1977611940298509e-05, "loss": 0.1345, "step": 2360 }, { "epoch": 8.84, "grad_norm": 2.1689364910125732, "learning_rate": 1.1604477611940299e-05, "loss": 0.1011, "step": 2370 }, { "epoch": 8.88, "grad_norm": 2.3366479873657227, "learning_rate": 1.123134328358209e-05, "loss": 0.1733, "step": 2380 }, { "epoch": 8.92, "grad_norm": 5.928171634674072, "learning_rate": 1.085820895522388e-05, "loss": 0.1089, "step": 2390 }, { "epoch": 8.96, "grad_norm": 0.08636012673377991, "learning_rate": 1.0485074626865672e-05, "loss": 0.1304, "step": 2400 }, { "epoch": 8.96, "eval_accuracy": 0.8224956063268892, "eval_loss": 0.7306046485900879, "eval_runtime": 8.4262, "eval_samples_per_second": 67.528, "eval_steps_per_second": 8.545, "step": 2400 }, { "epoch": 8.99, "grad_norm": 0.14256200194358826, "learning_rate": 1.0111940298507463e-05, "loss": 0.1506, "step": 2410 }, { "epoch": 9.03, "grad_norm": 0.4166848659515381, "learning_rate": 9.738805970149255e-06, "loss": 0.2058, "step": 2420 }, { "epoch": 9.07, "grad_norm": 0.3997032344341278, "learning_rate": 9.365671641791045e-06, "loss": 0.0482, "step": 2430 }, { "epoch": 9.1, "grad_norm": 9.076058387756348, "learning_rate": 8.992537313432836e-06, "loss": 0.2201, "step": 2440 }, { "epoch": 9.14, "grad_norm": 4.368849277496338, "learning_rate": 8.619402985074628e-06, "loss": 0.1288, "step": 2450 }, { "epoch": 9.18, "grad_norm": 4.311466693878174, "learning_rate": 8.24626865671642e-06, "loss": 0.3058, "step": 2460 }, { "epoch": 9.22, "grad_norm": 0.2911408543586731, "learning_rate": 7.87313432835821e-06, "loss": 0.1303, "step": 2470 }, { "epoch": 9.25, "grad_norm": 5.493233680725098, "learning_rate": 7.5e-06, "loss": 0.0915, "step": 2480 }, { "epoch": 9.29, "grad_norm": 0.09431172162294388, "learning_rate": 7.126865671641792e-06, "loss": 0.0954, "step": 2490 }, { "epoch": 9.33, "grad_norm": 1.8603869676589966, "learning_rate": 6.7537313432835825e-06, "loss": 0.1514, "step": 2500 }, { "epoch": 9.33, "eval_accuracy": 0.8260105448154658, "eval_loss": 0.7162156701087952, "eval_runtime": 8.3201, "eval_samples_per_second": 68.389, "eval_steps_per_second": 8.654, "step": 2500 }, { "epoch": 9.37, "grad_norm": 4.870584964752197, "learning_rate": 6.380597014925374e-06, "loss": 0.1354, "step": 2510 }, { "epoch": 9.4, "grad_norm": 2.316840410232544, "learning_rate": 6.007462686567165e-06, "loss": 0.1348, "step": 2520 }, { "epoch": 9.44, "grad_norm": 1.9005101919174194, "learning_rate": 5.6343283582089556e-06, "loss": 0.1755, "step": 2530 }, { "epoch": 9.48, "grad_norm": 0.1674620360136032, "learning_rate": 5.261194029850746e-06, "loss": 0.0878, "step": 2540 }, { "epoch": 9.51, "grad_norm": 5.729959011077881, "learning_rate": 4.888059701492537e-06, "loss": 0.1637, "step": 2550 }, { "epoch": 9.55, "grad_norm": 0.02724504843354225, "learning_rate": 4.514925373134329e-06, "loss": 0.1603, "step": 2560 }, { "epoch": 9.59, "grad_norm": 2.728663921356201, "learning_rate": 4.141791044776119e-06, "loss": 0.1152, "step": 2570 }, { "epoch": 9.63, "grad_norm": 8.920695304870605, "learning_rate": 3.7686567164179105e-06, "loss": 0.1964, "step": 2580 }, { "epoch": 9.66, "grad_norm": 2.3974239826202393, "learning_rate": 3.3955223880597013e-06, "loss": 0.0842, "step": 2590 }, { "epoch": 9.7, "grad_norm": 1.6431355476379395, "learning_rate": 3.022388059701493e-06, "loss": 0.2571, "step": 2600 }, { "epoch": 9.7, "eval_accuracy": 0.8347978910369068, "eval_loss": 0.7012546062469482, "eval_runtime": 8.3265, "eval_samples_per_second": 68.336, "eval_steps_per_second": 8.647, "step": 2600 }, { "epoch": 9.74, "grad_norm": 0.10621854662895203, "learning_rate": 2.6492537313432836e-06, "loss": 0.2632, "step": 2610 }, { "epoch": 9.78, "grad_norm": 4.150152206420898, "learning_rate": 2.2761194029850747e-06, "loss": 0.2804, "step": 2620 }, { "epoch": 9.81, "grad_norm": 4.01139497756958, "learning_rate": 1.9029850746268657e-06, "loss": 0.1696, "step": 2630 }, { "epoch": 9.85, "grad_norm": 4.7402262687683105, "learning_rate": 1.5298507462686568e-06, "loss": 0.1891, "step": 2640 }, { "epoch": 9.89, "grad_norm": 4.460111141204834, "learning_rate": 1.1567164179104478e-06, "loss": 0.1178, "step": 2650 }, { "epoch": 9.93, "grad_norm": 5.822507858276367, "learning_rate": 7.835820895522387e-07, "loss": 0.089, "step": 2660 }, { "epoch": 9.96, "grad_norm": 2.4408085346221924, "learning_rate": 4.1044776119402984e-07, "loss": 0.158, "step": 2670 }, { "epoch": 10.0, "grad_norm": 10.792135238647461, "learning_rate": 3.7313432835820895e-08, "loss": 0.2038, "step": 2680 }, { "epoch": 10.0, "step": 2680, "total_flos": 3.3230947683690086e+18, "train_loss": 0.23535207314277762, "train_runtime": 1371.8304, "train_samples_per_second": 31.258, "train_steps_per_second": 1.954 } ], "logging_steps": 10, "max_steps": 2680, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 3.3230947683690086e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }