{ "best_metric": 0.4589254856109619, "best_model_checkpoint": "Action_model/checkpoint-1500", "epoch": 10.0, "eval_steps": 100, "global_step": 2680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.7369908094406128, "learning_rate": 9.96268656716418e-05, "loss": 2.2759, "step": 10 }, { "epoch": 0.07, "grad_norm": 1.753720998764038, "learning_rate": 9.925373134328359e-05, "loss": 2.1743, "step": 20 }, { "epoch": 0.11, "grad_norm": 1.8532754182815552, "learning_rate": 9.888059701492539e-05, "loss": 2.0233, "step": 30 }, { "epoch": 0.15, "grad_norm": 2.195688486099243, "learning_rate": 9.850746268656717e-05, "loss": 1.8293, "step": 40 }, { "epoch": 0.19, "grad_norm": 2.392077684402466, "learning_rate": 9.813432835820896e-05, "loss": 1.7307, "step": 50 }, { "epoch": 0.22, "grad_norm": 2.851775646209717, "learning_rate": 9.776119402985075e-05, "loss": 1.5716, "step": 60 }, { "epoch": 0.26, "grad_norm": 2.2557411193847656, "learning_rate": 9.738805970149254e-05, "loss": 1.4694, "step": 70 }, { "epoch": 0.3, "grad_norm": 2.4612302780151367, "learning_rate": 9.701492537313434e-05, "loss": 1.3609, "step": 80 }, { "epoch": 0.34, "grad_norm": 2.7514560222625732, "learning_rate": 9.664179104477612e-05, "loss": 1.2871, "step": 90 }, { "epoch": 0.37, "grad_norm": 3.6256659030914307, "learning_rate": 9.626865671641792e-05, "loss": 1.2754, "step": 100 }, { "epoch": 0.37, "eval_accuracy": 0.7328646748681898, "eval_loss": 1.1163370609283447, "eval_runtime": 12.5514, "eval_samples_per_second": 45.333, "eval_steps_per_second": 5.736, "step": 100 }, { "epoch": 0.41, "grad_norm": 2.642601728439331, "learning_rate": 9.58955223880597e-05, "loss": 1.2354, "step": 110 }, { "epoch": 0.45, "grad_norm": 2.4862725734710693, "learning_rate": 9.552238805970149e-05, "loss": 1.169, "step": 120 }, { "epoch": 0.49, "grad_norm": 3.962764263153076, "learning_rate": 9.514925373134329e-05, "loss": 1.2546, "step": 130 }, { "epoch": 0.52, "grad_norm": 2.9388816356658936, "learning_rate": 9.477611940298507e-05, "loss": 1.1702, "step": 140 }, { "epoch": 0.56, "grad_norm": 4.958592414855957, "learning_rate": 9.440298507462687e-05, "loss": 1.0865, "step": 150 }, { "epoch": 0.6, "grad_norm": 3.4470815658569336, "learning_rate": 9.402985074626867e-05, "loss": 1.0097, "step": 160 }, { "epoch": 0.63, "grad_norm": 4.423004627227783, "learning_rate": 9.365671641791045e-05, "loss": 1.0749, "step": 170 }, { "epoch": 0.67, "grad_norm": 2.808164358139038, "learning_rate": 9.328358208955224e-05, "loss": 0.9732, "step": 180 }, { "epoch": 0.71, "grad_norm": 6.00456428527832, "learning_rate": 9.291044776119402e-05, "loss": 1.0009, "step": 190 }, { "epoch": 0.75, "grad_norm": 5.091552734375, "learning_rate": 9.253731343283582e-05, "loss": 0.9345, "step": 200 }, { "epoch": 0.75, "eval_accuracy": 0.7996485061511424, "eval_loss": 0.8296495079994202, "eval_runtime": 7.8912, "eval_samples_per_second": 72.105, "eval_steps_per_second": 9.124, "step": 200 }, { "epoch": 0.78, "grad_norm": 3.2533326148986816, "learning_rate": 9.216417910447762e-05, "loss": 0.793, "step": 210 }, { "epoch": 0.82, "grad_norm": 6.073918342590332, "learning_rate": 9.17910447761194e-05, "loss": 0.9835, "step": 220 }, { "epoch": 0.86, "grad_norm": 3.6311192512512207, "learning_rate": 9.14179104477612e-05, "loss": 0.8801, "step": 230 }, { "epoch": 0.9, "grad_norm": 4.446895599365234, "learning_rate": 9.104477611940299e-05, "loss": 1.0534, "step": 240 }, { "epoch": 0.93, "grad_norm": 4.668705463409424, "learning_rate": 9.067164179104479e-05, "loss": 0.9396, "step": 250 }, { "epoch": 0.97, "grad_norm": 6.191302299499512, "learning_rate": 9.029850746268657e-05, "loss": 0.9275, "step": 260 }, { "epoch": 1.01, "grad_norm": 3.170959711074829, "learning_rate": 8.992537313432836e-05, "loss": 0.8595, "step": 270 }, { "epoch": 1.04, "grad_norm": 3.690964460372925, "learning_rate": 8.955223880597016e-05, "loss": 0.733, "step": 280 }, { "epoch": 1.08, "grad_norm": 4.871851444244385, "learning_rate": 8.917910447761194e-05, "loss": 0.7623, "step": 290 }, { "epoch": 1.12, "grad_norm": 3.3851799964904785, "learning_rate": 8.880597014925374e-05, "loss": 0.8816, "step": 300 }, { "epoch": 1.12, "eval_accuracy": 0.8101933216168717, "eval_loss": 0.7156229615211487, "eval_runtime": 7.8519, "eval_samples_per_second": 72.467, "eval_steps_per_second": 9.17, "step": 300 }, { "epoch": 1.16, "grad_norm": 3.334380865097046, "learning_rate": 8.843283582089554e-05, "loss": 0.8567, "step": 310 }, { "epoch": 1.19, "grad_norm": 4.673859596252441, "learning_rate": 8.805970149253732e-05, "loss": 0.7926, "step": 320 }, { "epoch": 1.23, "grad_norm": 3.3042550086975098, "learning_rate": 8.76865671641791e-05, "loss": 0.6847, "step": 330 }, { "epoch": 1.27, "grad_norm": 5.4356513023376465, "learning_rate": 8.731343283582089e-05, "loss": 0.7656, "step": 340 }, { "epoch": 1.31, "grad_norm": 7.050413131713867, "learning_rate": 8.694029850746269e-05, "loss": 0.6658, "step": 350 }, { "epoch": 1.34, "grad_norm": 5.980592727661133, "learning_rate": 8.656716417910447e-05, "loss": 0.7948, "step": 360 }, { "epoch": 1.38, "grad_norm": 3.894716739654541, "learning_rate": 8.619402985074627e-05, "loss": 0.8381, "step": 370 }, { "epoch": 1.42, "grad_norm": 7.189664363861084, "learning_rate": 8.582089552238807e-05, "loss": 0.6532, "step": 380 }, { "epoch": 1.46, "grad_norm": 4.317276477813721, "learning_rate": 8.544776119402986e-05, "loss": 0.7763, "step": 390 }, { "epoch": 1.49, "grad_norm": 4.480589866638184, "learning_rate": 8.511194029850747e-05, "loss": 0.7425, "step": 400 }, { "epoch": 1.49, "eval_accuracy": 0.8066783831282952, "eval_loss": 0.6529447436332703, "eval_runtime": 7.793, "eval_samples_per_second": 73.014, "eval_steps_per_second": 9.239, "step": 400 }, { "epoch": 1.53, "grad_norm": 4.1799163818359375, "learning_rate": 8.473880597014926e-05, "loss": 0.6928, "step": 410 }, { "epoch": 1.57, "grad_norm": 4.81996488571167, "learning_rate": 8.436567164179105e-05, "loss": 0.7769, "step": 420 }, { "epoch": 1.6, "grad_norm": 7.18645715713501, "learning_rate": 8.399253731343283e-05, "loss": 0.6848, "step": 430 }, { "epoch": 1.64, "grad_norm": 3.888197660446167, "learning_rate": 8.361940298507463e-05, "loss": 0.5977, "step": 440 }, { "epoch": 1.68, "grad_norm": 7.374312877655029, "learning_rate": 8.324626865671642e-05, "loss": 0.6001, "step": 450 }, { "epoch": 1.72, "grad_norm": 6.553064823150635, "learning_rate": 8.287313432835821e-05, "loss": 0.6683, "step": 460 }, { "epoch": 1.75, "grad_norm": 3.466761589050293, "learning_rate": 8.25e-05, "loss": 0.6484, "step": 470 }, { "epoch": 1.79, "grad_norm": 3.534076690673828, "learning_rate": 8.21268656716418e-05, "loss": 0.6589, "step": 480 }, { "epoch": 1.83, "grad_norm": 3.581280469894409, "learning_rate": 8.17537313432836e-05, "loss": 0.6173, "step": 490 }, { "epoch": 1.87, "grad_norm": 6.162041664123535, "learning_rate": 8.138059701492538e-05, "loss": 0.6883, "step": 500 }, { "epoch": 1.87, "eval_accuracy": 0.8242530755711776, "eval_loss": 0.6078779697418213, "eval_runtime": 7.6716, "eval_samples_per_second": 74.169, "eval_steps_per_second": 9.385, "step": 500 }, { "epoch": 1.9, "grad_norm": 5.477086067199707, "learning_rate": 8.100746268656717e-05, "loss": 0.5952, "step": 510 }, { "epoch": 1.94, "grad_norm": 2.389667510986328, "learning_rate": 8.063432835820895e-05, "loss": 0.5193, "step": 520 }, { "epoch": 1.98, "grad_norm": 5.730781555175781, "learning_rate": 8.026119402985075e-05, "loss": 0.6818, "step": 530 }, { "epoch": 2.01, "grad_norm": 6.305990219116211, "learning_rate": 7.992537313432836e-05, "loss": 0.5738, "step": 540 }, { "epoch": 2.05, "grad_norm": 3.507434368133545, "learning_rate": 7.955223880597016e-05, "loss": 0.5685, "step": 550 }, { "epoch": 2.09, "grad_norm": 12.683993339538574, "learning_rate": 7.917910447761194e-05, "loss": 0.6684, "step": 560 }, { "epoch": 2.13, "grad_norm": 5.5166916847229, "learning_rate": 7.880597014925374e-05, "loss": 0.4787, "step": 570 }, { "epoch": 2.16, "grad_norm": 6.427499294281006, "learning_rate": 7.843283582089552e-05, "loss": 0.5818, "step": 580 }, { "epoch": 2.2, "grad_norm": 5.062973976135254, "learning_rate": 7.805970149253732e-05, "loss": 0.4766, "step": 590 }, { "epoch": 2.24, "grad_norm": 5.720675945281982, "learning_rate": 7.768656716417911e-05, "loss": 0.5454, "step": 600 }, { "epoch": 2.24, "eval_accuracy": 0.8347978910369068, "eval_loss": 0.5604887008666992, "eval_runtime": 7.7133, "eval_samples_per_second": 73.769, "eval_steps_per_second": 9.335, "step": 600 }, { "epoch": 2.28, "grad_norm": 7.875051021575928, "learning_rate": 7.731343283582089e-05, "loss": 0.5935, "step": 610 }, { "epoch": 2.31, "grad_norm": 4.378401756286621, "learning_rate": 7.694029850746269e-05, "loss": 0.4639, "step": 620 }, { "epoch": 2.35, "grad_norm": 7.522930145263672, "learning_rate": 7.656716417910448e-05, "loss": 0.4867, "step": 630 }, { "epoch": 2.39, "grad_norm": 6.3615288734436035, "learning_rate": 7.619402985074627e-05, "loss": 0.5302, "step": 640 }, { "epoch": 2.43, "grad_norm": 3.8204784393310547, "learning_rate": 7.582089552238806e-05, "loss": 0.3864, "step": 650 }, { "epoch": 2.46, "grad_norm": 2.3520662784576416, "learning_rate": 7.544776119402986e-05, "loss": 0.6458, "step": 660 }, { "epoch": 2.5, "grad_norm": 3.9832942485809326, "learning_rate": 7.507462686567166e-05, "loss": 0.494, "step": 670 }, { "epoch": 2.54, "grad_norm": 3.6783320903778076, "learning_rate": 7.470149253731343e-05, "loss": 0.6213, "step": 680 }, { "epoch": 2.57, "grad_norm": 4.528789520263672, "learning_rate": 7.432835820895523e-05, "loss": 0.615, "step": 690 }, { "epoch": 2.61, "grad_norm": 5.556227207183838, "learning_rate": 7.395522388059701e-05, "loss": 0.5383, "step": 700 }, { "epoch": 2.61, "eval_accuracy": 0.8295254833040422, "eval_loss": 0.5571200251579285, "eval_runtime": 7.8934, "eval_samples_per_second": 72.085, "eval_steps_per_second": 9.122, "step": 700 }, { "epoch": 2.65, "grad_norm": 4.617480754852295, "learning_rate": 7.358208955223881e-05, "loss": 0.4987, "step": 710 }, { "epoch": 2.69, "grad_norm": 4.6940412521362305, "learning_rate": 7.32089552238806e-05, "loss": 0.5466, "step": 720 }, { "epoch": 2.72, "grad_norm": 3.8839175701141357, "learning_rate": 7.283582089552239e-05, "loss": 0.5409, "step": 730 }, { "epoch": 2.76, "grad_norm": 6.855696201324463, "learning_rate": 7.246268656716419e-05, "loss": 0.3972, "step": 740 }, { "epoch": 2.8, "grad_norm": 3.9779269695281982, "learning_rate": 7.208955223880597e-05, "loss": 0.4719, "step": 750 }, { "epoch": 2.84, "grad_norm": 10.327420234680176, "learning_rate": 7.171641791044776e-05, "loss": 0.668, "step": 760 }, { "epoch": 2.87, "grad_norm": 5.06951904296875, "learning_rate": 7.134328358208956e-05, "loss": 0.5899, "step": 770 }, { "epoch": 2.91, "grad_norm": 5.539373397827148, "learning_rate": 7.097014925373134e-05, "loss": 0.5813, "step": 780 }, { "epoch": 2.95, "grad_norm": 4.622121334075928, "learning_rate": 7.059701492537314e-05, "loss": 0.5294, "step": 790 }, { "epoch": 2.99, "grad_norm": 2.6457552909851074, "learning_rate": 7.022388059701493e-05, "loss": 0.5442, "step": 800 }, { "epoch": 2.99, "eval_accuracy": 0.8189806678383128, "eval_loss": 0.5864126682281494, "eval_runtime": 7.8507, "eval_samples_per_second": 72.478, "eval_steps_per_second": 9.171, "step": 800 }, { "epoch": 3.02, "grad_norm": 3.373798370361328, "learning_rate": 6.985074626865672e-05, "loss": 0.4183, "step": 810 }, { "epoch": 3.06, "grad_norm": 4.0179667472839355, "learning_rate": 6.947761194029851e-05, "loss": 0.3611, "step": 820 }, { "epoch": 3.1, "grad_norm": 7.72437858581543, "learning_rate": 6.91044776119403e-05, "loss": 0.4543, "step": 830 }, { "epoch": 3.13, "grad_norm": 3.1097893714904785, "learning_rate": 6.873134328358209e-05, "loss": 0.5194, "step": 840 }, { "epoch": 3.17, "grad_norm": 6.581250190734863, "learning_rate": 6.835820895522388e-05, "loss": 0.3839, "step": 850 }, { "epoch": 3.21, "grad_norm": 5.605171203613281, "learning_rate": 6.798507462686568e-05, "loss": 0.4499, "step": 860 }, { "epoch": 3.25, "grad_norm": 2.834651231765747, "learning_rate": 6.761194029850747e-05, "loss": 0.5067, "step": 870 }, { "epoch": 3.28, "grad_norm": 4.615099906921387, "learning_rate": 6.723880597014926e-05, "loss": 0.4869, "step": 880 }, { "epoch": 3.32, "grad_norm": 6.115981101989746, "learning_rate": 6.686567164179106e-05, "loss": 0.4793, "step": 890 }, { "epoch": 3.36, "grad_norm": 1.1021697521209717, "learning_rate": 6.649253731343283e-05, "loss": 0.3986, "step": 900 }, { "epoch": 3.36, "eval_accuracy": 0.8312829525483304, "eval_loss": 0.5632173418998718, "eval_runtime": 7.731, "eval_samples_per_second": 73.6, "eval_steps_per_second": 9.313, "step": 900 }, { "epoch": 3.4, "grad_norm": 7.019008159637451, "learning_rate": 6.611940298507463e-05, "loss": 0.383, "step": 910 }, { "epoch": 3.43, "grad_norm": 2.586031913757324, "learning_rate": 6.574626865671642e-05, "loss": 0.2752, "step": 920 }, { "epoch": 3.47, "grad_norm": 2.5189669132232666, "learning_rate": 6.537313432835821e-05, "loss": 0.2944, "step": 930 }, { "epoch": 3.51, "grad_norm": 10.028382301330566, "learning_rate": 6.500000000000001e-05, "loss": 0.4378, "step": 940 }, { "epoch": 3.54, "grad_norm": 1.8697803020477295, "learning_rate": 6.462686567164179e-05, "loss": 0.3956, "step": 950 }, { "epoch": 3.58, "grad_norm": 5.872415065765381, "learning_rate": 6.425373134328359e-05, "loss": 0.338, "step": 960 }, { "epoch": 3.62, "grad_norm": 8.272451400756836, "learning_rate": 6.388059701492538e-05, "loss": 0.4264, "step": 970 }, { "epoch": 3.66, "grad_norm": 9.422249794006348, "learning_rate": 6.350746268656716e-05, "loss": 0.4258, "step": 980 }, { "epoch": 3.69, "grad_norm": 8.768738746643066, "learning_rate": 6.313432835820896e-05, "loss": 0.3308, "step": 990 }, { "epoch": 3.73, "grad_norm": 6.355968475341797, "learning_rate": 6.276119402985074e-05, "loss": 0.3438, "step": 1000 }, { "epoch": 3.73, "eval_accuracy": 0.836555360281195, "eval_loss": 0.5606371760368347, "eval_runtime": 7.818, "eval_samples_per_second": 72.781, "eval_steps_per_second": 9.21, "step": 1000 }, { "epoch": 3.77, "grad_norm": 3.973480463027954, "learning_rate": 6.238805970149254e-05, "loss": 0.5042, "step": 1010 }, { "epoch": 3.81, "grad_norm": 5.739313125610352, "learning_rate": 6.201492537313434e-05, "loss": 0.4515, "step": 1020 }, { "epoch": 3.84, "grad_norm": 4.196649074554443, "learning_rate": 6.164179104477613e-05, "loss": 0.4404, "step": 1030 }, { "epoch": 3.88, "grad_norm": 4.671971321105957, "learning_rate": 6.126865671641791e-05, "loss": 0.4746, "step": 1040 }, { "epoch": 3.92, "grad_norm": 6.87581205368042, "learning_rate": 6.08955223880597e-05, "loss": 0.4637, "step": 1050 }, { "epoch": 3.96, "grad_norm": 7.224815368652344, "learning_rate": 6.052238805970149e-05, "loss": 0.4754, "step": 1060 }, { "epoch": 3.99, "grad_norm": 4.4340314865112305, "learning_rate": 6.014925373134329e-05, "loss": 0.4165, "step": 1070 }, { "epoch": 4.03, "grad_norm": 1.151932716369629, "learning_rate": 5.9776119402985076e-05, "loss": 0.3498, "step": 1080 }, { "epoch": 4.07, "grad_norm": 6.31879997253418, "learning_rate": 5.940298507462687e-05, "loss": 0.3505, "step": 1090 }, { "epoch": 4.1, "grad_norm": 4.674696445465088, "learning_rate": 5.902985074626865e-05, "loss": 0.4345, "step": 1100 }, { "epoch": 4.1, "eval_accuracy": 0.836555360281195, "eval_loss": 0.5353797674179077, "eval_runtime": 7.9559, "eval_samples_per_second": 71.519, "eval_steps_per_second": 9.05, "step": 1100 }, { "epoch": 4.14, "grad_norm": 6.790203094482422, "learning_rate": 5.865671641791045e-05, "loss": 0.3189, "step": 1110 }, { "epoch": 4.18, "grad_norm": 5.554905414581299, "learning_rate": 5.828358208955225e-05, "loss": 0.3255, "step": 1120 }, { "epoch": 4.22, "grad_norm": 1.87189781665802, "learning_rate": 5.7910447761194034e-05, "loss": 0.2613, "step": 1130 }, { "epoch": 4.25, "grad_norm": 3.4729249477386475, "learning_rate": 5.7537313432835826e-05, "loss": 0.4037, "step": 1140 }, { "epoch": 4.29, "grad_norm": 3.2373063564300537, "learning_rate": 5.716417910447761e-05, "loss": 0.384, "step": 1150 }, { "epoch": 4.33, "grad_norm": 1.8042526245117188, "learning_rate": 5.679104477611941e-05, "loss": 0.4024, "step": 1160 }, { "epoch": 4.37, "grad_norm": 0.9592193365097046, "learning_rate": 5.64179104477612e-05, "loss": 0.3646, "step": 1170 }, { "epoch": 4.4, "grad_norm": 4.0469584465026855, "learning_rate": 5.6044776119402986e-05, "loss": 0.3622, "step": 1180 }, { "epoch": 4.44, "grad_norm": 4.470405578613281, "learning_rate": 5.5671641791044784e-05, "loss": 0.2996, "step": 1190 }, { "epoch": 4.48, "grad_norm": 6.086768627166748, "learning_rate": 5.529850746268657e-05, "loss": 0.4523, "step": 1200 }, { "epoch": 4.48, "eval_accuracy": 0.8576449912126538, "eval_loss": 0.49876561760902405, "eval_runtime": 7.8527, "eval_samples_per_second": 72.459, "eval_steps_per_second": 9.169, "step": 1200 }, { "epoch": 4.51, "grad_norm": 3.478428363800049, "learning_rate": 5.492537313432836e-05, "loss": 0.4198, "step": 1210 }, { "epoch": 4.55, "grad_norm": 4.539990425109863, "learning_rate": 5.455223880597016e-05, "loss": 0.3125, "step": 1220 }, { "epoch": 4.59, "grad_norm": 3.971435070037842, "learning_rate": 5.4179104477611943e-05, "loss": 0.2773, "step": 1230 }, { "epoch": 4.63, "grad_norm": 7.168191909790039, "learning_rate": 5.3805970149253735e-05, "loss": 0.4852, "step": 1240 }, { "epoch": 4.66, "grad_norm": 2.896576166152954, "learning_rate": 5.343283582089552e-05, "loss": 0.3425, "step": 1250 }, { "epoch": 4.7, "grad_norm": 1.4190607070922852, "learning_rate": 5.305970149253732e-05, "loss": 0.2219, "step": 1260 }, { "epoch": 4.74, "grad_norm": 5.066045761108398, "learning_rate": 5.268656716417911e-05, "loss": 0.3447, "step": 1270 }, { "epoch": 4.78, "grad_norm": 4.2649126052856445, "learning_rate": 5.2313432835820895e-05, "loss": 0.3931, "step": 1280 }, { "epoch": 4.81, "grad_norm": 5.704684734344482, "learning_rate": 5.197761194029851e-05, "loss": 0.4274, "step": 1290 }, { "epoch": 4.85, "grad_norm": 6.395939350128174, "learning_rate": 5.16044776119403e-05, "loss": 0.3162, "step": 1300 }, { "epoch": 4.85, "eval_accuracy": 0.8541300527240774, "eval_loss": 0.5099390745162964, "eval_runtime": 7.9919, "eval_samples_per_second": 71.197, "eval_steps_per_second": 9.009, "step": 1300 }, { "epoch": 4.89, "grad_norm": 2.4717729091644287, "learning_rate": 5.123134328358209e-05, "loss": 0.3442, "step": 1310 }, { "epoch": 4.93, "grad_norm": 0.6504545211791992, "learning_rate": 5.0858208955223885e-05, "loss": 0.3313, "step": 1320 }, { "epoch": 4.96, "grad_norm": 4.316141128540039, "learning_rate": 5.048507462686567e-05, "loss": 0.3787, "step": 1330 }, { "epoch": 5.0, "grad_norm": 4.9243998527526855, "learning_rate": 5.011194029850746e-05, "loss": 0.38, "step": 1340 }, { "epoch": 5.04, "grad_norm": 5.312038421630859, "learning_rate": 4.973880597014925e-05, "loss": 0.3268, "step": 1350 }, { "epoch": 5.07, "grad_norm": 3.5483176708221436, "learning_rate": 4.9365671641791045e-05, "loss": 0.3423, "step": 1360 }, { "epoch": 5.11, "grad_norm": 4.414547920227051, "learning_rate": 4.899253731343284e-05, "loss": 0.2421, "step": 1370 }, { "epoch": 5.15, "grad_norm": 5.7323689460754395, "learning_rate": 4.861940298507463e-05, "loss": 0.2795, "step": 1380 }, { "epoch": 5.19, "grad_norm": 4.2763471603393555, "learning_rate": 4.824626865671642e-05, "loss": 0.2402, "step": 1390 }, { "epoch": 5.22, "grad_norm": 9.259199142456055, "learning_rate": 4.787313432835821e-05, "loss": 0.3793, "step": 1400 }, { "epoch": 5.22, "eval_accuracy": 0.843585237258348, "eval_loss": 0.5190387964248657, "eval_runtime": 7.7562, "eval_samples_per_second": 73.361, "eval_steps_per_second": 9.283, "step": 1400 }, { "epoch": 5.26, "grad_norm": 4.773892402648926, "learning_rate": 4.75e-05, "loss": 0.3476, "step": 1410 }, { "epoch": 5.3, "grad_norm": 1.1271159648895264, "learning_rate": 4.7126865671641794e-05, "loss": 0.1949, "step": 1420 }, { "epoch": 5.34, "grad_norm": 2.823958158493042, "learning_rate": 4.6753731343283586e-05, "loss": 0.3009, "step": 1430 }, { "epoch": 5.37, "grad_norm": 0.35977163910865784, "learning_rate": 4.638059701492538e-05, "loss": 0.1821, "step": 1440 }, { "epoch": 5.41, "grad_norm": 3.380308151245117, "learning_rate": 4.600746268656716e-05, "loss": 0.323, "step": 1450 }, { "epoch": 5.45, "grad_norm": 5.946179389953613, "learning_rate": 4.5634328358208954e-05, "loss": 0.5344, "step": 1460 }, { "epoch": 5.49, "grad_norm": 8.254781723022461, "learning_rate": 4.526119402985075e-05, "loss": 0.2799, "step": 1470 }, { "epoch": 5.52, "grad_norm": 6.808130741119385, "learning_rate": 4.4888059701492544e-05, "loss": 0.3173, "step": 1480 }, { "epoch": 5.56, "grad_norm": 17.452037811279297, "learning_rate": 4.451492537313433e-05, "loss": 0.3251, "step": 1490 }, { "epoch": 5.6, "grad_norm": 2.3097095489501953, "learning_rate": 4.414179104477612e-05, "loss": 0.3228, "step": 1500 }, { "epoch": 5.6, "eval_accuracy": 0.8576449912126538, "eval_loss": 0.4589254856109619, "eval_runtime": 8.0547, "eval_samples_per_second": 70.642, "eval_steps_per_second": 8.939, "step": 1500 }, { "epoch": 5.63, "grad_norm": 3.337970018386841, "learning_rate": 4.376865671641791e-05, "loss": 0.2528, "step": 1510 }, { "epoch": 5.67, "grad_norm": 0.5921415090560913, "learning_rate": 4.33955223880597e-05, "loss": 0.2459, "step": 1520 }, { "epoch": 5.71, "grad_norm": 4.148998260498047, "learning_rate": 4.3022388059701495e-05, "loss": 0.2927, "step": 1530 }, { "epoch": 5.75, "grad_norm": 5.740537166595459, "learning_rate": 4.2649253731343286e-05, "loss": 0.423, "step": 1540 }, { "epoch": 5.78, "grad_norm": 5.316250324249268, "learning_rate": 4.227611940298508e-05, "loss": 0.3735, "step": 1550 }, { "epoch": 5.82, "grad_norm": 5.52378511428833, "learning_rate": 4.190298507462686e-05, "loss": 0.3613, "step": 1560 }, { "epoch": 5.86, "grad_norm": 2.1002511978149414, "learning_rate": 4.152985074626866e-05, "loss": 0.259, "step": 1570 }, { "epoch": 5.9, "grad_norm": 5.339119911193848, "learning_rate": 4.115671641791045e-05, "loss": 0.3355, "step": 1580 }, { "epoch": 5.93, "grad_norm": 3.0551536083221436, "learning_rate": 4.0783582089552244e-05, "loss": 0.4342, "step": 1590 }, { "epoch": 5.97, "grad_norm": 6.549235820770264, "learning_rate": 4.041044776119403e-05, "loss": 0.1795, "step": 1600 }, { "epoch": 5.97, "eval_accuracy": 0.8488576449912126, "eval_loss": 0.5095508694648743, "eval_runtime": 7.7872, "eval_samples_per_second": 73.068, "eval_steps_per_second": 9.246, "step": 1600 }, { "epoch": 6.01, "grad_norm": 11.5170316696167, "learning_rate": 4.003731343283582e-05, "loss": 0.3778, "step": 1610 }, { "epoch": 6.04, "grad_norm": 6.004143238067627, "learning_rate": 3.966417910447761e-05, "loss": 0.3624, "step": 1620 }, { "epoch": 6.08, "grad_norm": 4.328847885131836, "learning_rate": 3.9291044776119404e-05, "loss": 0.3478, "step": 1630 }, { "epoch": 6.12, "grad_norm": 3.5757558345794678, "learning_rate": 3.8917910447761195e-05, "loss": 0.2208, "step": 1640 }, { "epoch": 6.16, "grad_norm": 8.37783432006836, "learning_rate": 3.854477611940299e-05, "loss": 0.3614, "step": 1650 }, { "epoch": 6.19, "grad_norm": 2.4890713691711426, "learning_rate": 3.817164179104478e-05, "loss": 0.2514, "step": 1660 }, { "epoch": 6.23, "grad_norm": 8.873276710510254, "learning_rate": 3.7798507462686563e-05, "loss": 0.2233, "step": 1670 }, { "epoch": 6.27, "grad_norm": 0.29393309354782104, "learning_rate": 3.742537313432836e-05, "loss": 0.2474, "step": 1680 }, { "epoch": 6.31, "grad_norm": 3.810150384902954, "learning_rate": 3.7052238805970153e-05, "loss": 0.2481, "step": 1690 }, { "epoch": 6.34, "grad_norm": 1.989057183265686, "learning_rate": 3.6679104477611945e-05, "loss": 0.2626, "step": 1700 }, { "epoch": 6.34, "eval_accuracy": 0.8488576449912126, "eval_loss": 0.5402765274047852, "eval_runtime": 7.9293, "eval_samples_per_second": 71.759, "eval_steps_per_second": 9.08, "step": 1700 }, { "epoch": 6.38, "grad_norm": 8.488819122314453, "learning_rate": 3.630597014925373e-05, "loss": 0.2826, "step": 1710 }, { "epoch": 6.42, "grad_norm": 5.542993068695068, "learning_rate": 3.593283582089552e-05, "loss": 0.3552, "step": 1720 }, { "epoch": 6.46, "grad_norm": 6.646905422210693, "learning_rate": 3.555970149253732e-05, "loss": 0.4405, "step": 1730 }, { "epoch": 6.49, "grad_norm": 4.022976398468018, "learning_rate": 3.5186567164179105e-05, "loss": 0.2738, "step": 1740 }, { "epoch": 6.53, "grad_norm": 3.5472657680511475, "learning_rate": 3.4813432835820896e-05, "loss": 0.2807, "step": 1750 }, { "epoch": 6.57, "grad_norm": 12.070052146911621, "learning_rate": 3.444029850746269e-05, "loss": 0.3634, "step": 1760 }, { "epoch": 6.6, "grad_norm": 5.368374347686768, "learning_rate": 3.406716417910448e-05, "loss": 0.3252, "step": 1770 }, { "epoch": 6.64, "grad_norm": 5.566130638122559, "learning_rate": 3.369402985074627e-05, "loss": 0.3034, "step": 1780 }, { "epoch": 6.68, "grad_norm": 5.875336170196533, "learning_rate": 3.332089552238806e-05, "loss": 0.3406, "step": 1790 }, { "epoch": 6.72, "grad_norm": 2.4168920516967773, "learning_rate": 3.2947761194029854e-05, "loss": 0.3041, "step": 1800 }, { "epoch": 6.72, "eval_accuracy": 0.8488576449912126, "eval_loss": 0.4907586872577667, "eval_runtime": 7.8209, "eval_samples_per_second": 72.754, "eval_steps_per_second": 9.206, "step": 1800 }, { "epoch": 6.75, "grad_norm": 3.1040282249450684, "learning_rate": 3.2574626865671646e-05, "loss": 0.3167, "step": 1810 }, { "epoch": 6.79, "grad_norm": 1.8458846807479858, "learning_rate": 3.220149253731343e-05, "loss": 0.2061, "step": 1820 }, { "epoch": 6.83, "grad_norm": 0.4053177833557129, "learning_rate": 3.182835820895523e-05, "loss": 0.3113, "step": 1830 }, { "epoch": 6.87, "grad_norm": 0.23064230382442474, "learning_rate": 3.145522388059702e-05, "loss": 0.2368, "step": 1840 }, { "epoch": 6.9, "grad_norm": 1.006479263305664, "learning_rate": 3.1082089552238805e-05, "loss": 0.2265, "step": 1850 }, { "epoch": 6.94, "grad_norm": 4.072957992553711, "learning_rate": 3.07089552238806e-05, "loss": 0.2976, "step": 1860 }, { "epoch": 6.98, "grad_norm": 16.575963973999023, "learning_rate": 3.033582089552239e-05, "loss": 0.1504, "step": 1870 }, { "epoch": 7.01, "grad_norm": 2.9144656658172607, "learning_rate": 2.9962686567164183e-05, "loss": 0.2156, "step": 1880 }, { "epoch": 7.05, "grad_norm": 4.547207832336426, "learning_rate": 2.958955223880597e-05, "loss": 0.2693, "step": 1890 }, { "epoch": 7.09, "grad_norm": 0.5566532611846924, "learning_rate": 2.9216417910447763e-05, "loss": 0.1831, "step": 1900 }, { "epoch": 7.09, "eval_accuracy": 0.8383128295254832, "eval_loss": 0.5721341967582703, "eval_runtime": 7.7377, "eval_samples_per_second": 73.536, "eval_steps_per_second": 9.305, "step": 1900 }, { "epoch": 7.13, "grad_norm": 7.9241838455200195, "learning_rate": 2.8843283582089555e-05, "loss": 0.3037, "step": 1910 }, { "epoch": 7.16, "grad_norm": 4.847833156585693, "learning_rate": 2.8470149253731343e-05, "loss": 0.2744, "step": 1920 }, { "epoch": 7.2, "grad_norm": 4.368974208831787, "learning_rate": 2.8097014925373134e-05, "loss": 0.1603, "step": 1930 }, { "epoch": 7.24, "grad_norm": 5.848027229309082, "learning_rate": 2.772388059701493e-05, "loss": 0.3318, "step": 1940 }, { "epoch": 7.28, "grad_norm": 5.53363037109375, "learning_rate": 2.7350746268656718e-05, "loss": 0.2568, "step": 1950 }, { "epoch": 7.31, "grad_norm": 1.3791863918304443, "learning_rate": 2.697761194029851e-05, "loss": 0.2186, "step": 1960 }, { "epoch": 7.35, "grad_norm": 13.533841133117676, "learning_rate": 2.6604477611940297e-05, "loss": 0.2772, "step": 1970 }, { "epoch": 7.39, "grad_norm": 1.113595962524414, "learning_rate": 2.623134328358209e-05, "loss": 0.3396, "step": 1980 }, { "epoch": 7.43, "grad_norm": 3.193376064300537, "learning_rate": 2.5858208955223884e-05, "loss": 0.2171, "step": 1990 }, { "epoch": 7.46, "grad_norm": 2.8687243461608887, "learning_rate": 2.5485074626865672e-05, "loss": 0.2275, "step": 2000 }, { "epoch": 7.46, "eval_accuracy": 0.8312829525483304, "eval_loss": 0.5349107980728149, "eval_runtime": 8.0113, "eval_samples_per_second": 71.025, "eval_steps_per_second": 8.987, "step": 2000 }, { "epoch": 7.5, "grad_norm": 6.330258846282959, "learning_rate": 2.5111940298507464e-05, "loss": 0.2165, "step": 2010 }, { "epoch": 7.54, "grad_norm": 2.457519769668579, "learning_rate": 2.4738805970149252e-05, "loss": 0.3275, "step": 2020 }, { "epoch": 7.57, "grad_norm": 1.468772053718567, "learning_rate": 2.4365671641791047e-05, "loss": 0.186, "step": 2030 }, { "epoch": 7.61, "grad_norm": 4.308888912200928, "learning_rate": 2.3992537313432835e-05, "loss": 0.3182, "step": 2040 }, { "epoch": 7.65, "grad_norm": 1.8849867582321167, "learning_rate": 2.361940298507463e-05, "loss": 0.2631, "step": 2050 }, { "epoch": 7.69, "grad_norm": 2.6795170307159424, "learning_rate": 2.3246268656716418e-05, "loss": 0.1724, "step": 2060 }, { "epoch": 7.72, "grad_norm": 0.22702960669994354, "learning_rate": 2.287313432835821e-05, "loss": 0.2542, "step": 2070 }, { "epoch": 7.76, "grad_norm": 4.6633429527282715, "learning_rate": 2.25e-05, "loss": 0.259, "step": 2080 }, { "epoch": 7.8, "grad_norm": 6.543178558349609, "learning_rate": 2.2126865671641793e-05, "loss": 0.3752, "step": 2090 }, { "epoch": 7.84, "grad_norm": 7.109080791473389, "learning_rate": 2.1753731343283585e-05, "loss": 0.1762, "step": 2100 }, { "epoch": 7.84, "eval_accuracy": 0.8541300527240774, "eval_loss": 0.5203543901443481, "eval_runtime": 7.8922, "eval_samples_per_second": 72.096, "eval_steps_per_second": 9.123, "step": 2100 }, { "epoch": 7.87, "grad_norm": 3.3965115547180176, "learning_rate": 2.1380597014925373e-05, "loss": 0.1965, "step": 2110 }, { "epoch": 7.91, "grad_norm": 0.1386798918247223, "learning_rate": 2.1007462686567164e-05, "loss": 0.1448, "step": 2120 }, { "epoch": 7.95, "grad_norm": 8.268773078918457, "learning_rate": 2.0634328358208956e-05, "loss": 0.2203, "step": 2130 }, { "epoch": 7.99, "grad_norm": 2.712890625, "learning_rate": 2.0261194029850748e-05, "loss": 0.2104, "step": 2140 }, { "epoch": 8.02, "grad_norm": 2.0390050411224365, "learning_rate": 1.988805970149254e-05, "loss": 0.2063, "step": 2150 }, { "epoch": 8.06, "grad_norm": 4.355598449707031, "learning_rate": 1.951492537313433e-05, "loss": 0.1356, "step": 2160 }, { "epoch": 8.1, "grad_norm": 9.854630470275879, "learning_rate": 1.914179104477612e-05, "loss": 0.1686, "step": 2170 }, { "epoch": 8.13, "grad_norm": 4.178330421447754, "learning_rate": 1.8768656716417914e-05, "loss": 0.2578, "step": 2180 }, { "epoch": 8.17, "grad_norm": 5.019784450531006, "learning_rate": 1.8395522388059702e-05, "loss": 0.1923, "step": 2190 }, { "epoch": 8.21, "grad_norm": 3.8136210441589355, "learning_rate": 1.8022388059701494e-05, "loss": 0.2112, "step": 2200 }, { "epoch": 8.21, "eval_accuracy": 0.8629173989455184, "eval_loss": 0.5188840627670288, "eval_runtime": 8.1412, "eval_samples_per_second": 69.891, "eval_steps_per_second": 8.844, "step": 2200 }, { "epoch": 8.25, "grad_norm": 2.7035305500030518, "learning_rate": 1.7649253731343285e-05, "loss": 0.2501, "step": 2210 }, { "epoch": 8.28, "grad_norm": 6.736306190490723, "learning_rate": 1.7276119402985073e-05, "loss": 0.2213, "step": 2220 }, { "epoch": 8.32, "grad_norm": 3.0436556339263916, "learning_rate": 1.690298507462687e-05, "loss": 0.1285, "step": 2230 }, { "epoch": 8.36, "grad_norm": 4.729572772979736, "learning_rate": 1.6529850746268657e-05, "loss": 0.2984, "step": 2240 }, { "epoch": 8.4, "grad_norm": 3.6665098667144775, "learning_rate": 1.6156716417910448e-05, "loss": 0.1796, "step": 2250 }, { "epoch": 8.43, "grad_norm": 8.485068321228027, "learning_rate": 1.578358208955224e-05, "loss": 0.2137, "step": 2260 }, { "epoch": 8.47, "grad_norm": 4.643974304199219, "learning_rate": 1.541044776119403e-05, "loss": 0.3009, "step": 2270 }, { "epoch": 8.51, "grad_norm": 2.91859769821167, "learning_rate": 1.5037313432835823e-05, "loss": 0.1855, "step": 2280 }, { "epoch": 8.54, "grad_norm": 9.799684524536133, "learning_rate": 1.4664179104477613e-05, "loss": 0.2186, "step": 2290 }, { "epoch": 8.58, "grad_norm": 4.92659330368042, "learning_rate": 1.4291044776119403e-05, "loss": 0.1242, "step": 2300 }, { "epoch": 8.58, "eval_accuracy": 0.8471001757469244, "eval_loss": 0.5376706123352051, "eval_runtime": 7.8653, "eval_samples_per_second": 72.343, "eval_steps_per_second": 9.154, "step": 2300 }, { "epoch": 8.62, "grad_norm": 0.7728621363639832, "learning_rate": 1.3917910447761196e-05, "loss": 0.2769, "step": 2310 }, { "epoch": 8.66, "grad_norm": 3.757192373275757, "learning_rate": 1.3544776119402986e-05, "loss": 0.31, "step": 2320 }, { "epoch": 8.69, "grad_norm": 5.901330471038818, "learning_rate": 1.3171641791044777e-05, "loss": 0.2488, "step": 2330 }, { "epoch": 8.73, "grad_norm": 0.1360226422548294, "learning_rate": 1.2798507462686567e-05, "loss": 0.2359, "step": 2340 }, { "epoch": 8.77, "grad_norm": 5.801501750946045, "learning_rate": 1.2425373134328359e-05, "loss": 0.23, "step": 2350 }, { "epoch": 8.81, "grad_norm": 3.3060359954833984, "learning_rate": 1.2052238805970149e-05, "loss": 0.1114, "step": 2360 }, { "epoch": 8.84, "grad_norm": 2.0813100337982178, "learning_rate": 1.167910447761194e-05, "loss": 0.1569, "step": 2370 }, { "epoch": 8.88, "grad_norm": 0.42951256036758423, "learning_rate": 1.1305970149253732e-05, "loss": 0.2636, "step": 2380 }, { "epoch": 8.92, "grad_norm": 3.2714788913726807, "learning_rate": 1.0932835820895524e-05, "loss": 0.2197, "step": 2390 }, { "epoch": 8.96, "grad_norm": 4.24855375289917, "learning_rate": 1.0559701492537313e-05, "loss": 0.1207, "step": 2400 }, { "epoch": 8.96, "eval_accuracy": 0.8558875219683656, "eval_loss": 0.5324714779853821, "eval_runtime": 7.9022, "eval_samples_per_second": 72.006, "eval_steps_per_second": 9.111, "step": 2400 }, { "epoch": 8.99, "grad_norm": 3.989713430404663, "learning_rate": 1.0186567164179105e-05, "loss": 0.2336, "step": 2410 }, { "epoch": 9.03, "grad_norm": 5.590869903564453, "learning_rate": 9.813432835820897e-06, "loss": 0.2292, "step": 2420 }, { "epoch": 9.07, "grad_norm": 3.405966281890869, "learning_rate": 9.440298507462688e-06, "loss": 0.1654, "step": 2430 }, { "epoch": 9.1, "grad_norm": 3.733381986618042, "learning_rate": 9.067164179104478e-06, "loss": 0.2104, "step": 2440 }, { "epoch": 9.14, "grad_norm": 0.1994183361530304, "learning_rate": 8.694029850746268e-06, "loss": 0.0789, "step": 2450 }, { "epoch": 9.18, "grad_norm": 7.948019504547119, "learning_rate": 8.32089552238806e-06, "loss": 0.3335, "step": 2460 }, { "epoch": 9.22, "grad_norm": 3.020522117614746, "learning_rate": 7.947761194029851e-06, "loss": 0.1838, "step": 2470 }, { "epoch": 9.25, "grad_norm": 2.4797592163085938, "learning_rate": 7.574626865671643e-06, "loss": 0.1573, "step": 2480 }, { "epoch": 9.29, "grad_norm": 0.7854322195053101, "learning_rate": 7.201492537313433e-06, "loss": 0.1868, "step": 2490 }, { "epoch": 9.33, "grad_norm": 8.424530982971191, "learning_rate": 6.828358208955224e-06, "loss": 0.1806, "step": 2500 }, { "epoch": 9.33, "eval_accuracy": 0.8646748681898067, "eval_loss": 0.5149648785591125, "eval_runtime": 7.8422, "eval_samples_per_second": 72.556, "eval_steps_per_second": 9.181, "step": 2500 }, { "epoch": 9.37, "grad_norm": 2.9176523685455322, "learning_rate": 6.455223880597015e-06, "loss": 0.1977, "step": 2510 }, { "epoch": 9.4, "grad_norm": 4.15384578704834, "learning_rate": 6.082089552238806e-06, "loss": 0.2007, "step": 2520 }, { "epoch": 9.44, "grad_norm": 2.4758641719818115, "learning_rate": 5.708955223880597e-06, "loss": 0.2, "step": 2530 }, { "epoch": 9.48, "grad_norm": 4.053123950958252, "learning_rate": 5.335820895522389e-06, "loss": 0.2514, "step": 2540 }, { "epoch": 9.51, "grad_norm": 2.3916337490081787, "learning_rate": 4.9626865671641796e-06, "loss": 0.2104, "step": 2550 }, { "epoch": 9.55, "grad_norm": 4.113661766052246, "learning_rate": 4.58955223880597e-06, "loss": 0.1998, "step": 2560 }, { "epoch": 9.59, "grad_norm": 3.558722972869873, "learning_rate": 4.216417910447761e-06, "loss": 0.144, "step": 2570 }, { "epoch": 9.63, "grad_norm": 2.689765691757202, "learning_rate": 3.843283582089553e-06, "loss": 0.1691, "step": 2580 }, { "epoch": 9.66, "grad_norm": 4.95484733581543, "learning_rate": 3.4701492537313434e-06, "loss": 0.1875, "step": 2590 }, { "epoch": 9.7, "grad_norm": 6.025635242462158, "learning_rate": 3.0970149253731345e-06, "loss": 0.1793, "step": 2600 }, { "epoch": 9.7, "eval_accuracy": 0.8664323374340949, "eval_loss": 0.5153330564498901, "eval_runtime": 7.9144, "eval_samples_per_second": 71.894, "eval_steps_per_second": 9.097, "step": 2600 }, { "epoch": 9.74, "grad_norm": 0.3092793822288513, "learning_rate": 2.7238805970149257e-06, "loss": 0.1385, "step": 2610 }, { "epoch": 9.78, "grad_norm": 1.1317028999328613, "learning_rate": 2.3507462686567164e-06, "loss": 0.1628, "step": 2620 }, { "epoch": 9.81, "grad_norm": 7.642726898193359, "learning_rate": 1.9776119402985076e-06, "loss": 0.2142, "step": 2630 }, { "epoch": 9.85, "grad_norm": 4.3891191482543945, "learning_rate": 1.6044776119402985e-06, "loss": 0.2115, "step": 2640 }, { "epoch": 9.89, "grad_norm": 5.876834869384766, "learning_rate": 1.2313432835820897e-06, "loss": 0.2859, "step": 2650 }, { "epoch": 9.93, "grad_norm": 1.6104581356048584, "learning_rate": 8.582089552238806e-07, "loss": 0.2752, "step": 2660 }, { "epoch": 9.96, "grad_norm": 5.835386276245117, "learning_rate": 4.850746268656717e-07, "loss": 0.2057, "step": 2670 }, { "epoch": 10.0, "grad_norm": 7.006475925445557, "learning_rate": 1.119402985074627e-07, "loss": 0.2098, "step": 2680 }, { "epoch": 10.0, "step": 2680, "total_flos": 3.3230947683690086e+18, "train_loss": 0.45543073504718384, "train_runtime": 1353.2313, "train_samples_per_second": 31.687, "train_steps_per_second": 1.98 } ], "logging_steps": 10, "max_steps": 2680, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 3.3230947683690086e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }