{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9986300118732303, "eval_steps": 200, "global_step": 4104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0073066033427710295, "grad_norm": 1.7583225965499878, "learning_rate": 2.1897810218978103e-06, "loss": 3.655, "step": 10 }, { "epoch": 0.014613206685542059, "grad_norm": null, "learning_rate": 4.379562043795621e-06, "loss": 4.0596, "step": 20 }, { "epoch": 0.02191981002831309, "grad_norm": 2.3730218410491943, "learning_rate": 6.812652068126521e-06, "loss": 4.2837, "step": 30 }, { "epoch": 0.029226413371084118, "grad_norm": 3.1703426837921143, "learning_rate": 9.24574209245742e-06, "loss": 3.8455, "step": 40 }, { "epoch": 0.03653301671385514, "grad_norm": 2.987401247024536, "learning_rate": 1.1678832116788322e-05, "loss": 3.5357, "step": 50 }, { "epoch": 0.04383962005662618, "grad_norm": 4.219663619995117, "learning_rate": 1.411192214111922e-05, "loss": 3.6343, "step": 60 }, { "epoch": 0.05114622339939721, "grad_norm": 4.096388816833496, "learning_rate": 1.654501216545012e-05, "loss": 3.4301, "step": 70 }, { "epoch": 0.058452826742168236, "grad_norm": 10.184355735778809, "learning_rate": 1.8734793187347933e-05, "loss": 3.2771, "step": 80 }, { "epoch": 0.06575943008493926, "grad_norm": 8.512189865112305, "learning_rate": 2.1167883211678834e-05, "loss": 2.9386, "step": 90 }, { "epoch": 0.07306603342771029, "grad_norm": 5.223749160766602, "learning_rate": 2.360097323600973e-05, "loss": 2.6416, "step": 100 }, { "epoch": 0.08037263677048133, "grad_norm": 5.041982173919678, "learning_rate": 2.6034063260340636e-05, "loss": 2.78, "step": 110 }, { "epoch": 0.08767924011325236, "grad_norm": 8.065332412719727, "learning_rate": 2.8467153284671533e-05, "loss": 2.7657, "step": 120 }, { "epoch": 0.09498584345602339, "grad_norm": 2.3226640224456787, "learning_rate": 3.0900243309002434e-05, "loss": 2.7316, "step": 130 }, { "epoch": 
0.10229244679879441, "grad_norm": 3.4772486686706543, "learning_rate": 3.3333333333333335e-05, "loss": 2.6205, "step": 140 }, { "epoch": 0.10959905014156544, "grad_norm": 2.8939208984375, "learning_rate": 3.5766423357664236e-05, "loss": 2.648, "step": 150 }, { "epoch": 0.11690565348433647, "grad_norm": 2.951481342315674, "learning_rate": 3.819951338199514e-05, "loss": 2.6972, "step": 160 }, { "epoch": 0.1242122568271075, "grad_norm": 2.277855157852173, "learning_rate": 4.063260340632604e-05, "loss": 2.4787, "step": 170 }, { "epoch": 0.13151886016987852, "grad_norm": 9.363456726074219, "learning_rate": 4.306569343065693e-05, "loss": 2.6347, "step": 180 }, { "epoch": 0.13882546351264954, "grad_norm": 3.6133177280426025, "learning_rate": 4.549878345498784e-05, "loss": 2.656, "step": 190 }, { "epoch": 0.14613206685542057, "grad_norm": 4.037402153015137, "learning_rate": 4.793187347931874e-05, "loss": 2.5512, "step": 200 }, { "epoch": 0.14613206685542057, "eval_loss": 2.5883829593658447, "eval_runtime": 109.0986, "eval_samples_per_second": 11.155, "eval_steps_per_second": 11.155, "step": 200 }, { "epoch": 0.15343867019819163, "grad_norm": 3.737877368927002, "learning_rate": 5.036496350364964e-05, "loss": 2.5724, "step": 210 }, { "epoch": 0.16074527354096266, "grad_norm": 3.6852433681488037, "learning_rate": 5.279805352798054e-05, "loss": 2.6093, "step": 220 }, { "epoch": 0.1680518768837337, "grad_norm": 4.3811936378479, "learning_rate": 5.5231143552311436e-05, "loss": 2.5968, "step": 230 }, { "epoch": 0.17535848022650471, "grad_norm": 4.019372940063477, "learning_rate": 5.766423357664234e-05, "loss": 2.7364, "step": 240 }, { "epoch": 0.18266508356927574, "grad_norm": 4.277368068695068, "learning_rate": 6.0097323600973245e-05, "loss": 2.7445, "step": 250 }, { "epoch": 0.18997168691204677, "grad_norm": 11.767447471618652, "learning_rate": 6.253041362530415e-05, "loss": 2.6712, "step": 260 }, { "epoch": 0.1972782902548178, "grad_norm": 3.2719593048095703, "learning_rate": 
6.496350364963504e-05, "loss": 2.5184, "step": 270 }, { "epoch": 0.20458489359758883, "grad_norm": 3.923417091369629, "learning_rate": 6.739659367396593e-05, "loss": 2.6089, "step": 280 }, { "epoch": 0.21189149694035986, "grad_norm": 5.958123207092285, "learning_rate": 6.982968369829684e-05, "loss": 2.5422, "step": 290 }, { "epoch": 0.2191981002831309, "grad_norm": 4.775500774383545, "learning_rate": 7.226277372262774e-05, "loss": 2.3889, "step": 300 }, { "epoch": 0.22650470362590192, "grad_norm": 7.674183368682861, "learning_rate": 7.469586374695864e-05, "loss": 2.4629, "step": 310 }, { "epoch": 0.23381130696867294, "grad_norm": 5.1322760581970215, "learning_rate": 7.712895377128954e-05, "loss": 2.5144, "step": 320 }, { "epoch": 0.24111791031144397, "grad_norm": 5.469252586364746, "learning_rate": 7.956204379562045e-05, "loss": 2.5871, "step": 330 }, { "epoch": 0.248424513654215, "grad_norm": 4.472785472869873, "learning_rate": 8.199513381995134e-05, "loss": 2.4764, "step": 340 }, { "epoch": 0.25573111699698603, "grad_norm": 7.12663459777832, "learning_rate": 8.442822384428223e-05, "loss": 2.5562, "step": 350 }, { "epoch": 0.26303772033975703, "grad_norm": 3.2579259872436523, "learning_rate": 8.686131386861314e-05, "loss": 2.4707, "step": 360 }, { "epoch": 0.2703443236825281, "grad_norm": 4.707943916320801, "learning_rate": 8.929440389294405e-05, "loss": 2.5207, "step": 370 }, { "epoch": 0.2776509270252991, "grad_norm": 8.757339477539062, "learning_rate": 9.172749391727494e-05, "loss": 2.5424, "step": 380 }, { "epoch": 0.28495753036807014, "grad_norm": 6.260244369506836, "learning_rate": 9.416058394160584e-05, "loss": 2.5552, "step": 390 }, { "epoch": 0.29226413371084115, "grad_norm": 3.4638991355895996, "learning_rate": 9.659367396593674e-05, "loss": 2.3861, "step": 400 }, { "epoch": 0.29226413371084115, "eval_loss": 2.477276563644409, "eval_runtime": 107.9591, "eval_samples_per_second": 11.273, "eval_steps_per_second": 11.273, "step": 400 }, { "epoch": 
0.2995707370536122, "grad_norm": 4.783712863922119, "learning_rate": 9.902676399026765e-05, "loss": 2.5961, "step": 410 }, { "epoch": 0.30687734039638326, "grad_norm": 6.2626729011535645, "learning_rate": 9.999934869757279e-05, "loss": 2.5863, "step": 420 }, { "epoch": 0.31418394373915426, "grad_norm": 4.87408971786499, "learning_rate": 9.999536857752013e-05, "loss": 2.3565, "step": 430 }, { "epoch": 0.3214905470819253, "grad_norm": 3.7447562217712402, "learning_rate": 9.998777045977545e-05, "loss": 2.4354, "step": 440 }, { "epoch": 0.3287971504246963, "grad_norm": 5.334987640380859, "learning_rate": 9.997655489418913e-05, "loss": 2.412, "step": 450 }, { "epoch": 0.3361037537674674, "grad_norm": 5.347410678863525, "learning_rate": 9.996172269239417e-05, "loss": 2.4486, "step": 460 }, { "epoch": 0.3434103571102384, "grad_norm": 5.353787899017334, "learning_rate": 9.99432749277474e-05, "loss": 2.3383, "step": 470 }, { "epoch": 0.35071696045300943, "grad_norm": 5.850295543670654, "learning_rate": 9.992121293525189e-05, "loss": 2.3838, "step": 480 }, { "epoch": 0.35802356379578043, "grad_norm": 4.7518792152404785, "learning_rate": 9.98955383114603e-05, "loss": 2.427, "step": 490 }, { "epoch": 0.3653301671385515, "grad_norm": 7.027746677398682, "learning_rate": 9.986625291435933e-05, "loss": 2.608, "step": 500 }, { "epoch": 0.3726367704813225, "grad_norm": 5.194112300872803, "learning_rate": 9.983335886323523e-05, "loss": 2.5837, "step": 510 }, { "epoch": 0.37994337382409354, "grad_norm": 5.106868743896484, "learning_rate": 9.979685853852057e-05, "loss": 2.4636, "step": 520 }, { "epoch": 0.38724997716686455, "grad_norm": 4.584632396697998, "learning_rate": 9.975675458162177e-05, "loss": 2.3892, "step": 530 }, { "epoch": 0.3945565805096356, "grad_norm": 3.168287992477417, "learning_rate": 9.971304989472819e-05, "loss": 2.4344, "step": 540 }, { "epoch": 0.4018631838524066, "grad_norm": 5.590039253234863, "learning_rate": 9.966574764060186e-05, "loss": 2.377, "step": 550 
}, { "epoch": 0.40916978719517766, "grad_norm": 3.9408605098724365, "learning_rate": 9.961485124234881e-05, "loss": 2.4833, "step": 560 }, { "epoch": 0.41647639053794866, "grad_norm": 3.6905062198638916, "learning_rate": 9.956036438317124e-05, "loss": 2.5575, "step": 570 }, { "epoch": 0.4237829938807197, "grad_norm": 5.282037258148193, "learning_rate": 9.9502291006101e-05, "loss": 2.438, "step": 580 }, { "epoch": 0.4310895972234907, "grad_norm": 3.3918325901031494, "learning_rate": 9.944063531371423e-05, "loss": 2.5227, "step": 590 }, { "epoch": 0.4383962005662618, "grad_norm": 3.809112787246704, "learning_rate": 9.937540176782732e-05, "loss": 2.5938, "step": 600 }, { "epoch": 0.4383962005662618, "eval_loss": 2.399010181427002, "eval_runtime": 107.2947, "eval_samples_per_second": 11.343, "eval_steps_per_second": 11.343, "step": 600 }, { "epoch": 0.4457028039090328, "grad_norm": 6.199471473693848, "learning_rate": 9.930659508917388e-05, "loss": 2.4369, "step": 610 }, { "epoch": 0.45300940725180383, "grad_norm": 4.105823993682861, "learning_rate": 9.923422025706323e-05, "loss": 2.5947, "step": 620 }, { "epoch": 0.46031601059457483, "grad_norm": 4.27271842956543, "learning_rate": 9.915828250902004e-05, "loss": 2.3976, "step": 630 }, { "epoch": 0.4676226139373459, "grad_norm": 3.6033222675323486, "learning_rate": 9.907878734040525e-05, "loss": 2.3318, "step": 640 }, { "epoch": 0.4749292172801169, "grad_norm": 3.8373398780822754, "learning_rate": 9.89957405040185e-05, "loss": 2.4319, "step": 650 }, { "epoch": 0.48223582062288795, "grad_norm": 3.295011520385742, "learning_rate": 9.890914800968171e-05, "loss": 2.2978, "step": 660 }, { "epoch": 0.48954242396565895, "grad_norm": 3.935311794281006, "learning_rate": 9.88190161238042e-05, "loss": 2.4859, "step": 670 }, { "epoch": 0.49684902730843, "grad_norm": 4.868900299072266, "learning_rate": 9.872535136892926e-05, "loss": 2.3433, "step": 680 }, { "epoch": 0.5041556306512011, "grad_norm": 6.791601181030273, "learning_rate": 
9.862816052326209e-05, "loss": 2.3252, "step": 690 }, { "epoch": 0.5114622339939721, "grad_norm": 2.8761675357818604, "learning_rate": 9.852745062017927e-05, "loss": 2.4546, "step": 700 }, { "epoch": 0.5187688373367431, "grad_norm": 7.200977802276611, "learning_rate": 9.84338089316251e-05, "loss": 2.462, "step": 710 }, { "epoch": 0.5260754406795141, "grad_norm": 5.1391825675964355, "learning_rate": 9.832643310805385e-05, "loss": 2.601, "step": 720 }, { "epoch": 0.5333820440222852, "grad_norm": 3.5819122791290283, "learning_rate": 9.821556006207133e-05, "loss": 2.3209, "step": 730 }, { "epoch": 0.5406886473650562, "grad_norm": 3.3951661586761475, "learning_rate": 9.810119781718924e-05, "loss": 2.1652, "step": 740 }, { "epoch": 0.5479952507078272, "grad_norm": 5.344549655914307, "learning_rate": 9.798335464942094e-05, "loss": 2.3954, "step": 750 }, { "epoch": 0.5553018540505982, "grad_norm": 5.5213446617126465, "learning_rate": 9.786203908668255e-05, "loss": 2.1831, "step": 760 }, { "epoch": 0.5626084573933693, "grad_norm": 4.242595195770264, "learning_rate": 9.773725990817575e-05, "loss": 2.4479, "step": 770 }, { "epoch": 0.5699150607361403, "grad_norm": 5.442046165466309, "learning_rate": 9.76090261437526e-05, "loss": 2.2424, "step": 780 }, { "epoch": 0.5772216640789113, "grad_norm": 5.669281482696533, "learning_rate": 9.747734707326195e-05, "loss": 2.1442, "step": 790 }, { "epoch": 0.5845282674216823, "grad_norm": 6.425075531005859, "learning_rate": 9.734223222587792e-05, "loss": 2.3384, "step": 800 }, { "epoch": 0.5845282674216823, "eval_loss": 2.348578691482544, "eval_runtime": 108.5263, "eval_samples_per_second": 11.214, "eval_steps_per_second": 11.214, "step": 800 }, { "epoch": 0.5918348707644534, "grad_norm": 5.118699073791504, "learning_rate": 9.720369137941034e-05, "loss": 2.4253, "step": 810 }, { "epoch": 0.5991414741072244, "grad_norm": 6.51192569732666, "learning_rate": 9.706173455959715e-05, "loss": 2.5238, "step": 820 }, { "epoch": 0.6064480774499954, 
"grad_norm": 4.507262706756592, "learning_rate": 9.69163720393788e-05, "loss": 2.4986, "step": 830 }, { "epoch": 0.6137546807927665, "grad_norm": 5.540140151977539, "learning_rate": 9.676761433815498e-05, "loss": 2.2241, "step": 840 }, { "epoch": 0.6210612841355375, "grad_norm": 3.475051164627075, "learning_rate": 9.661547222102323e-05, "loss": 2.3395, "step": 850 }, { "epoch": 0.6283678874783085, "grad_norm": 5.291077136993408, "learning_rate": 9.645995669799995e-05, "loss": 2.2988, "step": 860 }, { "epoch": 0.6356744908210795, "grad_norm": 3.973806858062744, "learning_rate": 9.630107902322367e-05, "loss": 2.3554, "step": 870 }, { "epoch": 0.6429810941638506, "grad_norm": 4.875363349914551, "learning_rate": 9.613885069414061e-05, "loss": 2.3115, "step": 880 }, { "epoch": 0.6502876975066216, "grad_norm": 4.581294059753418, "learning_rate": 9.597328345067259e-05, "loss": 2.1619, "step": 890 }, { "epoch": 0.6575943008493926, "grad_norm": 5.785318374633789, "learning_rate": 9.580438927436756e-05, "loss": 2.3814, "step": 900 }, { "epoch": 0.6649009041921636, "grad_norm": 8.992704391479492, "learning_rate": 9.563218038753246e-05, "loss": 2.5302, "step": 910 }, { "epoch": 0.6722075075349347, "grad_norm": 8.763484954833984, "learning_rate": 9.545666925234873e-05, "loss": 2.5443, "step": 920 }, { "epoch": 0.6795141108777057, "grad_norm": 4.8247456550598145, "learning_rate": 9.52778685699705e-05, "loss": 2.269, "step": 930 }, { "epoch": 0.6868207142204767, "grad_norm": 4.4191131591796875, "learning_rate": 9.509579127960543e-05, "loss": 2.0768, "step": 940 }, { "epoch": 0.6941273175632477, "grad_norm": 3.7724087238311768, "learning_rate": 9.491045055757836e-05, "loss": 2.2629, "step": 950 }, { "epoch": 0.7014339209060189, "grad_norm": 4.011133193969727, "learning_rate": 9.472185981637775e-05, "loss": 2.3676, "step": 960 }, { "epoch": 0.7087405242487899, "grad_norm": 3.6976683139801025, "learning_rate": 9.45300327036851e-05, "loss": 2.3744, "step": 970 }, { "epoch": 
0.7160471275915609, "grad_norm": 4.968256950378418, "learning_rate": 9.433498310138728e-05, "loss": 2.1978, "step": 980 }, { "epoch": 0.7233537309343319, "grad_norm": 32.12665939331055, "learning_rate": 9.413672512457197e-05, "loss": 2.1205, "step": 990 }, { "epoch": 0.730660334277103, "grad_norm": 3.214895248413086, "learning_rate": 9.393527312050618e-05, "loss": 2.3232, "step": 1000 }, { "epoch": 0.730660334277103, "eval_loss": 2.3093771934509277, "eval_runtime": 109.2454, "eval_samples_per_second": 11.14, "eval_steps_per_second": 11.14, "step": 1000 }, { "epoch": 0.737966937619874, "grad_norm": 5.2641072273254395, "learning_rate": 9.373064166759803e-05, "loss": 2.2691, "step": 1010 }, { "epoch": 0.745273540962645, "grad_norm": 4.43721866607666, "learning_rate": 9.352284557434166e-05, "loss": 2.3682, "step": 1020 }, { "epoch": 0.752580144305416, "grad_norm": 3.769683599472046, "learning_rate": 9.331189987824569e-05, "loss": 2.2732, "step": 1030 }, { "epoch": 0.7598867476481871, "grad_norm": 5.2646002769470215, "learning_rate": 9.309781984474497e-05, "loss": 2.1909, "step": 1040 }, { "epoch": 0.7671933509909581, "grad_norm": 4.442707538604736, "learning_rate": 9.288062096609588e-05, "loss": 2.2229, "step": 1050 }, { "epoch": 0.7744999543337291, "grad_norm": 7.33398962020874, "learning_rate": 9.266031896025516e-05, "loss": 2.3366, "step": 1060 }, { "epoch": 0.7818065576765001, "grad_norm": 4.366443634033203, "learning_rate": 9.243692976974254e-05, "loss": 2.0555, "step": 1070 }, { "epoch": 0.7891131610192712, "grad_norm": 6.721280097961426, "learning_rate": 9.221046956048696e-05, "loss": 2.2303, "step": 1080 }, { "epoch": 0.7964197643620422, "grad_norm": 4.3517746925354, "learning_rate": 9.198095472065668e-05, "loss": 2.3249, "step": 1090 }, { "epoch": 0.8037263677048132, "grad_norm": 5.425954818725586, "learning_rate": 9.174840185947345e-05, "loss": 2.2346, "step": 1100 }, { "epoch": 0.8110329710475842, "grad_norm": 4.331856727600098, "learning_rate": 
9.151282780601039e-05, "loss": 2.4686, "step": 1110 }, { "epoch": 0.8183395743903553, "grad_norm": 7.6655707359313965, "learning_rate": 9.127424960797424e-05, "loss": 2.3503, "step": 1120 }, { "epoch": 0.8256461777331263, "grad_norm": 3.6036481857299805, "learning_rate": 9.103268453047165e-05, "loss": 2.3279, "step": 1130 }, { "epoch": 0.8329527810758973, "grad_norm": 7.087502479553223, "learning_rate": 9.078815005475974e-05, "loss": 2.4316, "step": 1140 }, { "epoch": 0.8402593844186683, "grad_norm": 3.372032880783081, "learning_rate": 9.054066387698104e-05, "loss": 2.3761, "step": 1150 }, { "epoch": 0.8475659877614394, "grad_norm": 5.309089660644531, "learning_rate": 9.02902439068829e-05, "loss": 2.4221, "step": 1160 }, { "epoch": 0.8548725911042104, "grad_norm": 4.797391414642334, "learning_rate": 9.003690826652143e-05, "loss": 2.2968, "step": 1170 }, { "epoch": 0.8621791944469814, "grad_norm": 3.347250461578369, "learning_rate": 8.978067528895003e-05, "loss": 2.2259, "step": 1180 }, { "epoch": 0.8694857977897524, "grad_norm": 2.894286870956421, "learning_rate": 8.95215635168927e-05, "loss": 2.3369, "step": 1190 }, { "epoch": 0.8767924011325235, "grad_norm": 5.936110019683838, "learning_rate": 8.925959170140218e-05, "loss": 2.2603, "step": 1200 }, { "epoch": 0.8767924011325235, "eval_loss": 2.2486681938171387, "eval_runtime": 109.2268, "eval_samples_per_second": 11.142, "eval_steps_per_second": 11.142, "step": 1200 }, { "epoch": 0.8840990044752945, "grad_norm": 4.270968437194824, "learning_rate": 8.899477880050306e-05, "loss": 2.3473, "step": 1210 }, { "epoch": 0.8914056078180655, "grad_norm": 5.154277324676514, "learning_rate": 8.872714397781965e-05, "loss": 2.3085, "step": 1220 }, { "epoch": 0.8987122111608367, "grad_norm": 4.050198078155518, "learning_rate": 8.84567066011894e-05, "loss": 2.2946, "step": 1230 }, { "epoch": 0.9060188145036077, "grad_norm": 11.016803741455078, "learning_rate": 8.818348624126122e-05, "loss": 2.1233, "step": 1240 }, { "epoch": 
0.9133254178463787, "grad_norm": 4.9638447761535645, "learning_rate": 8.790750267007918e-05, "loss": 2.1703, "step": 1250 }, { "epoch": 0.9206320211891497, "grad_norm": 5.055572032928467, "learning_rate": 8.762877585965172e-05, "loss": 2.3752, "step": 1260 }, { "epoch": 0.9279386245319208, "grad_norm": 5.373875617980957, "learning_rate": 8.734732598050637e-05, "loss": 2.2983, "step": 1270 }, { "epoch": 0.9352452278746918, "grad_norm": 4.6786723136901855, "learning_rate": 8.706317340022997e-05, "loss": 2.2051, "step": 1280 }, { "epoch": 0.9425518312174628, "grad_norm": 3.9016830921173096, "learning_rate": 8.677633868199487e-05, "loss": 2.1745, "step": 1290 }, { "epoch": 0.9498584345602338, "grad_norm": 3.027470350265503, "learning_rate": 8.648684258307076e-05, "loss": 2.2724, "step": 1300 }, { "epoch": 0.9571650379030049, "grad_norm": 4.489301681518555, "learning_rate": 8.619470605332253e-05, "loss": 2.2487, "step": 1310 }, { "epoch": 0.9644716412457759, "grad_norm": 4.5948686599731445, "learning_rate": 8.589995023369429e-05, "loss": 2.2639, "step": 1320 }, { "epoch": 0.9717782445885469, "grad_norm": 5.749096393585205, "learning_rate": 8.560259645467928e-05, "loss": 2.2842, "step": 1330 }, { "epoch": 0.9790848479313179, "grad_norm": 3.869464635848999, "learning_rate": 8.53026662347765e-05, "loss": 2.3613, "step": 1340 }, { "epoch": 0.986391451274089, "grad_norm": 6.761131763458252, "learning_rate": 8.500018127893329e-05, "loss": 2.1473, "step": 1350 }, { "epoch": 0.99369805461686, "grad_norm": 3.230255365371704, "learning_rate": 8.469516347697473e-05, "loss": 2.2302, "step": 1360 }, { "epoch": 1.001004657959631, "grad_norm": 4.553093433380127, "learning_rate": 8.438763490201946e-05, "loss": 2.1864, "step": 1370 }, { "epoch": 1.0083112613024021, "grad_norm": 4.498405456542969, "learning_rate": 8.407761780888244e-05, "loss": 2.209, "step": 1380 }, { "epoch": 1.015617864645173, "grad_norm": 5.025854110717773, "learning_rate": 8.37651346324643e-05, "loss": 2.2477, 
"step": 1390 }, { "epoch": 1.0229244679879441, "grad_norm": 4.791236400604248, "learning_rate": 8.345020798612791e-05, "loss": 2.1007, "step": 1400 }, { "epoch": 1.0229244679879441, "eval_loss": 2.2063653469085693, "eval_runtime": 108.5908, "eval_samples_per_second": 11.207, "eval_steps_per_second": 11.207, "step": 1400 }, { "epoch": 1.0302310713307152, "grad_norm": 4.35936164855957, "learning_rate": 8.313286066006187e-05, "loss": 2.1783, "step": 1410 }, { "epoch": 1.0375376746734861, "grad_norm": 4.893729209899902, "learning_rate": 8.28131156196313e-05, "loss": 2.0583, "step": 1420 }, { "epoch": 1.0448442780162572, "grad_norm": 9.001852989196777, "learning_rate": 8.249099600371591e-05, "loss": 2.0463, "step": 1430 }, { "epoch": 1.0521508813590281, "grad_norm": 4.704784393310547, "learning_rate": 8.216652512303543e-05, "loss": 2.255, "step": 1440 }, { "epoch": 1.0594574847017992, "grad_norm": 4.346590995788574, "learning_rate": 8.183972645846283e-05, "loss": 2.2898, "step": 1450 }, { "epoch": 1.0667640880445703, "grad_norm": 6.388704299926758, "learning_rate": 8.1510623659325e-05, "loss": 2.0351, "step": 1460 }, { "epoch": 1.0740706913873412, "grad_norm": 5.978973388671875, "learning_rate": 8.117924054169133e-05, "loss": 2.091, "step": 1470 }, { "epoch": 1.0813772947301123, "grad_norm": 3.6497621536254883, "learning_rate": 8.084560108665024e-05, "loss": 2.0603, "step": 1480 }, { "epoch": 1.0886838980728835, "grad_norm": 5.628828048706055, "learning_rate": 8.050972943857375e-05, "loss": 2.1513, "step": 1490 }, { "epoch": 1.0959905014156544, "grad_norm": 15.277828216552734, "learning_rate": 8.017164990337026e-05, "loss": 2.3273, "step": 1500 }, { "epoch": 1.1032971047584255, "grad_norm": 7.737220764160156, "learning_rate": 7.983138694672552e-05, "loss": 2.2664, "step": 1510 }, { "epoch": 1.1106037081011966, "grad_norm": 6.418485164642334, "learning_rate": 7.948896519233225e-05, "loss": 2.1581, "step": 1520 }, { "epoch": 1.1179103114439675, "grad_norm": 
3.803318738937378, "learning_rate": 7.914440942010807e-05, "loss": 2.2859, "step": 1530 }, { "epoch": 1.1252169147867386, "grad_norm": 3.083667278289795, "learning_rate": 7.879774456440243e-05, "loss": 2.1394, "step": 1540 }, { "epoch": 1.1325235181295095, "grad_norm": 13.434645652770996, "learning_rate": 7.844899571219202e-05, "loss": 2.0644, "step": 1550 }, { "epoch": 1.1398301214722806, "grad_norm": 5.204081058502197, "learning_rate": 7.809818810126545e-05, "loss": 2.055, "step": 1560 }, { "epoch": 1.1471367248150517, "grad_norm": 4.114535808563232, "learning_rate": 7.774534711839677e-05, "loss": 2.0104, "step": 1570 }, { "epoch": 1.1544433281578226, "grad_norm": 5.424691677093506, "learning_rate": 7.73904982975084e-05, "loss": 2.0453, "step": 1580 }, { "epoch": 1.1617499315005937, "grad_norm": 4.465676784515381, "learning_rate": 7.703366731782327e-05, "loss": 2.2777, "step": 1590 }, { "epoch": 1.1690565348433646, "grad_norm": 3.521860361099243, "learning_rate": 7.667488000200649e-05, "loss": 2.0635, "step": 1600 }, { "epoch": 1.1690565348433646, "eval_loss": 2.1665337085723877, "eval_runtime": 109.3334, "eval_samples_per_second": 11.131, "eval_steps_per_second": 11.131, "step": 1600 }, { "epoch": 1.1763631381861357, "grad_norm": 5.549830913543701, "learning_rate": 7.631416231429672e-05, "loss": 2.0994, "step": 1610 }, { "epoch": 1.1836697415289068, "grad_norm": 6.577401638031006, "learning_rate": 7.595154035862715e-05, "loss": 2.0379, "step": 1620 }, { "epoch": 1.1909763448716777, "grad_norm": 6.858834743499756, "learning_rate": 7.558704037673648e-05, "loss": 2.1925, "step": 1630 }, { "epoch": 1.1982829482144488, "grad_norm": 4.788407325744629, "learning_rate": 7.522068874626988e-05, "loss": 2.1162, "step": 1640 }, { "epoch": 1.20558955155722, "grad_norm": 3.5293450355529785, "learning_rate": 7.48525119788702e-05, "loss": 2.1279, "step": 1650 }, { "epoch": 1.2128961548999908, "grad_norm": 4.138194561004639, "learning_rate": 7.448253671825927e-05, "loss": 
2.1242, "step": 1660 }, { "epoch": 1.220202758242762, "grad_norm": 3.9674646854400635, "learning_rate": 7.411078973830987e-05, "loss": 2.1451, "step": 1670 }, { "epoch": 1.227509361585533, "grad_norm": 10.673015594482422, "learning_rate": 7.373729794110826e-05, "loss": 2.0227, "step": 1680 }, { "epoch": 1.234815964928304, "grad_norm": 5.9692511558532715, "learning_rate": 7.33620883550072e-05, "loss": 2.191, "step": 1690 }, { "epoch": 1.242122568271075, "grad_norm": 8.75203800201416, "learning_rate": 7.298518813267015e-05, "loss": 1.9689, "step": 1700 }, { "epoch": 1.249429171613846, "grad_norm": 6.571184158325195, "learning_rate": 7.260662454910621e-05, "loss": 2.0869, "step": 1710 }, { "epoch": 1.256735774956617, "grad_norm": 6.252017498016357, "learning_rate": 7.222642499969646e-05, "loss": 2.0596, "step": 1720 }, { "epoch": 1.2640423782993881, "grad_norm": 7.060279369354248, "learning_rate": 7.184461699821126e-05, "loss": 2.211, "step": 1730 }, { "epoch": 1.271348981642159, "grad_norm": 5.095629692077637, "learning_rate": 7.14612281748193e-05, "loss": 2.0639, "step": 1740 }, { "epoch": 1.2786555849849301, "grad_norm": 3.8237876892089844, "learning_rate": 7.107628627408813e-05, "loss": 2.0824, "step": 1750 }, { "epoch": 1.285962188327701, "grad_norm": 4.18397331237793, "learning_rate": 7.068981915297626e-05, "loss": 2.0253, "step": 1760 }, { "epoch": 1.2932687916704722, "grad_norm": 4.53864049911499, "learning_rate": 7.030185477881726e-05, "loss": 2.1168, "step": 1770 }, { "epoch": 1.3005753950132433, "grad_norm": 6.684192657470703, "learning_rate": 6.991242122729597e-05, "loss": 1.9231, "step": 1780 }, { "epoch": 1.3078819983560144, "grad_norm": 6.962904453277588, "learning_rate": 6.952154668041666e-05, "loss": 1.9549, "step": 1790 }, { "epoch": 1.3151886016987853, "grad_norm": 8.408102035522461, "learning_rate": 6.91292594244636e-05, "loss": 2.1212, "step": 1800 }, { "epoch": 1.3151886016987853, "eval_loss": 2.135313034057617, "eval_runtime": 108.3855, 
"eval_samples_per_second": 11.228, "eval_steps_per_second": 11.228, "step": 1800 }, { "epoch": 1.3224952050415564, "grad_norm": 4.266064167022705, "learning_rate": 6.873558784795412e-05, "loss": 1.9563, "step": 1810 }, { "epoch": 1.3298018083843273, "grad_norm": 5.321630001068115, "learning_rate": 6.834056043958419e-05, "loss": 2.3073, "step": 1820 }, { "epoch": 1.3371084117270984, "grad_norm": 5.667231559753418, "learning_rate": 6.794420578616679e-05, "loss": 2.0931, "step": 1830 }, { "epoch": 1.3444150150698695, "grad_norm": 5.461520671844482, "learning_rate": 6.754655257056322e-05, "loss": 2.0288, "step": 1840 }, { "epoch": 1.3517216184126404, "grad_norm": 3.88820743560791, "learning_rate": 6.71476295696073e-05, "loss": 2.0259, "step": 1850 }, { "epoch": 1.3590282217554115, "grad_norm": 5.630788803100586, "learning_rate": 6.674746565202309e-05, "loss": 1.9281, "step": 1860 }, { "epoch": 1.3663348250981824, "grad_norm": 3.890331745147705, "learning_rate": 6.634608977633555e-05, "loss": 2.1578, "step": 1870 }, { "epoch": 1.3736414284409535, "grad_norm": 3.679614782333374, "learning_rate": 6.594353098877503e-05, "loss": 2.1456, "step": 1880 }, { "epoch": 1.3809480317837246, "grad_norm": 5.798458099365234, "learning_rate": 6.553981842117526e-05, "loss": 2.0121, "step": 1890 }, { "epoch": 1.3882546351264955, "grad_norm": 4.401270389556885, "learning_rate": 6.513498128886515e-05, "loss": 2.0316, "step": 1900 }, { "epoch": 1.3955612384692666, "grad_norm": 6.3201823234558105, "learning_rate": 6.472904888855463e-05, "loss": 2.1175, "step": 1910 }, { "epoch": 1.4028678418120375, "grad_norm": 5.530188083648682, "learning_rate": 6.432205059621449e-05, "loss": 2.0955, "step": 1920 }, { "epoch": 1.4101744451548086, "grad_norm": 5.705748558044434, "learning_rate": 6.391401586495059e-05, "loss": 2.0269, "step": 1930 }, { "epoch": 1.4174810484975797, "grad_norm": 5.124378204345703, "learning_rate": 6.350497422287236e-05, "loss": 2.2386, "step": 1940 }, { "epoch": 
1.4247876518403508, "grad_norm": 7.351918697357178, "learning_rate": 6.309495527095606e-05, "loss": 2.1977, "step": 1950 }, { "epoch": 1.4320942551831217, "grad_norm": 5.076743125915527, "learning_rate": 6.268398868090255e-05, "loss": 2.117, "step": 1960 }, { "epoch": 1.4394008585258928, "grad_norm": 3.8619604110717773, "learning_rate": 6.227210419299014e-05, "loss": 1.9848, "step": 1970 }, { "epoch": 1.4467074618686637, "grad_norm": 3.990159034729004, "learning_rate": 6.185933161392228e-05, "loss": 2.0853, "step": 1980 }, { "epoch": 1.4540140652114348, "grad_norm": 6.388422966003418, "learning_rate": 6.144570081467066e-05, "loss": 2.0883, "step": 1990 }, { "epoch": 1.461320668554206, "grad_norm": 5.564253807067871, "learning_rate": 6.103124172831346e-05, "loss": 2.038, "step": 2000 }, { "epoch": 1.461320668554206, "eval_loss": 2.0880391597747803, "eval_runtime": 109.0271, "eval_samples_per_second": 11.162, "eval_steps_per_second": 11.162, "step": 2000 }, { "epoch": 1.4686272718969768, "grad_norm": 6.205707550048828, "learning_rate": 6.061598434786926e-05, "loss": 1.9301, "step": 2010 }, { "epoch": 1.475933875239748, "grad_norm": 7.6283111572265625, "learning_rate": 6.019995872412649e-05, "loss": 2.0155, "step": 2020 }, { "epoch": 1.4832404785825188, "grad_norm": 6.749846458435059, "learning_rate": 5.9783194963468784e-05, "loss": 1.9461, "step": 2030 }, { "epoch": 1.49054708192529, "grad_norm": 5.651824951171875, "learning_rate": 5.936572322569629e-05, "loss": 2.0335, "step": 2040 }, { "epoch": 1.497853685268061, "grad_norm": 11.750208854675293, "learning_rate": 5.894757372184309e-05, "loss": 2.0556, "step": 2050 }, { "epoch": 1.5051602886108322, "grad_norm": 5.419318675994873, "learning_rate": 5.852877671199091e-05, "loss": 1.9466, "step": 2060 }, { "epoch": 1.512466891953603, "grad_norm": 4.80405330657959, "learning_rate": 5.810936250307935e-05, "loss": 2.1046, "step": 2070 }, { "epoch": 1.519773495296374, "grad_norm": 11.513900756835938, "learning_rate": 
5.768936144671261e-05, "loss": 1.8582, "step": 2080 }, { "epoch": 1.527080098639145, "grad_norm": 5.375672817230225, "learning_rate": 5.7268803936963124e-05, "loss": 2.0872, "step": 2090 }, { "epoch": 1.5343867019819162, "grad_norm": 4.7279863357543945, "learning_rate": 5.6847720408171946e-05, "loss": 2.0174, "step": 2100 }, { "epoch": 1.5416933053246873, "grad_norm": 4.411227226257324, "learning_rate": 5.642614133274641e-05, "loss": 2.0424, "step": 2110 }, { "epoch": 1.5489999086674582, "grad_norm": 7.084948539733887, "learning_rate": 5.600409721895488e-05, "loss": 2.2553, "step": 2120 }, { "epoch": 1.5563065120102293, "grad_norm": 5.153254508972168, "learning_rate": 5.558161860871899e-05, "loss": 1.9618, "step": 2130 }, { "epoch": 1.5636131153530002, "grad_norm": 8.986722946166992, "learning_rate": 5.515873607540346e-05, "loss": 2.0533, "step": 2140 }, { "epoch": 1.5709197186957713, "grad_norm": 6.175150394439697, "learning_rate": 5.473548022160354e-05, "loss": 2.0438, "step": 2150 }, { "epoch": 1.5782263220385424, "grad_norm": 8.26689624786377, "learning_rate": 5.431188167693044e-05, "loss": 2.0011, "step": 2160 }, { "epoch": 1.5855329253813135, "grad_norm": 4.056985855102539, "learning_rate": 5.388797109579479e-05, "loss": 1.9823, "step": 2170 }, { "epoch": 1.5928395287240844, "grad_norm": 5.051248073577881, "learning_rate": 5.346377915518821e-05, "loss": 1.9149, "step": 2180 }, { "epoch": 1.6001461320668553, "grad_norm": 5.093779563903809, "learning_rate": 5.3039336552463414e-05, "loss": 2.0657, "step": 2190 }, { "epoch": 1.6074527354096264, "grad_norm": 9.080710411071777, "learning_rate": 5.261467400311266e-05, "loss": 1.9656, "step": 2200 }, { "epoch": 1.6074527354096264, "eval_loss": 2.061772108078003, "eval_runtime": 108.1317, "eval_samples_per_second": 11.255, "eval_steps_per_second": 11.255, "step": 2200 }, { "epoch": 1.6147593387523975, "grad_norm": 5.32938814163208, "learning_rate": 5.2189822238545017e-05, "loss": 1.9647, "step": 2210 }, { "epoch": 
1.6220659420951686, "grad_norm": 7.389804840087891, "learning_rate": 5.176481200386245e-05, "loss": 1.904, "step": 2220 }, { "epoch": 1.6293725454379395, "grad_norm": 4.834039688110352, "learning_rate": 5.1339674055634826e-05, "loss": 1.9067, "step": 2230 }, { "epoch": 1.6366791487807104, "grad_norm": 5.643000602722168, "learning_rate": 5.0914439159674244e-05, "loss": 2.2128, "step": 2240 }, { "epoch": 1.6439857521234815, "grad_norm": 4.886800765991211, "learning_rate": 5.048913808880861e-05, "loss": 1.9497, "step": 2250 }, { "epoch": 1.6512923554662526, "grad_norm": 12.440683364868164, "learning_rate": 5.006380162065465e-05, "loss": 1.9488, "step": 2260 }, { "epoch": 1.6585989588090237, "grad_norm": 7.266822814941406, "learning_rate": 4.963846053539071e-05, "loss": 2.1738, "step": 2270 }, { "epoch": 1.6659055621517946, "grad_norm": 9.061182975769043, "learning_rate": 4.9213145613529194e-05, "loss": 2.0644, "step": 2280 }, { "epoch": 1.6732121654945658, "grad_norm": 4.660227298736572, "learning_rate": 4.878788763368921e-05, "loss": 1.8194, "step": 2290 }, { "epoch": 1.6805187688373366, "grad_norm": 6.212920665740967, "learning_rate": 4.836271737036916e-05, "loss": 2.1629, "step": 2300 }, { "epoch": 1.6878253721801078, "grad_norm": 5.760697841644287, "learning_rate": 4.7937665591719664e-05, "loss": 1.9379, "step": 2310 }, { "epoch": 1.6951319755228789, "grad_norm": 6.021162033081055, "learning_rate": 4.7512763057317014e-05, "loss": 1.9756, "step": 2320 }, { "epoch": 1.70243857886565, "grad_norm": 4.001003265380859, "learning_rate": 4.70880405159372e-05, "loss": 1.9218, "step": 2330 }, { "epoch": 1.7097451822084209, "grad_norm": 6.9795002937316895, "learning_rate": 4.666352870333072e-05, "loss": 2.1045, "step": 2340 }, { "epoch": 1.7170517855511918, "grad_norm": 5.518786907196045, "learning_rate": 4.623925833999832e-05, "loss": 2.0515, "step": 2350 }, { "epoch": 1.7243583888939629, "grad_norm": 3.8339755535125732, "learning_rate": 4.5815260128967894e-05, "loss": 
1.9462, "step": 2360 }, { "epoch": 1.731664992236734, "grad_norm": 4.569764137268066, "learning_rate": 4.539156475357257e-05, "loss": 2.0918, "step": 2370 }, { "epoch": 1.738971595579505, "grad_norm": 4.515227317810059, "learning_rate": 4.496820287523027e-05, "loss": 2.1785, "step": 2380 }, { "epoch": 1.746278198922276, "grad_norm": 4.677797794342041, "learning_rate": 4.454520513122484e-05, "loss": 2.1057, "step": 2390 }, { "epoch": 1.7535848022650469, "grad_norm": 8.1099214553833, "learning_rate": 4.412260213248898e-05, "loss": 2.0285, "step": 2400 }, { "epoch": 1.7535848022650469, "eval_loss": 2.0392701625823975, "eval_runtime": 108.4121, "eval_samples_per_second": 11.226, "eval_steps_per_second": 11.226, "step": 2400 }, { "epoch": 1.760891405607818, "grad_norm": 5.997411727905273, "learning_rate": 4.370042446138897e-05, "loss": 2.0577, "step": 2410 }, { "epoch": 1.768198008950589, "grad_norm": 4.918084144592285, "learning_rate": 4.3278702669511506e-05, "loss": 1.9363, "step": 2420 }, { "epoch": 1.7755046122933602, "grad_norm": 5.5613932609558105, "learning_rate": 4.285746727545291e-05, "loss": 2.041, "step": 2430 }, { "epoch": 1.7828112156361313, "grad_norm": 11.325989723205566, "learning_rate": 4.2436748762610465e-05, "loss": 1.9016, "step": 2440 }, { "epoch": 1.7901178189789022, "grad_norm": 5.7209296226501465, "learning_rate": 4.201657757697651e-05, "loss": 2.0692, "step": 2450 }, { "epoch": 1.797424422321673, "grad_norm": 7.241997718811035, "learning_rate": 4.159698412493515e-05, "loss": 1.9591, "step": 2460 }, { "epoch": 1.8047310256644442, "grad_norm": 4.004190921783447, "learning_rate": 4.117799877106181e-05, "loss": 2.0189, "step": 2470 }, { "epoch": 1.8120376290072153, "grad_norm": 7.879773139953613, "learning_rate": 4.075965183592592e-05, "loss": 1.9538, "step": 2480 }, { "epoch": 1.8193442323499864, "grad_norm": 5.945032596588135, "learning_rate": 4.034197359389666e-05, "loss": 1.853, "step": 2490 }, { "epoch": 1.8266508356927573, "grad_norm": 
6.213232517242432, "learning_rate": 3.992499427095213e-05, "loss": 2.102, "step": 2500 }, { "epoch": 1.8339574390355282, "grad_norm": 4.097675323486328, "learning_rate": 3.950874404249199e-05, "loss": 2.1149, "step": 2510 }, { "epoch": 1.8412640423782993, "grad_norm": 3.8982291221618652, "learning_rate": 3.9093253031153755e-05, "loss": 1.961, "step": 2520 }, { "epoch": 1.8485706457210704, "grad_norm": 4.43918514251709, "learning_rate": 3.8678551304632965e-05, "loss": 1.8826, "step": 2530 }, { "epoch": 1.8558772490638415, "grad_norm": 6.397246360778809, "learning_rate": 3.8264668873507245e-05, "loss": 1.9244, "step": 2540 }, { "epoch": 1.8631838524066124, "grad_norm": 4.929690361022949, "learning_rate": 3.7851635689064546e-05, "loss": 1.8184, "step": 2550 }, { "epoch": 1.8704904557493836, "grad_norm": 4.7890801429748535, "learning_rate": 3.743948164113567e-05, "loss": 1.9089, "step": 2560 }, { "epoch": 1.8777970590921544, "grad_norm": 5.918452262878418, "learning_rate": 3.702823655593128e-05, "loss": 2.0868, "step": 2570 }, { "epoch": 1.8851036624349256, "grad_norm": 5.867265701293945, "learning_rate": 3.6617930193883384e-05, "loss": 2.0125, "step": 2580 }, { "epoch": 1.8924102657776967, "grad_norm": 4.536275863647461, "learning_rate": 3.62085922474918e-05, "loss": 1.9122, "step": 2590 }, { "epoch": 1.8997168691204678, "grad_norm": 5.288751602172852, "learning_rate": 3.580025233917529e-05, "loss": 1.9932, "step": 2600 }, { "epoch": 1.8997168691204678, "eval_loss": 2.0270345211029053, "eval_runtime": 111.0925, "eval_samples_per_second": 10.955, "eval_steps_per_second": 10.955, "step": 2600 }, { "epoch": 1.9070234724632387, "grad_norm": 5.767890453338623, "learning_rate": 3.5392940019127977e-05, "loss": 1.9772, "step": 2610 }, { "epoch": 1.9143300758060096, "grad_norm": 5.185894012451172, "learning_rate": 3.498668476318083e-05, "loss": 1.6921, "step": 2620 }, { "epoch": 1.9216366791487807, "grad_norm": 6.62992525100708, "learning_rate": 3.458151597066863e-05, "loss": 
1.9544, "step": 2630 }, { "epoch": 1.9289432824915518, "grad_norm": 11.95433235168457, "learning_rate": 3.417746296230244e-05, "loss": 2.0189, "step": 2640 }, { "epoch": 1.936249885834323, "grad_norm": 4.77971076965332, "learning_rate": 3.3774554978047756e-05, "loss": 1.9203, "step": 2650 }, { "epoch": 1.9435564891770938, "grad_norm": 7.271909236907959, "learning_rate": 3.337282117500847e-05, "loss": 2.0945, "step": 2660 }, { "epoch": 1.9508630925198647, "grad_norm": 8.73035717010498, "learning_rate": 3.297229062531696e-05, "loss": 2.0409, "step": 2670 }, { "epoch": 1.9581696958626358, "grad_norm": 9.147926330566406, "learning_rate": 3.257299231403014e-05, "loss": 1.9463, "step": 2680 }, { "epoch": 1.965476299205407, "grad_norm": 29.067060470581055, "learning_rate": 3.217495513703198e-05, "loss": 2.0646, "step": 2690 }, { "epoch": 1.972782902548178, "grad_norm": 17.448034286499023, "learning_rate": 3.177820789894234e-05, "loss": 1.8981, "step": 2700 }, { "epoch": 1.980089505890949, "grad_norm": 5.893951892852783, "learning_rate": 3.138277931103254e-05, "loss": 1.7993, "step": 2710 }, { "epoch": 1.98739610923372, "grad_norm": 8.20900821685791, "learning_rate": 3.09886979891476e-05, "loss": 2.1449, "step": 2720 }, { "epoch": 1.994702712576491, "grad_norm": 5.447132587432861, "learning_rate": 3.059599245163538e-05, "loss": 1.8473, "step": 2730 }, { "epoch": 2.002009315919262, "grad_norm": 6.67828893661499, "learning_rate": 3.0204691117282856e-05, "loss": 1.9341, "step": 2740 }, { "epoch": 2.009315919262033, "grad_norm": 7.406982898712158, "learning_rate": 2.981482230325946e-05, "loss": 1.9778, "step": 2750 }, { "epoch": 2.0166225226048042, "grad_norm": 4.272141456604004, "learning_rate": 2.9426414223067978e-05, "loss": 1.8293, "step": 2760 }, { "epoch": 2.023929125947575, "grad_norm": 5.6061320304870605, "learning_rate": 2.9039494984502734e-05, "loss": 1.7844, "step": 2770 }, { "epoch": 2.031235729290346, "grad_norm": 6.46597957611084, "learning_rate": 
2.865409258761557e-05, "loss": 1.8147, "step": 2780 }, { "epoch": 2.038542332633117, "grad_norm": 6.094344139099121, "learning_rate": 2.8270234922689597e-05, "loss": 1.7082, "step": 2790 }, { "epoch": 2.0458489359758882, "grad_norm": 5.04010534286499, "learning_rate": 2.788794976822077e-05, "loss": 1.6477, "step": 2800 }, { "epoch": 2.0458489359758882, "eval_loss": 2.0164308547973633, "eval_runtime": 111.1374, "eval_samples_per_second": 10.95, "eval_steps_per_second": 10.95, "step": 2800 }, { "epoch": 2.0531555393186594, "grad_norm": 13.458788871765137, "learning_rate": 2.7507264788907783e-05, "loss": 1.7778, "step": 2810 }, { "epoch": 2.0604621426614305, "grad_norm": 6.946293830871582, "learning_rate": 2.712820753364998e-05, "loss": 1.6629, "step": 2820 }, { "epoch": 2.067768746004201, "grad_norm": 8.358539581298828, "learning_rate": 2.6750805433553728e-05, "loss": 2.0157, "step": 2830 }, { "epoch": 2.0750753493469722, "grad_norm": 6.011590480804443, "learning_rate": 2.637508579994741e-05, "loss": 1.9037, "step": 2840 }, { "epoch": 2.0823819526897434, "grad_norm": 5.1315226554870605, "learning_rate": 2.6001075822404864e-05, "loss": 1.8682, "step": 2850 }, { "epoch": 2.0896885560325145, "grad_norm": 5.693300247192383, "learning_rate": 2.5628802566777904e-05, "loss": 1.8367, "step": 2860 }, { "epoch": 2.0969951593752856, "grad_norm": 5.308597087860107, "learning_rate": 2.5258292973237536e-05, "loss": 1.9468, "step": 2870 }, { "epoch": 2.1043017627180562, "grad_norm": 6.057744979858398, "learning_rate": 2.4889573854324443e-05, "loss": 1.7683, "step": 2880 }, { "epoch": 2.1116083660608274, "grad_norm": 5.609562397003174, "learning_rate": 2.452267189300864e-05, "loss": 1.8463, "step": 2890 }, { "epoch": 2.1189149694035985, "grad_norm": 4.729313850402832, "learning_rate": 2.415761364075857e-05, "loss": 1.9935, "step": 2900 }, { "epoch": 2.1262215727463696, "grad_norm": 4.285727500915527, "learning_rate": 2.3794425515619535e-05, "loss": 1.8125, "step": 2910 }, { "epoch": 
2.1335281760891407, "grad_norm": 5.974366664886475, "learning_rate": 2.343313380030207e-05, "loss": 1.8855, "step": 2920 }, { "epoch": 2.140834779431912, "grad_norm": 5.998205661773682, "learning_rate": 2.30737646402798e-05, "loss": 2.0122, "step": 2930 }, { "epoch": 2.1481413827746825, "grad_norm": 7.216914176940918, "learning_rate": 2.271634404189752e-05, "loss": 1.8831, "step": 2940 }, { "epoch": 2.1554479861174536, "grad_norm": 6.571489334106445, "learning_rate": 2.2360897870489055e-05, "loss": 1.6656, "step": 2950 }, { "epoch": 2.1627545894602247, "grad_norm": 5.754743576049805, "learning_rate": 2.2007451848505627e-05, "loss": 1.7651, "step": 2960 }, { "epoch": 2.170061192802996, "grad_norm": 4.261959552764893, "learning_rate": 2.1656031553654272e-05, "loss": 2.0409, "step": 2970 }, { "epoch": 2.177367796145767, "grad_norm": 4.679163932800293, "learning_rate": 2.1306662417046968e-05, "loss": 1.7555, "step": 2980 }, { "epoch": 2.1846743994885376, "grad_norm": 5.427217483520508, "learning_rate": 2.0959369721360183e-05, "loss": 2.0074, "step": 2990 }, { "epoch": 2.1919810028313087, "grad_norm": 8.53897476196289, "learning_rate": 2.0614178599005356e-05, "loss": 1.5985, "step": 3000 }, { "epoch": 2.1919810028313087, "eval_loss": 2.0024373531341553, "eval_runtime": 110.7503, "eval_samples_per_second": 10.989, "eval_steps_per_second": 10.989, "step": 3000 }, { "epoch": 2.19928760617408, "grad_norm": 8.221793174743652, "learning_rate": 2.0271114030310035e-05, "loss": 1.8105, "step": 3010 }, { "epoch": 2.206594209516851, "grad_norm": 5.500391006469727, "learning_rate": 1.9930200841710193e-05, "loss": 1.844, "step": 3020 }, { "epoch": 2.213900812859622, "grad_norm": 8.051352500915527, "learning_rate": 1.9591463703953672e-05, "loss": 1.6881, "step": 3030 }, { "epoch": 2.221207416202393, "grad_norm": 4.4952898025512695, "learning_rate": 1.9254927130314726e-05, "loss": 1.8373, "step": 3040 }, { "epoch": 2.228514019545164, "grad_norm": 7.687475204467773, "learning_rate": 
1.8920615474820152e-05, "loss": 1.9104, "step": 3050 }, { "epoch": 2.235820622887935, "grad_norm": 5.942209243774414, "learning_rate": 1.8588552930486915e-05, "loss": 1.9455, "step": 3060 }, { "epoch": 2.243127226230706, "grad_norm": 4.695751667022705, "learning_rate": 1.8258763527571243e-05, "loss": 1.7007, "step": 3070 }, { "epoch": 2.250433829573477, "grad_norm": 5.661278247833252, "learning_rate": 1.7931271131829758e-05, "loss": 1.7467, "step": 3080 }, { "epoch": 2.257740432916248, "grad_norm": 7.149540901184082, "learning_rate": 1.7606099442792373e-05, "loss": 1.6398, "step": 3090 }, { "epoch": 2.265047036259019, "grad_norm": 7.075028419494629, "learning_rate": 1.728327199204716e-05, "loss": 1.7911, "step": 3100 }, { "epoch": 2.27235363960179, "grad_norm": 5.169941425323486, "learning_rate": 1.696281214153757e-05, "loss": 1.9487, "step": 3110 }, { "epoch": 2.279660242944561, "grad_norm": 8.544054985046387, "learning_rate": 1.664474308187167e-05, "loss": 1.9077, "step": 3120 }, { "epoch": 2.2869668462873323, "grad_norm": 5.160293102264404, "learning_rate": 1.6329087830644053e-05, "loss": 1.7878, "step": 3130 }, { "epoch": 2.2942734496301034, "grad_norm": 7.8444342613220215, "learning_rate": 1.6015869230769992e-05, "loss": 1.6692, "step": 3140 }, { "epoch": 2.301580052972874, "grad_norm": 5.798057556152344, "learning_rate": 1.5705109948832526e-05, "loss": 1.8317, "step": 3150 }, { "epoch": 2.308886656315645, "grad_norm": 7.343387126922607, "learning_rate": 1.5396832473442e-05, "loss": 1.9332, "step": 3160 }, { "epoch": 2.3161932596584163, "grad_norm": 5.731598377227783, "learning_rate": 1.5091059113608785e-05, "loss": 1.6953, "step": 3170 }, { "epoch": 2.3234998630011874, "grad_norm": 7.075622081756592, "learning_rate": 1.4787811997128737e-05, "loss": 1.8049, "step": 3180 }, { "epoch": 2.3308064663439585, "grad_norm": 9.426651000976562, "learning_rate": 1.4487113068981934e-05, "loss": 2.002, "step": 3190 }, { "epoch": 2.338113069686729, "grad_norm": 
6.5062360763549805, "learning_rate": 1.418898408974456e-05, "loss": 1.8708, "step": 3200 }, { "epoch": 2.338113069686729, "eval_loss": 1.9918181896209717, "eval_runtime": 110.5621, "eval_samples_per_second": 11.007, "eval_steps_per_second": 11.007, "step": 3200 }, { "epoch": 2.3454196730295003, "grad_norm": 5.592414379119873, "learning_rate": 1.3893446634014257e-05, "loss": 1.8714, "step": 3210 }, { "epoch": 2.3527262763722714, "grad_norm": 5.582223892211914, "learning_rate": 1.3600522088848689e-05, "loss": 1.7765, "step": 3220 }, { "epoch": 2.3600328797150425, "grad_norm": 7.002323150634766, "learning_rate": 1.3310231652217997e-05, "loss": 1.9461, "step": 3230 }, { "epoch": 2.3673394830578136, "grad_norm": 12.038192749023438, "learning_rate": 1.3022596331470632e-05, "loss": 1.8854, "step": 3240 }, { "epoch": 2.3746460864005847, "grad_norm": 7.399435520172119, "learning_rate": 1.2737636941813196e-05, "loss": 1.9169, "step": 3250 }, { "epoch": 2.3819526897433554, "grad_norm": 8.563164710998535, "learning_rate": 1.245537410480414e-05, "loss": 1.9562, "step": 3260 }, { "epoch": 2.3892592930861265, "grad_norm": 5.194084167480469, "learning_rate": 1.2175828246861359e-05, "loss": 1.7997, "step": 3270 }, { "epoch": 2.3965658964288976, "grad_norm": 6.553465366363525, "learning_rate": 1.1899019597784117e-05, "loss": 1.8751, "step": 3280 }, { "epoch": 2.4038724997716687, "grad_norm": 5.508111476898193, "learning_rate": 1.1624968189288965e-05, "loss": 1.7808, "step": 3290 }, { "epoch": 2.41117910311444, "grad_norm": 6.692938327789307, "learning_rate": 1.1353693853560216e-05, "loss": 1.6511, "step": 3300 }, { "epoch": 2.4184857064572105, "grad_norm": 6.976585388183594, "learning_rate": 1.1085216221814665e-05, "loss": 1.8939, "step": 3310 }, { "epoch": 2.4257923097999816, "grad_norm": 7.673839569091797, "learning_rate": 1.0819554722881048e-05, "loss": 2.0006, "step": 3320 }, { "epoch": 2.4330989131427527, "grad_norm": 6.984090805053711, "learning_rate": 1.055672858179393e-05, 
"loss": 1.7107, "step": 3330 }, { "epoch": 2.440405516485524, "grad_norm": 6.236724853515625, "learning_rate": 1.0296756818402531e-05, "loss": 1.6792, "step": 3340 }, { "epoch": 2.447712119828295, "grad_norm": 7.661273002624512, "learning_rate": 1.0039658245994277e-05, "loss": 1.818, "step": 3350 }, { "epoch": 2.455018723171066, "grad_norm": 8.403165817260742, "learning_rate": 9.78545146993342e-06, "loss": 1.9927, "step": 3360 }, { "epoch": 2.4623253265138367, "grad_norm": 5.08246374130249, "learning_rate": 9.534154886314517e-06, "loss": 1.7398, "step": 3370 }, { "epoch": 2.469631929856608, "grad_norm": 7.482173919677734, "learning_rate": 9.28578668063127e-06, "loss": 1.9105, "step": 3380 }, { "epoch": 2.476938533199379, "grad_norm": 4.506008148193359, "learning_rate": 9.040364826460423e-06, "loss": 1.9258, "step": 3390 }, { "epoch": 2.48424513654215, "grad_norm": 6.290830612182617, "learning_rate": 8.797907084161155e-06, "loss": 1.8533, "step": 3400 }, { "epoch": 2.48424513654215, "eval_loss": 1.9872702360153198, "eval_runtime": 110.4675, "eval_samples_per_second": 11.017, "eval_steps_per_second": 11.017, "step": 3400 }, { "epoch": 2.491551739884921, "grad_norm": 5.8204522132873535, "learning_rate": 8.558430999589723e-06, "loss": 1.9186, "step": 3410 }, { "epoch": 2.498858343227692, "grad_norm": 7.549562931060791, "learning_rate": 8.321953902829842e-06, "loss": 1.6548, "step": 3420 }, { "epoch": 2.506164946570463, "grad_norm": 5.260471820831299, "learning_rate": 8.08849290693846e-06, "loss": 1.877, "step": 3430 }, { "epoch": 2.513471549913234, "grad_norm": 7.633718967437744, "learning_rate": 7.85806490670739e-06, "loss": 1.7109, "step": 3440 }, { "epoch": 2.520778153256005, "grad_norm": 6.902381420135498, "learning_rate": 7.630686577440722e-06, "loss": 1.8875, "step": 3450 }, { "epoch": 2.5280847565987763, "grad_norm": 8.275242805480957, "learning_rate": 7.406374373748004e-06, "loss": 1.8563, "step": 3460 }, { "epoch": 2.5353913599415474, "grad_norm": 
5.76878023147583, "learning_rate": 7.185144528353583e-06, "loss": 1.7962, "step": 3470 }, { "epoch": 2.542697963284318, "grad_norm": 4.979648590087891, "learning_rate": 6.967013050921795e-06, "loss": 1.9158, "step": 3480 }, { "epoch": 2.550004566627089, "grad_norm": 9.704955101013184, "learning_rate": 6.751995726898464e-06, "loss": 2.0237, "step": 3490 }, { "epoch": 2.5573111699698603, "grad_norm": 9.094025611877441, "learning_rate": 6.540108116368515e-06, "loss": 1.8791, "step": 3500 }, { "epoch": 2.5646177733126314, "grad_norm": 6.252310752868652, "learning_rate": 6.33136555293003e-06, "loss": 1.8576, "step": 3510 }, { "epoch": 2.571924376655402, "grad_norm": 7.935026168823242, "learning_rate": 6.125783142584479e-06, "loss": 1.9725, "step": 3520 }, { "epoch": 2.579230979998173, "grad_norm": 6.289064884185791, "learning_rate": 5.923375762643668e-06, "loss": 1.7741, "step": 3530 }, { "epoch": 2.5865375833409443, "grad_norm": 6.07642126083374, "learning_rate": 5.724158060653029e-06, "loss": 1.8716, "step": 3540 }, { "epoch": 2.5938441866837154, "grad_norm": 7.241240501403809, "learning_rate": 5.528144453331696e-06, "loss": 1.7577, "step": 3550 }, { "epoch": 2.6011507900264865, "grad_norm": 5.838751316070557, "learning_rate": 5.335349125529154e-06, "loss": 1.8637, "step": 3560 }, { "epoch": 2.6084573933692576, "grad_norm": 7.343444347381592, "learning_rate": 5.14578602919879e-06, "loss": 1.8605, "step": 3570 }, { "epoch": 2.6157639967120287, "grad_norm": 5.384237289428711, "learning_rate": 4.959468882388163e-06, "loss": 1.739, "step": 3580 }, { "epoch": 2.6230706000547994, "grad_norm": 8.187921524047852, "learning_rate": 4.776411168246353e-06, "loss": 1.8639, "step": 3590 }, { "epoch": 2.6303772033975705, "grad_norm": 5.8128838539123535, "learning_rate": 4.596626134048176e-06, "loss": 1.7357, "step": 3600 }, { "epoch": 2.6303772033975705, "eval_loss": 1.9813354015350342, "eval_runtime": 104.4364, "eval_samples_per_second": 11.653, "eval_steps_per_second": 11.653, 
"step": 3600 }, { "epoch": 2.6376838067403416, "grad_norm": 5.993869304656982, "learning_rate": 4.420126790235552e-06, "loss": 1.9172, "step": 3610 }, { "epoch": 2.6449904100831128, "grad_norm": 6.651885509490967, "learning_rate": 4.246925909475957e-06, "loss": 1.7701, "step": 3620 }, { "epoch": 2.6522970134258834, "grad_norm": 5.599200248718262, "learning_rate": 4.077036025738118e-06, "loss": 1.9576, "step": 3630 }, { "epoch": 2.6596036167686545, "grad_norm": 5.870293140411377, "learning_rate": 3.910469433385017e-06, "loss": 1.7158, "step": 3640 }, { "epoch": 2.6669102201114256, "grad_norm": 7.64784574508667, "learning_rate": 3.7472381862840967e-06, "loss": 1.8767, "step": 3650 }, { "epoch": 2.6742168234541968, "grad_norm": 5.308115005493164, "learning_rate": 3.5873540969350415e-06, "loss": 1.9414, "step": 3660 }, { "epoch": 2.681523426796968, "grad_norm": 6.556251525878906, "learning_rate": 3.430828735614916e-06, "loss": 1.8932, "step": 3670 }, { "epoch": 2.688830030139739, "grad_norm": 5.682919025421143, "learning_rate": 3.277673429540862e-06, "loss": 1.8576, "step": 3680 }, { "epoch": 2.69613663348251, "grad_norm": 5.832954406738281, "learning_rate": 3.1278992620503877e-06, "loss": 1.6419, "step": 3690 }, { "epoch": 2.7034432368252808, "grad_norm": 5.423207759857178, "learning_rate": 2.9815170717993115e-06, "loss": 1.7736, "step": 3700 }, { "epoch": 2.710749840168052, "grad_norm": 5.549076080322266, "learning_rate": 2.83853745197738e-06, "loss": 1.9699, "step": 3710 }, { "epoch": 2.718056443510823, "grad_norm": 5.261001110076904, "learning_rate": 2.6989707495417292e-06, "loss": 1.5671, "step": 3720 }, { "epoch": 2.725363046853594, "grad_norm": 9.049291610717773, "learning_rate": 2.5628270644680265e-06, "loss": 1.9909, "step": 3730 }, { "epoch": 2.7326696501963648, "grad_norm": 5.768558979034424, "learning_rate": 2.430116249019665e-06, "loss": 1.8207, "step": 3740 }, { "epoch": 2.739976253539136, "grad_norm": 7.210824012756348, "learning_rate": 
2.3008479070346867e-06, "loss": 1.8219, "step": 3750 }, { "epoch": 2.747282856881907, "grad_norm": 5.107162952423096, "learning_rate": 2.1750313932308806e-06, "loss": 1.7551, "step": 3760 }, { "epoch": 2.754589460224678, "grad_norm": 6.917966365814209, "learning_rate": 2.0526758125287427e-06, "loss": 1.6674, "step": 3770 }, { "epoch": 2.761896063567449, "grad_norm": 5.870244026184082, "learning_rate": 1.933790019392634e-06, "loss": 1.8, "step": 3780 }, { "epoch": 2.7692026669102203, "grad_norm": 5.093700408935547, "learning_rate": 1.8183826171899677e-06, "loss": 1.7592, "step": 3790 }, { "epoch": 2.776509270252991, "grad_norm": 9.029096603393555, "learning_rate": 1.7064619575686336e-06, "loss": 1.8041, "step": 3800 }, { "epoch": 2.776509270252991, "eval_loss": 1.9782235622406006, "eval_runtime": 105.0311, "eval_samples_per_second": 11.587, "eval_steps_per_second": 11.587, "step": 3800 }, { "epoch": 2.783815873595762, "grad_norm": 5.48701810836792, "learning_rate": 1.5980361398526267e-06, "loss": 2.1588, "step": 3810 }, { "epoch": 2.791122476938533, "grad_norm": 8.91283893585205, "learning_rate": 1.4931130104559154e-06, "loss": 1.8274, "step": 3820 }, { "epoch": 2.7984290802813043, "grad_norm": 7.846653938293457, "learning_rate": 1.3917001623146186e-06, "loss": 1.9029, "step": 3830 }, { "epoch": 2.805735683624075, "grad_norm": 5.458700656890869, "learning_rate": 1.2938049343375502e-06, "loss": 1.8208, "step": 3840 }, { "epoch": 2.813042286966846, "grad_norm": 6.338927745819092, "learning_rate": 1.1994344108750833e-06, "loss": 1.808, "step": 3850 }, { "epoch": 2.820348890309617, "grad_norm": 6.470076560974121, "learning_rate": 1.108595421206532e-06, "loss": 1.8503, "step": 3860 }, { "epoch": 2.8276554936523883, "grad_norm": 4.856273174285889, "learning_rate": 1.021294539045914e-06, "loss": 1.9037, "step": 3870 }, { "epoch": 2.8349620969951594, "grad_norm": 6.1774139404296875, "learning_rate": 9.375380820662194e-07, "loss": 1.8673, "step": 3880 }, { "epoch": 
2.8422687003379306, "grad_norm": 5.6335296630859375, "learning_rate": 8.57332111442255e-07, "loss": 1.6094, "step": 3890 }, { "epoch": 2.8495753036807017, "grad_norm": 6.399806022644043, "learning_rate": 7.806824314119832e-07, "loss": 1.8876, "step": 3900 }, { "epoch": 2.8568819070234723, "grad_norm": 8.844847679138184, "learning_rate": 7.075945888565194e-07, "loss": 1.9354, "step": 3910 }, { "epoch": 2.8641885103662434, "grad_norm": 7.803627014160156, "learning_rate": 6.380738728986924e-07, "loss": 1.7139, "step": 3920 }, { "epoch": 2.8714951137090146, "grad_norm": 8.815073013305664, "learning_rate": 5.721253145203165e-07, "loss": 1.8449, "step": 3930 }, { "epoch": 2.8788017170517857, "grad_norm": 5.23136568069458, "learning_rate": 5.097536861981e-07, "loss": 1.774, "step": 3940 }, { "epoch": 2.8861083203945563, "grad_norm": 5.498495101928711, "learning_rate": 4.5096350155827693e-07, "loss": 1.7747, "step": 3950 }, { "epoch": 2.8934149237373274, "grad_norm": 6.538133144378662, "learning_rate": 3.957590150499735e-07, "loss": 1.8453, "step": 3960 }, { "epoch": 2.9007215270800986, "grad_norm": 6.259313106536865, "learning_rate": 3.441442216373436e-07, "loss": 1.9312, "step": 3970 }, { "epoch": 2.9080281304228697, "grad_norm": 5.1868486404418945, "learning_rate": 2.9612285651042795e-07, "loss": 1.8508, "step": 3980 }, { "epoch": 2.915334733765641, "grad_norm": 6.202200412750244, "learning_rate": 2.5169839481489764e-07, "loss": 1.8832, "step": 3990 }, { "epoch": 2.922641337108412, "grad_norm": 5.868015289306641, "learning_rate": 2.1087405140053362e-07, "loss": 1.7998, "step": 4000 }, { "epoch": 2.922641337108412, "eval_loss": 1.9779986143112183, "eval_runtime": 103.4275, "eval_samples_per_second": 11.767, "eval_steps_per_second": 11.767, "step": 4000 }, { "epoch": 2.929947940451183, "grad_norm": 9.322286605834961, "learning_rate": 1.736527805885957e-07, "loss": 1.8784, "step": 4010 }, { "epoch": 2.9372545437939537, "grad_norm": 5.733661651611328, "learning_rate": 
1.4003727595802152e-07, "loss": 1.9243, "step": 4020 }, { "epoch": 2.944561147136725, "grad_norm": 6.326545238494873, "learning_rate": 1.1002997015050476e-07, "loss": 1.8568, "step": 4030 }, { "epoch": 2.951867750479496, "grad_norm": 5.118636608123779, "learning_rate": 8.363303469445805e-08, "loss": 1.7683, "step": 4040 }, { "epoch": 2.959174353822267, "grad_norm": 5.2632317543029785, "learning_rate": 6.084837984786096e-08, "loss": 1.9612, "step": 4050 }, { "epoch": 2.9664809571650377, "grad_norm": 5.4591898918151855, "learning_rate": 4.167765446000393e-08, "loss": 1.8417, "step": 4060 }, { "epoch": 2.973787560507809, "grad_norm": 6.642773628234863, "learning_rate": 2.6122245852205906e-08, "loss": 1.9892, "step": 4070 }, { "epoch": 2.98109416385058, "grad_norm": 5.449717998504639, "learning_rate": 1.4183279717389087e-08, "loss": 1.8122, "step": 4080 }, { "epoch": 2.988400767193351, "grad_norm": 5.309086322784424, "learning_rate": 5.861620038610794e-09, "loss": 1.7895, "step": 4090 }, { "epoch": 2.995707370536122, "grad_norm": 6.38864803314209, "learning_rate": 1.157869026574554e-09, "loss": 1.7208, "step": 4100 }, { "epoch": 2.9986300118732303, "step": 4104, "total_flos": 1.3798092749438976e+17, "train_loss": 2.123500373163651, "train_runtime": 11771.2816, "train_samples_per_second": 2.79, "train_steps_per_second": 0.349 } ], "logging_steps": 10, "max_steps": 4104, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.3798092749438976e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }