{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4140012070006036, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 53.90464782714844, "learning_rate": 9.946323134728933e-06, "loss": 5.2914, "step": 10 }, { "epoch": 0.03, "grad_norm": 53.05311965942383, "learning_rate": 9.892646269457864e-06, "loss": 4.9539, "step": 20 }, { "epoch": 0.05, "grad_norm": 64.5956802368164, "learning_rate": 9.838969404186796e-06, "loss": 4.7374, "step": 30 }, { "epoch": 0.06, "grad_norm": 54.43781280517578, "learning_rate": 9.785292538915728e-06, "loss": 4.5988, "step": 40 }, { "epoch": 0.08, "grad_norm": 48.784854888916016, "learning_rate": 9.73161567364466e-06, "loss": 4.4899, "step": 50 }, { "epoch": 0.1, "grad_norm": 56.800575256347656, "learning_rate": 9.677938808373591e-06, "loss": 4.4391, "step": 60 }, { "epoch": 0.11, "grad_norm": 51.58082962036133, "learning_rate": 9.624261943102525e-06, "loss": 4.3347, "step": 70 }, { "epoch": 0.13, "grad_norm": 50.625308990478516, "learning_rate": 9.570585077831455e-06, "loss": 4.3145, "step": 80 }, { "epoch": 0.14, "grad_norm": 46.56022262573242, "learning_rate": 9.516908212560388e-06, "loss": 4.1742, "step": 90 }, { "epoch": 0.16, "grad_norm": 51.267852783203125, "learning_rate": 9.463231347289318e-06, "loss": 4.1308, "step": 100 }, { "epoch": 0.18, "grad_norm": 52.627098083496094, "learning_rate": 9.40955448201825e-06, "loss": 4.1251, "step": 110 }, { "epoch": 0.19, "grad_norm": 51.49211120605469, "learning_rate": 9.355877616747183e-06, "loss": 4.073, "step": 120 }, { "epoch": 0.21, "grad_norm": 50.88691329956055, "learning_rate": 9.302200751476115e-06, "loss": 3.9998, "step": 130 }, { "epoch": 0.23, "grad_norm": 45.862796783447266, "learning_rate": 9.248523886205046e-06, "loss": 3.8666, "step": 140 }, { "epoch": 0.24, "grad_norm": 51.27406692504883, "learning_rate": 9.194847020933978e-06, "loss": 3.9477, "step": 150 }, { "epoch": 0.26, "grad_norm": 47.50687026977539, "learning_rate": 9.14117015566291e-06, "loss": 3.7899, "step": 160 }, { "epoch": 0.27, "grad_norm": 48.58837127685547, "learning_rate": 9.087493290391842e-06, "loss": 3.7295, "step": 170 }, { "epoch": 0.29, "grad_norm": 48.3990592956543, "learning_rate": 9.033816425120775e-06, "loss": 3.7023, "step": 180 }, { "epoch": 0.31, "grad_norm": 46.30027770996094, "learning_rate": 8.980139559849705e-06, "loss": 3.6479, "step": 190 }, { "epoch": 0.32, "grad_norm": 47.15484619140625, "learning_rate": 8.926462694578637e-06, "loss": 3.6161, "step": 200 }, { "epoch": 0.34, "grad_norm": 49.22386932373047, "learning_rate": 8.87278582930757e-06, "loss": 3.5503, "step": 210 }, { "epoch": 0.35, "grad_norm": 45.485557556152344, "learning_rate": 8.819108964036502e-06, "loss": 3.4459, "step": 220 }, { "epoch": 0.37, "grad_norm": 50.84394454956055, "learning_rate": 8.765432098765432e-06, "loss": 3.4918, "step": 230 }, { "epoch": 0.39, "grad_norm": 43.17815399169922, "learning_rate": 8.711755233494365e-06, "loss": 3.3527, "step": 240 }, { "epoch": 0.4, "grad_norm": 41.90092849731445, "learning_rate": 8.658078368223295e-06, "loss": 3.347, "step": 250 }, { "epoch": 0.42, "grad_norm": 43.23625564575195, "learning_rate": 8.60440150295223e-06, "loss": 3.3433, "step": 260 }, { "epoch": 0.43, "grad_norm": 44.05680847167969, "learning_rate": 8.55072463768116e-06, "loss": 3.2831, "step": 270 }, { "epoch": 0.45, "grad_norm": 46.9146842956543, "learning_rate": 8.497047772410092e-06, "loss": 3.2932, "step": 280 }, { "epoch": 0.47, "grad_norm": 48.542701721191406, "learning_rate": 8.443370907139024e-06, "loss": 3.2108, "step": 290 }, { "epoch": 0.48, "grad_norm": 39.1197509765625, "learning_rate": 8.389694041867955e-06, "loss": 3.1704, "step": 300 }, { "epoch": 0.5, "grad_norm": 45.300697326660156, "learning_rate": 8.336017176596887e-06, "loss": 3.2239, "step": 310 }, { "epoch": 0.51, "grad_norm": 47.439823150634766, "learning_rate": 8.28234031132582e-06, "loss": 3.1709, "step": 320 }, { "epoch": 0.53, "grad_norm": 42.67869567871094, "learning_rate": 8.228663446054752e-06, "loss": 3.0708, "step": 330 }, { "epoch": 0.55, "grad_norm": 46.0591926574707, "learning_rate": 8.174986580783682e-06, "loss": 3.0959, "step": 340 }, { "epoch": 0.56, "grad_norm": 42.061805725097656, "learning_rate": 8.121309715512614e-06, "loss": 3.0442, "step": 350 }, { "epoch": 0.58, "grad_norm": 44.487159729003906, "learning_rate": 8.067632850241547e-06, "loss": 2.8881, "step": 360 }, { "epoch": 0.6, "grad_norm": 44.4419059753418, "learning_rate": 8.013955984970479e-06, "loss": 2.9994, "step": 370 }, { "epoch": 0.61, "grad_norm": 40.00767135620117, "learning_rate": 7.96027911969941e-06, "loss": 2.8931, "step": 380 }, { "epoch": 0.63, "grad_norm": 44.14014434814453, "learning_rate": 7.906602254428342e-06, "loss": 2.9288, "step": 390 }, { "epoch": 0.64, "grad_norm": 42.418888092041016, "learning_rate": 7.852925389157274e-06, "loss": 2.7776, "step": 400 }, { "epoch": 0.66, "grad_norm": 49.98628234863281, "learning_rate": 7.799248523886206e-06, "loss": 2.8408, "step": 410 }, { "epoch": 0.68, "grad_norm": 39.240352630615234, "learning_rate": 7.745571658615137e-06, "loss": 2.7995, "step": 420 }, { "epoch": 0.69, "grad_norm": 43.185569763183594, "learning_rate": 7.691894793344069e-06, "loss": 2.7486, "step": 430 }, { "epoch": 0.71, "grad_norm": 37.41790008544922, "learning_rate": 7.638217928073001e-06, "loss": 2.7459, "step": 440 }, { "epoch": 0.72, "grad_norm": 38.520973205566406, "learning_rate": 7.584541062801934e-06, "loss": 2.6785, "step": 450 }, { "epoch": 0.74, "grad_norm": 42.01523971557617, "learning_rate": 7.530864197530865e-06, "loss": 2.7745, "step": 460 }, { "epoch": 0.76, "grad_norm": 40.55296325683594, "learning_rate": 7.477187332259796e-06, "loss": 2.7046, "step": 470 }, { "epoch": 0.77, "grad_norm": 38.31270217895508, "learning_rate": 7.423510466988728e-06, "loss": 2.6057, "step": 480 }, { "epoch": 0.79, "grad_norm": 37.31733322143555, "learning_rate": 7.369833601717661e-06, "loss": 2.5229, "step": 490 }, { "epoch": 0.8, "grad_norm": 37.55893325805664, "learning_rate": 7.316156736446592e-06, "loss": 2.5846, "step": 500 }, { "epoch": 0.82, "grad_norm": 40.64212417602539, "learning_rate": 7.262479871175524e-06, "loss": 2.4992, "step": 510 }, { "epoch": 0.84, "grad_norm": 41.34495544433594, "learning_rate": 7.208803005904456e-06, "loss": 2.5575, "step": 520 }, { "epoch": 0.85, "grad_norm": 38.53882598876953, "learning_rate": 7.155126140633387e-06, "loss": 2.5468, "step": 530 }, { "epoch": 0.87, "grad_norm": 39.23023986816406, "learning_rate": 7.10144927536232e-06, "loss": 2.5009, "step": 540 }, { "epoch": 0.89, "grad_norm": 35.68672180175781, "learning_rate": 7.047772410091251e-06, "loss": 2.2929, "step": 550 }, { "epoch": 0.9, "grad_norm": 38.0786247253418, "learning_rate": 6.994095544820183e-06, "loss": 2.3178, "step": 560 }, { "epoch": 0.92, "grad_norm": 42.21394348144531, "learning_rate": 6.940418679549115e-06, "loss": 2.4645, "step": 570 }, { "epoch": 0.93, "grad_norm": 37.197696685791016, "learning_rate": 6.886741814278046e-06, "loss": 2.4797, "step": 580 }, { "epoch": 0.95, "grad_norm": 40.341514587402344, "learning_rate": 6.833064949006979e-06, "loss": 2.3048, "step": 590 }, { "epoch": 0.97, "grad_norm": 36.51962661743164, "learning_rate": 6.779388083735911e-06, "loss": 2.4652, "step": 600 }, { "epoch": 0.98, "grad_norm": 35.32337951660156, "learning_rate": 6.725711218464842e-06, "loss": 2.2979, "step": 610 }, { "epoch": 1.0, "grad_norm": 40.75404739379883, "learning_rate": 6.672034353193773e-06, "loss": 2.354, "step": 620 }, { "epoch": 1.01, "grad_norm": 30.498510360717773, "learning_rate": 6.6183574879227065e-06, "loss": 1.8592, "step": 630 }, { "epoch": 1.03, "grad_norm": 31.13385772705078, "learning_rate": 6.564680622651638e-06, "loss": 1.7776, "step": 640 }, { "epoch": 1.05, "grad_norm": 34.9401969909668, "learning_rate": 6.511003757380569e-06, "loss": 1.9329, "step": 650 }, { "epoch": 1.06, "grad_norm": 32.90480041503906, "learning_rate": 6.457326892109501e-06, "loss": 1.8312, "step": 660 }, { "epoch": 1.08, "grad_norm": 32.94902420043945, "learning_rate": 6.403650026838433e-06, "loss": 1.8556, "step": 670 }, { "epoch": 1.09, "grad_norm": 32.02881622314453, "learning_rate": 6.349973161567365e-06, "loss": 1.8142, "step": 680 }, { "epoch": 1.11, "grad_norm": 32.752323150634766, "learning_rate": 6.296296296296297e-06, "loss": 1.7429, "step": 690 }, { "epoch": 1.13, "grad_norm": 31.938289642333984, "learning_rate": 6.242619431025229e-06, "loss": 1.821, "step": 700 }, { "epoch": 1.14, "grad_norm": 32.64255142211914, "learning_rate": 6.18894256575416e-06, "loss": 1.7492, "step": 710 }, { "epoch": 1.16, "grad_norm": 30.172483444213867, "learning_rate": 6.135265700483092e-06, "loss": 1.7661, "step": 720 }, { "epoch": 1.17, "grad_norm": 32.1895637512207, "learning_rate": 6.081588835212025e-06, "loss": 1.6979, "step": 730 }, { "epoch": 1.19, "grad_norm": 32.555870056152344, "learning_rate": 6.027911969940956e-06, "loss": 1.778, "step": 740 }, { "epoch": 1.21, "grad_norm": 31.702539443969727, "learning_rate": 5.974235104669888e-06, "loss": 1.7263, "step": 750 }, { "epoch": 1.22, "grad_norm": 32.07310104370117, "learning_rate": 5.920558239398819e-06, "loss": 1.5491, "step": 760 }, { "epoch": 1.24, "grad_norm": 31.130224227905273, "learning_rate": 5.866881374127752e-06, "loss": 1.6249, "step": 770 }, { "epoch": 1.26, "grad_norm": 39.838436126708984, "learning_rate": 5.8132045088566835e-06, "loss": 1.6721, "step": 780 }, { "epoch": 1.27, "grad_norm": 33.75567626953125, "learning_rate": 5.759527643585615e-06, "loss": 1.6779, "step": 790 }, { "epoch": 1.29, "grad_norm": 31.251935958862305, "learning_rate": 5.705850778314546e-06, "loss": 1.6401, "step": 800 }, { "epoch": 1.3, "grad_norm": 31.644649505615234, "learning_rate": 5.652173913043479e-06, "loss": 1.6647, "step": 810 }, { "epoch": 1.32, "grad_norm": 30.14424705505371, "learning_rate": 5.598497047772411e-06, "loss": 1.6887, "step": 820 }, { "epoch": 1.34, "grad_norm": 29.70695686340332, "learning_rate": 5.544820182501342e-06, "loss": 1.5387, "step": 830 }, { "epoch": 1.35, "grad_norm": 31.330068588256836, "learning_rate": 5.4911433172302745e-06, "loss": 1.6414, "step": 840 }, { "epoch": 1.37, "grad_norm": 32.08658981323242, "learning_rate": 5.437466451959206e-06, "loss": 1.6158, "step": 850 }, { "epoch": 1.38, "grad_norm": 33.42084503173828, "learning_rate": 5.383789586688137e-06, "loss": 1.6728, "step": 860 }, { "epoch": 1.4, "grad_norm": 32.10792922973633, "learning_rate": 5.3301127214170704e-06, "loss": 1.5365, "step": 870 }, { "epoch": 1.42, "grad_norm": 34.231239318847656, "learning_rate": 5.276435856146002e-06, "loss": 1.584, "step": 880 }, { "epoch": 1.43, "grad_norm": 32.19587326049805, "learning_rate": 5.222758990874933e-06, "loss": 1.6233, "step": 890 }, { "epoch": 1.45, "grad_norm": 30.36279296875, "learning_rate": 5.169082125603865e-06, "loss": 1.5246, "step": 900 }, { "epoch": 1.46, "grad_norm": 33.34714889526367, "learning_rate": 5.115405260332798e-06, "loss": 1.5514, "step": 910 }, { "epoch": 1.48, "grad_norm": 32.581424713134766, "learning_rate": 5.061728395061729e-06, "loss": 1.495, "step": 920 }, { "epoch": 1.5, "grad_norm": 33.158203125, "learning_rate": 5.0080515297906606e-06, "loss": 1.5546, "step": 930 }, { "epoch": 1.51, "grad_norm": 29.796606063842773, "learning_rate": 4.954374664519592e-06, "loss": 1.5141, "step": 940 }, { "epoch": 1.53, "grad_norm": 31.936180114746094, "learning_rate": 4.900697799248524e-06, "loss": 1.5156, "step": 950 }, { "epoch": 1.54, "grad_norm": 30.770095825195312, "learning_rate": 4.847020933977456e-06, "loss": 1.5262, "step": 960 }, { "epoch": 1.56, "grad_norm": 32.497520446777344, "learning_rate": 4.793344068706388e-06, "loss": 1.5303, "step": 970 }, { "epoch": 1.58, "grad_norm": 31.067218780517578, "learning_rate": 4.739667203435319e-06, "loss": 1.4994, "step": 980 }, { "epoch": 1.59, "grad_norm": 27.720073699951172, "learning_rate": 4.6859903381642516e-06, "loss": 1.4268, "step": 990 }, { "epoch": 1.61, "grad_norm": 30.310941696166992, "learning_rate": 4.632313472893184e-06, "loss": 1.4636, "step": 1000 }, { "epoch": 1.63, "grad_norm": 33.62602996826172, "learning_rate": 4.578636607622115e-06, "loss": 1.4783, "step": 1010 }, { "epoch": 1.64, "grad_norm": 28.9564266204834, "learning_rate": 4.5249597423510475e-06, "loss": 1.359, "step": 1020 }, { "epoch": 1.66, "grad_norm": 29.886262893676758, "learning_rate": 4.471282877079979e-06, "loss": 1.4405, "step": 1030 }, { "epoch": 1.67, "grad_norm": 26.291038513183594, "learning_rate": 4.417606011808911e-06, "loss": 1.3914, "step": 1040 }, { "epoch": 1.69, "grad_norm": 30.628904342651367, "learning_rate": 4.3639291465378425e-06, "loss": 1.4335, "step": 1050 }, { "epoch": 1.71, "grad_norm": 27.96939468383789, "learning_rate": 4.310252281266775e-06, "loss": 1.3577, "step": 1060 }, { "epoch": 1.72, "grad_norm": 29.119224548339844, "learning_rate": 4.256575415995706e-06, "loss": 1.3808, "step": 1070 }, { "epoch": 1.74, "grad_norm": 30.36097526550293, "learning_rate": 4.202898550724638e-06, "loss": 1.3545, "step": 1080 }, { "epoch": 1.75, "grad_norm": 30.843242645263672, "learning_rate": 4.14922168545357e-06, "loss": 1.3751, "step": 1090 }, { "epoch": 1.77, "grad_norm": 29.29217529296875, "learning_rate": 4.095544820182501e-06, "loss": 1.3649, "step": 1100 }, { "epoch": 1.79, "grad_norm": 30.685625076293945, "learning_rate": 4.0418679549114335e-06, "loss": 1.4354, "step": 1110 }, { "epoch": 1.8, "grad_norm": 26.101669311523438, "learning_rate": 3.988191089640365e-06, "loss": 1.3355, "step": 1120 }, { "epoch": 1.82, "grad_norm": 29.12729835510254, "learning_rate": 3.934514224369297e-06, "loss": 1.3568, "step": 1130 }, { "epoch": 1.83, "grad_norm": 27.82271957397461, "learning_rate": 3.880837359098229e-06, "loss": 1.3702, "step": 1140 }, { "epoch": 1.85, "grad_norm": 26.432327270507812, "learning_rate": 3.827160493827161e-06, "loss": 1.3231, "step": 1150 }, { "epoch": 1.87, "grad_norm": 30.632972717285156, "learning_rate": 3.7734836285560927e-06, "loss": 1.3283, "step": 1160 }, { "epoch": 1.88, "grad_norm": 27.142309188842773, "learning_rate": 3.7198067632850245e-06, "loss": 1.3159, "step": 1170 }, { "epoch": 1.9, "grad_norm": 27.63045310974121, "learning_rate": 3.6661298980139563e-06, "loss": 1.3777, "step": 1180 }, { "epoch": 1.92, "grad_norm": 30.256242752075195, "learning_rate": 3.612453032742888e-06, "loss": 1.2845, "step": 1190 }, { "epoch": 1.93, "grad_norm": 28.592174530029297, "learning_rate": 3.5587761674718204e-06, "loss": 1.3163, "step": 1200 }, { "epoch": 1.95, "grad_norm": 29.088247299194336, "learning_rate": 3.505099302200752e-06, "loss": 1.3145, "step": 1210 }, { "epoch": 1.96, "grad_norm": 27.801074981689453, "learning_rate": 3.4514224369296832e-06, "loss": 1.3675, "step": 1220 }, { "epoch": 1.98, "grad_norm": 28.81484603881836, "learning_rate": 3.3977455716586155e-06, "loss": 1.2854, "step": 1230 }, { "epoch": 2.0, "grad_norm": 28.966217041015625, "learning_rate": 3.3440687063875473e-06, "loss": 1.3431, "step": 1240 }, { "epoch": 2.01, "grad_norm": 23.021453857421875, "learning_rate": 3.290391841116479e-06, "loss": 1.0516, "step": 1250 }, { "epoch": 2.03, "grad_norm": 25.622419357299805, "learning_rate": 3.236714975845411e-06, "loss": 0.9667, "step": 1260 }, { "epoch": 2.04, "grad_norm": 26.45795249938965, "learning_rate": 3.1830381105743428e-06, "loss": 0.9341, "step": 1270 }, { "epoch": 2.06, "grad_norm": 26.28618812561035, "learning_rate": 3.1293612453032746e-06, "loss": 0.9024, "step": 1280 }, { "epoch": 2.08, "grad_norm": 24.80799102783203, "learning_rate": 3.075684380032206e-06, "loss": 0.8784, "step": 1290 }, { "epoch": 2.09, "grad_norm": 25.70199966430664, "learning_rate": 3.0220075147611383e-06, "loss": 0.9307, "step": 1300 }, { "epoch": 2.11, "grad_norm": 24.88735580444336, "learning_rate": 2.9683306494900697e-06, "loss": 0.9037, "step": 1310 }, { "epoch": 2.12, "grad_norm": 26.51141929626465, "learning_rate": 2.914653784219002e-06, "loss": 0.9884, "step": 1320 }, { "epoch": 2.14, "grad_norm": 25.662946701049805, "learning_rate": 2.8609769189479338e-06, "loss": 0.8631, "step": 1330 }, { "epoch": 2.16, "grad_norm": 22.733741760253906, "learning_rate": 2.8073000536768656e-06, "loss": 0.9273, "step": 1340 }, { "epoch": 2.17, "grad_norm": 24.159793853759766, "learning_rate": 2.7536231884057974e-06, "loss": 0.9262, "step": 1350 }, { "epoch": 2.19, "grad_norm": 23.92421531677246, "learning_rate": 2.699946323134729e-06, "loss": 0.9048, "step": 1360 }, { "epoch": 2.2, "grad_norm": 28.564496994018555, "learning_rate": 2.646269457863661e-06, "loss": 0.9879, "step": 1370 }, { "epoch": 2.22, "grad_norm": 25.430883407592773, "learning_rate": 2.5925925925925925e-06, "loss": 0.892, "step": 1380 }, { "epoch": 2.24, "grad_norm": 23.307687759399414, "learning_rate": 2.5389157273215247e-06, "loss": 0.894, "step": 1390 }, { "epoch": 2.25, "grad_norm": 25.83247184753418, "learning_rate": 2.4852388620504566e-06, "loss": 0.8817, "step": 1400 }, { "epoch": 2.27, "grad_norm": 25.72507095336914, "learning_rate": 2.4315619967793884e-06, "loss": 0.9155, "step": 1410 }, { "epoch": 2.29, "grad_norm": 26.67945098876953, "learning_rate": 2.3778851315083202e-06, "loss": 0.9325, "step": 1420 }, { "epoch": 2.3, "grad_norm": 25.82522964477539, "learning_rate": 2.324208266237252e-06, "loss": 0.9055, "step": 1430 }, { "epoch": 2.32, "grad_norm": 22.66315269470215, "learning_rate": 2.270531400966184e-06, "loss": 0.8631, "step": 1440 }, { "epoch": 2.33, "grad_norm": 25.832313537597656, "learning_rate": 2.2168545356951157e-06, "loss": 0.9539, "step": 1450 }, { "epoch": 2.35, "grad_norm": 23.865262985229492, "learning_rate": 2.163177670424047e-06, "loss": 0.8461, "step": 1460 }, { "epoch": 2.37, "grad_norm": 23.32217025756836, "learning_rate": 2.109500805152979e-06, "loss": 0.8595, "step": 1470 }, { "epoch": 2.38, "grad_norm": 24.299062728881836, "learning_rate": 2.0558239398819112e-06, "loss": 0.899, "step": 1480 }, { "epoch": 2.4, "grad_norm": 25.582359313964844, "learning_rate": 2.002147074610843e-06, "loss": 0.898, "step": 1490 }, { "epoch": 2.41, "grad_norm": 25.416006088256836, "learning_rate": 1.948470209339775e-06, "loss": 0.898, "step": 1500 } ], "logging_steps": 10, "max_steps": 1863, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 5730542923874304.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }