{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4193548387096775, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.2860161066055298, "learning_rate": 4.999442829949762e-05, "loss": 1.1916, "step": 10 }, { "epoch": 0.04, "grad_norm": 1.6525003910064697, "learning_rate": 4.997771568149818e-05, "loss": 1.0982, "step": 20 }, { "epoch": 0.06, "grad_norm": 1.212003231048584, "learning_rate": 4.9949869595417876e-05, "loss": 1.021, "step": 30 }, { "epoch": 0.08, "grad_norm": 1.9332901239395142, "learning_rate": 4.9910902453260824e-05, "loss": 1.0453, "step": 40 }, { "epoch": 0.1, "grad_norm": 1.3961759805679321, "learning_rate": 4.986083162408669e-05, "loss": 0.9987, "step": 50 }, { "epoch": 0.12, "grad_norm": 1.5509816408157349, "learning_rate": 4.979967942626858e-05, "loss": 0.9685, "step": 60 }, { "epoch": 0.14, "grad_norm": 1.23104727268219, "learning_rate": 4.972747311754501e-05, "loss": 0.9597, "step": 70 }, { "epoch": 0.16, "grad_norm": 1.7351208925247192, "learning_rate": 4.964424488287009e-05, "loss": 0.9614, "step": 80 }, { "epoch": 0.18, "grad_norm": 1.6495784521102905, "learning_rate": 4.955003182006761e-05, "loss": 0.9191, "step": 90 }, { "epoch": 0.2, "grad_norm": 2.1316168308258057, "learning_rate": 4.944487592329509e-05, "loss": 1.012, "step": 100 }, { "epoch": 0.22, "grad_norm": 1.793336033821106, "learning_rate": 4.9328824064325566e-05, "loss": 0.9564, "step": 110 }, { "epoch": 0.24, "grad_norm": 1.4563814401626587, "learning_rate": 4.920192797165511e-05, "loss": 0.9283, "step": 120 }, { "epoch": 0.26, "grad_norm": 1.6153773069381714, "learning_rate": 4.906424420744559e-05, "loss": 0.9326, "step": 130 }, { "epoch": 0.28, "grad_norm": 1.5536580085754395, "learning_rate": 4.891583414231287e-05, "loss": 0.9359, "step": 140 }, { "epoch": 0.3, "grad_norm": 2.4636590480804443, "learning_rate": 4.875676392797168e-05, "loss": 0.9579, "step": 150 }, { "epoch": 0.32, "grad_norm": 1.6544209718704224, "learning_rate": 4.858710446774951e-05, "loss": 0.9231, "step": 160 }, { "epoch": 0.34, "grad_norm": 1.6398296356201172, "learning_rate": 4.840693138498231e-05, "loss": 0.9009, "step": 170 }, { "epoch": 0.36, "grad_norm": 1.8886735439300537, "learning_rate": 4.821632498930656e-05, "loss": 0.9199, "step": 180 }, { "epoch": 0.38, "grad_norm": 1.6928956508636475, "learning_rate": 4.801537024086229e-05, "loss": 0.9679, "step": 190 }, { "epoch": 0.4, "grad_norm": 2.1802239418029785, "learning_rate": 4.780415671242334e-05, "loss": 0.9218, "step": 200 }, { "epoch": 0.42, "grad_norm": 1.9001857042312622, "learning_rate": 4.7582778549471494e-05, "loss": 0.9036, "step": 210 }, { "epoch": 0.44, "grad_norm": 2.371631622314453, "learning_rate": 4.735133442823252e-05, "loss": 0.932, "step": 220 }, { "epoch": 0.46, "grad_norm": 1.7190943956375122, "learning_rate": 4.710992751169252e-05, "loss": 0.9329, "step": 230 }, { "epoch": 0.48, "grad_norm": 1.9145939350128174, "learning_rate": 4.685866540361456e-05, "loss": 0.9496, "step": 240 }, { "epoch": 0.5, "grad_norm": 2.0224311351776123, "learning_rate": 4.659766010057574e-05, "loss": 0.9074, "step": 250 }, { "epoch": 0.52, "grad_norm": 1.7447651624679565, "learning_rate": 4.6327027942046286e-05, "loss": 0.875, "step": 260 }, { "epoch": 0.54, "grad_norm": 1.789925217628479, "learning_rate": 4.604688955853293e-05, "loss": 0.8906, "step": 270 }, { "epoch": 0.56, "grad_norm": 1.6785054206848145, "learning_rate": 4.5757369817809415e-05, "loss": 0.89, "step": 280 }, { "epoch": 0.58, "grad_norm": 1.9952877759933472, "learning_rate": 4.5458597769258535e-05, "loss": 0.9216, "step": 290 }, { "epoch": 0.6, "grad_norm": 1.9971449375152588, "learning_rate": 4.515070658635013e-05, "loss": 0.9329, "step": 300 }, { "epoch": 0.62, "grad_norm": 1.9367233514785767, "learning_rate": 4.4833833507280884e-05, "loss": 0.8769, "step": 310 }, { "epoch": 0.65, "grad_norm": 2.0975091457366943, "learning_rate": 4.45081197738023e-05, "loss": 0.9395, "step": 320 }, { "epoch": 0.67, "grad_norm": 1.8488638401031494, "learning_rate": 4.417371056826417e-05, "loss": 0.9189, "step": 330 }, { "epoch": 0.69, "grad_norm": 1.9082151651382446, "learning_rate": 4.383075494890159e-05, "loss": 0.9306, "step": 340 }, { "epoch": 0.71, "grad_norm": 1.708274483680725, "learning_rate": 4.347940578339428e-05, "loss": 0.8993, "step": 350 }, { "epoch": 0.73, "grad_norm": 1.8434147834777832, "learning_rate": 4.3119819680728e-05, "loss": 0.9212, "step": 360 }, { "epoch": 0.75, "grad_norm": 2.0493478775024414, "learning_rate": 4.2752156921388264e-05, "loss": 0.8769, "step": 370 }, { "epoch": 0.77, "grad_norm": 1.8880027532577515, "learning_rate": 4.2376581385917547e-05, "loss": 0.8963, "step": 380 }, { "epoch": 0.79, "grad_norm": 2.563737630844116, "learning_rate": 4.199326048186782e-05, "loss": 0.9929, "step": 390 }, { "epoch": 0.81, "grad_norm": 1.850777506828308, "learning_rate": 4.160236506918098e-05, "loss": 0.917, "step": 400 }, { "epoch": 0.83, "grad_norm": 1.723869800567627, "learning_rate": 4.1204069384030396e-05, "loss": 0.9051, "step": 410 }, { "epoch": 0.85, "grad_norm": 2.1148617267608643, "learning_rate": 4.07985509611576e-05, "loss": 0.9369, "step": 420 }, { "epoch": 0.87, "grad_norm": 1.888234257698059, "learning_rate": 4.038599055473863e-05, "loss": 0.9504, "step": 430 }, { "epoch": 0.89, "grad_norm": 1.6350065469741821, "learning_rate": 3.9966572057815373e-05, "loss": 0.8794, "step": 440 }, { "epoch": 0.91, "grad_norm": 2.113657236099243, "learning_rate": 3.9540482420327845e-05, "loss": 0.8842, "step": 450 }, { "epoch": 0.93, "grad_norm": 1.951561450958252, "learning_rate": 3.910791156578382e-05, "loss": 0.8631, "step": 460 }, { "epoch": 0.95, "grad_norm": 1.7464616298675537, "learning_rate": 3.866905230660309e-05, "loss": 0.8699, "step": 470 }, { "epoch": 0.97, "grad_norm": 2.176771640777588, "learning_rate": 3.822410025817406e-05, "loss": 0.9188, "step": 480 }, { "epoch": 0.99, "grad_norm": 2.0469777584075928, "learning_rate": 3.777325375166088e-05, "loss": 0.9286, "step": 490 }, { "epoch": 1.01, "grad_norm": 2.167973518371582, "learning_rate": 3.731671374560007e-05, "loss": 0.909, "step": 500 }, { "epoch": 1.03, "grad_norm": 2.1665143966674805, "learning_rate": 3.6854683736326125e-05, "loss": 0.8855, "step": 510 }, { "epoch": 1.05, "grad_norm": 2.2790310382843018, "learning_rate": 3.638736966726585e-05, "loss": 0.9811, "step": 520 }, { "epoch": 1.07, "grad_norm": 2.1944940090179443, "learning_rate": 3.59149798371419e-05, "loss": 0.8948, "step": 530 }, { "epoch": 1.09, "grad_norm": 2.0177252292633057, "learning_rate": 3.543772480712658e-05, "loss": 0.8636, "step": 540 }, { "epoch": 1.11, "grad_norm": 2.1394903659820557, "learning_rate": 3.4955817306987124e-05, "loss": 0.8467, "step": 550 }, { "epoch": 1.13, "grad_norm": 2.131467342376709, "learning_rate": 3.44694721402644e-05, "loss": 0.894, "step": 560 }, { "epoch": 1.15, "grad_norm": 1.935081958770752, "learning_rate": 3.397890608852718e-05, "loss": 0.868, "step": 570 }, { "epoch": 1.17, "grad_norm": 1.752773642539978, "learning_rate": 3.348433781474481e-05, "loss": 0.8807, "step": 580 }, { "epoch": 1.19, "grad_norm": 1.7388436794281006, "learning_rate": 3.298598776582126e-05, "loss": 0.9313, "step": 590 }, { "epoch": 1.21, "grad_norm": 2.0265705585479736, "learning_rate": 3.2484078074333954e-05, "loss": 0.9366, "step": 600 }, { "epoch": 1.23, "grad_norm": 1.8241621255874634, "learning_rate": 3.197883245952131e-05, "loss": 0.8784, "step": 610 }, { "epoch": 1.25, "grad_norm": 2.0975425243377686, "learning_rate": 3.147047612756302e-05, "loss": 0.8345, "step": 620 }, { "epoch": 1.27, "grad_norm": 2.1606051921844482, "learning_rate": 3.095923567119748e-05, "loss": 0.8452, "step": 630 }, { "epoch": 1.29, "grad_norm": 1.8565340042114258, "learning_rate": 3.0445338968721287e-05, "loss": 0.8517, "step": 640 }, { "epoch": 1.31, "grad_norm": 2.326995372772217, "learning_rate": 2.992901508241569e-05, "loss": 0.8721, "step": 650 }, { "epoch": 1.33, "grad_norm": 2.2724409103393555, "learning_rate": 2.9410494156445216e-05, "loss": 0.864, "step": 660 }, { "epoch": 1.35, "grad_norm": 2.4907262325286865, "learning_rate": 2.889000731427416e-05, "loss": 0.8825, "step": 670 }, { "epoch": 1.37, "grad_norm": 2.073660135269165, "learning_rate": 2.836778655564653e-05, "loss": 0.8617, "step": 680 }, { "epoch": 1.39, "grad_norm": 1.923628807067871, "learning_rate": 2.7844064653175378e-05, "loss": 0.8619, "step": 690 }, { "epoch": 1.41, "grad_norm": 2.1420040130615234, "learning_rate": 2.7319075048587666e-05, "loss": 0.8525, "step": 700 }, { "epoch": 1.43, "grad_norm": 2.0220742225646973, "learning_rate": 2.6793051748670896e-05, "loss": 0.86, "step": 710 }, { "epoch": 1.45, "grad_norm": 2.0513052940368652, "learning_rate": 2.6266229220967818e-05, "loss": 0.9214, "step": 720 }, { "epoch": 1.47, "grad_norm": 2.164041042327881, "learning_rate": 2.5738842289265814e-05, "loss": 0.888, "step": 730 }, { "epoch": 1.49, "grad_norm": 1.8874469995498657, "learning_rate": 2.5211126028927464e-05, "loss": 0.9022, "step": 740 }, { "epoch": 1.51, "grad_norm": 1.6967768669128418, "learning_rate": 2.4683315662109e-05, "loss": 0.8912, "step": 750 }, { "epoch": 1.53, "grad_norm": 1.8921899795532227, "learning_rate": 2.4155646452913296e-05, "loss": 0.8499, "step": 760 }, { "epoch": 1.55, "grad_norm": 1.830534815788269, "learning_rate": 2.36283536025242e-05, "loss": 0.8306, "step": 770 }, { "epoch": 1.57, "grad_norm": 2.6946194171905518, "learning_rate": 2.310167214436885e-05, "loss": 0.8753, "step": 780 }, { "epoch": 1.59, "grad_norm": 1.9234410524368286, "learning_rate": 2.2575836839354848e-05, "loss": 0.8758, "step": 790 }, { "epoch": 1.61, "grad_norm": 2.620464563369751, "learning_rate": 2.2051082071228854e-05, "loss": 0.9053, "step": 800 }, { "epoch": 1.63, "grad_norm": 2.201439619064331, "learning_rate": 2.152764174210328e-05, "loss": 0.8763, "step": 810 }, { "epoch": 1.65, "grad_norm": 2.2076492309570312, "learning_rate": 2.1005749168197696e-05, "loss": 0.859, "step": 820 }, { "epoch": 1.67, "grad_norm": 2.1520049571990967, "learning_rate": 2.0485636975841415e-05, "loss": 0.8849, "step": 830 }, { "epoch": 1.69, "grad_norm": 1.9997478723526, "learning_rate": 1.9967536997783494e-05, "loss": 0.8669, "step": 840 }, { "epoch": 1.71, "grad_norm": 2.3713417053222656, "learning_rate": 1.9451680169856602e-05, "loss": 0.9124, "step": 850 }, { "epoch": 1.73, "grad_norm": 2.459920883178711, "learning_rate": 1.89382964280405e-05, "loss": 0.879, "step": 860 }, { "epoch": 1.75, "grad_norm": 2.571826219558716, "learning_rate": 1.842761460597138e-05, "loss": 0.8446, "step": 870 }, { "epoch": 1.77, "grad_norm": 2.335891008377075, "learning_rate": 1.79198623329424e-05, "loss": 0.855, "step": 880 }, { "epoch": 1.79, "grad_norm": 1.8321956396102905, "learning_rate": 1.7415265932441136e-05, "loss": 0.868, "step": 890 }, { "epoch": 1.81, "grad_norm": 1.7981292009353638, "learning_rate": 1.6914050321269047e-05, "loss": 0.8442, "step": 900 }, { "epoch": 1.83, "grad_norm": 2.0625789165496826, "learning_rate": 1.641643890928794e-05, "loss": 0.8335, "step": 910 }, { "epoch": 1.85, "grad_norm": 2.1844565868377686, "learning_rate": 1.5922653499838137e-05, "loss": 0.8314, "step": 920 }, { "epoch": 1.88, "grad_norm": 2.1349494457244873, "learning_rate": 1.5432914190872757e-05, "loss": 0.8059, "step": 930 }, { "epoch": 1.9, "grad_norm": 2.5662639141082764, "learning_rate": 1.4947439276852104e-05, "loss": 0.8678, "step": 940 }, { "epoch": 1.92, "grad_norm": 2.8356733322143555, "learning_rate": 1.4466445151441965e-05, "loss": 0.8859, "step": 950 }, { "epoch": 1.94, "grad_norm": 2.7699570655822754, "learning_rate": 1.399014621105914e-05, "loss": 0.8691, "step": 960 }, { "epoch": 1.96, "grad_norm": 2.013852834701538, "learning_rate": 1.3518754759307289e-05, "loss": 0.8456, "step": 970 }, { "epoch": 1.98, "grad_norm": 2.38482403755188, "learning_rate": 1.3052480912345482e-05, "loss": 0.8193, "step": 980 }, { "epoch": 2.0, "grad_norm": 2.1879611015319824, "learning_rate": 1.2591532505231906e-05, "loss": 0.8785, "step": 990 }, { "epoch": 2.02, "grad_norm": 2.2595455646514893, "learning_rate": 1.2136114999284288e-05, "loss": 0.8564, "step": 1000 }, { "epoch": 2.04, "grad_norm": 2.1374199390411377, "learning_rate": 1.1686431390498354e-05, "loss": 0.8316, "step": 1010 }, { "epoch": 2.06, "grad_norm": 2.3616902828216553, "learning_rate": 1.1242682119065218e-05, "loss": 0.8698, "step": 1020 }, { "epoch": 2.08, "grad_norm": 2.4855775833129883, "learning_rate": 1.0805064980027973e-05, "loss": 0.8445, "step": 1030 }, { "epoch": 2.1, "grad_norm": 2.0951740741729736, "learning_rate": 1.0373775035117305e-05, "loss": 0.8423, "step": 1040 }, { "epoch": 2.12, "grad_norm": 2.2897915840148926, "learning_rate": 9.949004525805423e-06, "loss": 0.8492, "step": 1050 }, { "epoch": 2.14, "grad_norm": 2.129755973815918, "learning_rate": 9.530942787617137e-06, "loss": 0.8388, "step": 1060 }, { "epoch": 2.16, "grad_norm": 2.337714195251465, "learning_rate": 9.11977616573618e-06, "loss": 0.8437, "step": 1070 }, { "epoch": 2.18, "grad_norm": 2.128257989883423, "learning_rate": 8.715687931944449e-06, "loss": 0.8095, "step": 1080 }, { "epoch": 2.2, "grad_norm": 2.5019583702087402, "learning_rate": 8.318858202931153e-06, "loss": 0.8328, "step": 1090 }, { "epoch": 2.22, "grad_norm": 2.477393388748169, "learning_rate": 7.929463860008355e-06, "loss": 0.8561, "step": 1100 }, { "epoch": 2.24, "grad_norm": 3.189155340194702, "learning_rate": 7.547678470268526e-06, "loss": 0.8551, "step": 1110 }, { "epoch": 2.26, "grad_norm": 2.133364200592041, "learning_rate": 7.173672209219495e-06, "loss": 0.85, "step": 1120 }, { "epoch": 2.28, "grad_norm": 2.8646955490112305, "learning_rate": 6.807611784931067e-06, "loss": 0.8625, "step": 1130 }, { "epoch": 2.3, "grad_norm": 2.1333656311035156, "learning_rate": 6.449660363727236e-06, "loss": 0.8542, "step": 1140 }, { "epoch": 2.32, "grad_norm": 2.6336278915405273, "learning_rate": 6.099977497457062e-06, "loss": 0.8474, "step": 1150 }, { "epoch": 2.34, "grad_norm": 2.3681249618530273, "learning_rate": 5.758719052376693e-06, "loss": 0.8534, "step": 1160 }, { "epoch": 2.36, "grad_norm": 2.100318193435669, "learning_rate": 5.4260371396741175e-06, "loss": 0.8259, "step": 1170 }, { "epoch": 2.38, "grad_norm": 2.3494303226470947, "learning_rate": 5.10208004766774e-06, "loss": 0.8064, "step": 1180 }, { "epoch": 2.4, "grad_norm": 2.2627310752868652, "learning_rate": 4.786992175708949e-06, "loss": 0.8188, "step": 1190 }, { "epoch": 2.42, "grad_norm": 2.1743149757385254, "learning_rate": 4.480913969818098e-06, "loss": 0.8474, "step": 1200 } ], "logging_steps": 10, "max_steps": 1488, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 1.1912432826109133e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }