{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.820250284414107, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.26715654134750366, "learning_rate": 4.9992855064046754e-05, "loss": 2.6697, "step": 5 }, { "epoch": 0.05, "grad_norm": 0.4067687392234802, "learning_rate": 4.997142434019578e-05, "loss": 2.5369, "step": 10 }, { "epoch": 0.07, "grad_norm": 0.44472697377204895, "learning_rate": 4.9935720078139045e-05, "loss": 2.5661, "step": 15 }, { "epoch": 0.09, "grad_norm": 0.45230674743652344, "learning_rate": 4.988576268624979e-05, "loss": 2.4824, "step": 20 }, { "epoch": 0.11, "grad_norm": 0.4477585554122925, "learning_rate": 4.982158071991725e-05, "loss": 2.3343, "step": 25 }, { "epoch": 0.14, "grad_norm": 0.3630908131599426, "learning_rate": 4.974321086522453e-05, "loss": 2.4377, "step": 30 }, { "epoch": 0.16, "grad_norm": 0.3237389028072357, "learning_rate": 4.9650697917979025e-05, "loss": 2.4114, "step": 35 }, { "epoch": 0.18, "grad_norm": 0.3014233708381653, "learning_rate": 4.954409475810737e-05, "loss": 2.2636, "step": 40 }, { "epoch": 0.2, "grad_norm": 0.3054388761520386, "learning_rate": 4.942346231942955e-05, "loss": 2.2758, "step": 45 }, { "epoch": 0.23, "grad_norm": 0.2940792143344879, "learning_rate": 4.92888695548294e-05, "loss": 2.3207, "step": 50 }, { "epoch": 0.25, "grad_norm": 0.25064757466316223, "learning_rate": 4.9140393396841565e-05, "loss": 2.2209, "step": 55 }, { "epoch": 0.27, "grad_norm": 0.3227023780345917, "learning_rate": 4.89781187136772e-05, "loss": 2.2328, "step": 60 }, { "epoch": 0.3, "grad_norm": 0.312673419713974, "learning_rate": 4.880213826071375e-05, "loss": 2.2737, "step": 65 }, { "epoch": 0.32, "grad_norm": 0.26930904388427734, "learning_rate": 4.861255262747643e-05, "loss": 2.2686, "step": 70 }, { "epoch": 0.34, "grad_norm": 0.2343609631061554, "learning_rate": 4.8409470180141827e-05, "loss": 2.2661, "step": 75 }, { "epoch": 0.36, "grad_norm": 0.3088403642177582, "learning_rate": 4.8193006999596294e-05, "loss": 2.191, "step": 80 }, { "epoch": 0.39, "grad_norm": 0.3336387574672699, "learning_rate": 4.796328681508473e-05, "loss": 2.2106, "step": 85 }, { "epoch": 0.41, "grad_norm": 0.28899675607681274, "learning_rate": 4.7720440933487575e-05, "loss": 2.2347, "step": 90 }, { "epoch": 0.43, "grad_norm": 0.30223432183265686, "learning_rate": 4.746460816426647e-05, "loss": 2.2307, "step": 95 }, { "epoch": 0.46, "grad_norm": 0.2692021429538727, "learning_rate": 4.7195934740121485e-05, "loss": 2.1503, "step": 100 }, { "epoch": 0.48, "grad_norm": 0.3296695053577423, "learning_rate": 4.6914574233405236e-05, "loss": 2.2145, "step": 105 }, { "epoch": 0.5, "grad_norm": 0.28714171051979065, "learning_rate": 4.662068746834176e-05, "loss": 2.1163, "step": 110 }, { "epoch": 0.52, "grad_norm": 0.35001716017723083, "learning_rate": 4.6314442429100155e-05, "loss": 2.1868, "step": 115 }, { "epoch": 0.55, "grad_norm": 0.32721835374832153, "learning_rate": 4.599601416377575e-05, "loss": 2.1865, "step": 120 }, { "epoch": 0.57, "grad_norm": 0.30708980560302734, "learning_rate": 4.566558468433344e-05, "loss": 2.2035, "step": 125 }, { "epoch": 0.59, "grad_norm": 0.3122975528240204, "learning_rate": 4.532334286257064e-05, "loss": 2.1762, "step": 130 }, { "epoch": 0.61, "grad_norm": 0.3531278669834137, "learning_rate": 4.496948432215913e-05, "loss": 2.2452, "step": 135 }, { "epoch": 0.64, "grad_norm": 0.3190854489803314, "learning_rate": 4.460421132682751e-05, "loss": 2.2267, "step": 140 }, { "epoch": 0.66, "grad_norm": 0.29605475068092346, "learning_rate": 4.4227732664748365e-05, "loss": 2.2548, "step": 145 }, { "epoch": 0.68, "grad_norm": 0.37863361835479736, "learning_rate": 4.384026352919595e-05, "loss": 2.2053, "step": 150 }, { "epoch": 0.71, "grad_norm": 0.3235403895378113, "learning_rate": 4.344202539554285e-05, "loss": 2.193, "step": 155 }, { "epoch": 0.73, "grad_norm": 0.3326057195663452, "learning_rate": 4.3033245894665814e-05, "loss": 2.2349, "step": 160 }, { "epoch": 0.75, "grad_norm": 0.3827252686023712, "learning_rate": 4.261415868283304e-05, "loss": 2.1247, "step": 165 }, { "epoch": 0.77, "grad_norm": 0.34777992963790894, "learning_rate": 4.218500330814753e-05, "loss": 2.1555, "step": 170 }, { "epoch": 0.8, "grad_norm": 0.3294101059436798, "learning_rate": 4.174602507362258e-05, "loss": 2.1771, "step": 175 }, { "epoch": 0.82, "grad_norm": 0.3454132676124573, "learning_rate": 4.1297474896967814e-05, "loss": 2.1616, "step": 180 }, { "epoch": 0.84, "grad_norm": 0.3628969192504883, "learning_rate": 4.083960916716597e-05, "loss": 2.1681, "step": 185 }, { "epoch": 0.86, "grad_norm": 0.31756460666656494, "learning_rate": 4.0372689597922215e-05, "loss": 2.146, "step": 190 }, { "epoch": 0.89, "grad_norm": 0.41087692975997925, "learning_rate": 3.989698307806995e-05, "loss": 2.2185, "step": 195 }, { "epoch": 0.91, "grad_norm": 0.3311336636543274, "learning_rate": 3.941276151901853e-05, "loss": 2.0976, "step": 200 }, { "epoch": 0.93, "grad_norm": 0.38921472430229187, "learning_rate": 3.8920301699330076e-05, "loss": 2.1204, "step": 205 }, { "epoch": 0.96, "grad_norm": 0.31258201599121094, "learning_rate": 3.84198851065143e-05, "loss": 2.1678, "step": 210 }, { "epoch": 0.98, "grad_norm": 0.4159790575504303, "learning_rate": 3.791179777613163e-05, "loss": 2.2486, "step": 215 }, { "epoch": 1.0, "grad_norm": 0.3648820221424103, "learning_rate": 3.739633012829682e-05, "loss": 2.1523, "step": 220 }, { "epoch": 1.02, "grad_norm": 0.35387834906578064, "learning_rate": 3.6873776801676264e-05, "loss": 2.1761, "step": 225 }, { "epoch": 1.05, "grad_norm": 0.40865159034729004, "learning_rate": 3.6344436485074e-05, "loss": 2.219, "step": 230 }, { "epoch": 1.07, "grad_norm": 0.40083423256874084, "learning_rate": 3.5808611746702814e-05, "loss": 2.1755, "step": 235 }, { "epoch": 1.09, "grad_norm": 0.3656306564807892, "learning_rate": 3.5266608861237724e-05, "loss": 2.1917, "step": 240 }, { "epoch": 1.11, "grad_norm": 0.3839705288410187, "learning_rate": 3.471873763475099e-05, "loss": 2.1878, "step": 245 }, { "epoch": 1.14, "grad_norm": 0.4037785530090332, "learning_rate": 3.4165311227628524e-05, "loss": 2.1101, "step": 250 }, { "epoch": 1.16, "grad_norm": 0.3458710312843323, "learning_rate": 3.3606645975569005e-05, "loss": 2.1691, "step": 255 }, { "epoch": 1.18, "grad_norm": 0.3371334373950958, "learning_rate": 3.304306120876807e-05, "loss": 2.1904, "step": 260 }, { "epoch": 1.21, "grad_norm": 0.4284100830554962, "learning_rate": 3.247487906939076e-05, "loss": 2.1688, "step": 265 }, { "epoch": 1.23, "grad_norm": 0.3682388365268707, "learning_rate": 3.1902424327436734e-05, "loss": 2.1691, "step": 270 }, { "epoch": 1.25, "grad_norm": 0.32562267780303955, "learning_rate": 3.132602419510336e-05, "loss": 2.0698, "step": 275 }, { "epoch": 1.27, "grad_norm": 0.4634737968444824, "learning_rate": 3.0746008139752964e-05, "loss": 2.2131, "step": 280 }, { "epoch": 1.3, "grad_norm": 0.4514225423336029, "learning_rate": 3.0162707695590935e-05, "loss": 2.1414, "step": 285 }, { "epoch": 1.32, "grad_norm": 0.4424736499786377, "learning_rate": 2.9576456274162488e-05, "loss": 2.1257, "step": 290 }, { "epoch": 1.34, "grad_norm": 0.4680028557777405, "learning_rate": 2.8987588973776304e-05, "loss": 2.2337, "step": 295 }, { "epoch": 1.37, "grad_norm": 0.44635656476020813, "learning_rate": 2.8396442387964075e-05, "loss": 2.2022, "step": 300 }, { "epoch": 1.39, "grad_norm": 0.4005129039287567, "learning_rate": 2.7803354413085364e-05, "loss": 2.1944, "step": 305 }, { "epoch": 1.41, "grad_norm": 0.3982764184474945, "learning_rate": 2.72086640551878e-05, "loss": 2.0979, "step": 310 }, { "epoch": 1.43, "grad_norm": 0.39070507884025574, "learning_rate": 2.6612711236232912e-05, "loss": 2.1757, "step": 315 }, { "epoch": 1.46, "grad_norm": 0.4183822572231293, "learning_rate": 2.601583659979851e-05, "loss": 2.0571, "step": 320 }, { "epoch": 1.48, "grad_norm": 0.39799273014068604, "learning_rate": 2.541838131636854e-05, "loss": 2.1154, "step": 325 }, { "epoch": 1.5, "grad_norm": 0.3967350125312805, "learning_rate": 2.4820686888321808e-05, "loss": 2.164, "step": 330 }, { "epoch": 1.52, "grad_norm": 0.3555432856082916, "learning_rate": 2.4223094954730956e-05, "loss": 2.1596, "step": 335 }, { "epoch": 1.55, "grad_norm": 0.3698050379753113, "learning_rate": 2.3625947096083327e-05, "loss": 2.0815, "step": 340 }, { "epoch": 1.57, "grad_norm": 0.45504623651504517, "learning_rate": 2.3029584639035286e-05, "loss": 2.0997, "step": 345 }, { "epoch": 1.59, "grad_norm": 0.4048606753349304, "learning_rate": 2.2434348461311684e-05, "loss": 2.1397, "step": 350 }, { "epoch": 1.62, "grad_norm": 0.46529653668403625, "learning_rate": 2.184057879686185e-05, "loss": 2.1781, "step": 355 }, { "epoch": 1.64, "grad_norm": 0.4468785226345062, "learning_rate": 2.1248615041383685e-05, "loss": 2.1155, "step": 360 }, { "epoch": 1.66, "grad_norm": 0.49244192242622375, "learning_rate": 2.0658795558326743e-05, "loss": 2.2141, "step": 365 }, { "epoch": 1.68, "grad_norm": 0.45070916414260864, "learning_rate": 2.0071457485485463e-05, "loss": 2.2219, "step": 370 }, { "epoch": 1.71, "grad_norm": 0.392012357711792, "learning_rate": 1.94869365422929e-05, "loss": 2.1288, "step": 375 }, { "epoch": 1.73, "grad_norm": 0.3873184621334076, "learning_rate": 1.8905566837925264e-05, "loss": 2.1477, "step": 380 }, { "epoch": 1.75, "grad_norm": 0.47440674901008606, "learning_rate": 1.832768068032678e-05, "loss": 2.0753, "step": 385 }, { "epoch": 1.77, "grad_norm": 0.5017093420028687, "learning_rate": 1.7753608386264196e-05, "loss": 2.1999, "step": 390 }, { "epoch": 1.8, "grad_norm": 0.3944746255874634, "learning_rate": 1.7183678092519385e-05, "loss": 2.1569, "step": 395 }, { "epoch": 1.82, "grad_norm": 0.4446674883365631, "learning_rate": 1.66182155683281e-05, "loss": 2.1261, "step": 400 } ], "logging_steps": 5, "max_steps": 657, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.3698387234914304e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }