{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990839694656488, "eval_steps": 1000, "global_step": 818, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 4.999539075477821e-05, "loss": 1.9867, "step": 5 }, { "epoch": 0.01, "learning_rate": 4.998156471872415e-05, "loss": 1.5198, "step": 10 }, { "epoch": 0.02, "learning_rate": 4.995852699004508e-05, "loss": 0.9883, "step": 15 }, { "epoch": 0.02, "learning_rate": 4.992628606366426e-05, "loss": 0.3031, "step": 20 }, { "epoch": 0.03, "learning_rate": 4.988485382808856e-05, "loss": 0.0533, "step": 25 }, { "epoch": 0.04, "learning_rate": 4.983424556102469e-05, "loss": 0.0318, "step": 30 }, { "epoch": 0.04, "learning_rate": 4.97744799237457e-05, "loss": 0.0605, "step": 35 }, { "epoch": 0.05, "learning_rate": 4.970557895420984e-05, "loss": 0.0126, "step": 40 }, { "epoch": 0.05, "learning_rate": 4.9627568058934274e-05, "loss": 0.0017, "step": 45 }, { "epoch": 0.06, "learning_rate": 4.95404760036267e-05, "loss": 0.0005, "step": 50 }, { "epoch": 0.07, "learning_rate": 4.9444334902578315e-05, "loss": 0.0784, "step": 55 }, { "epoch": 0.07, "learning_rate": 4.9339180206821955e-05, "loss": 0.043, "step": 60 }, { "epoch": 0.08, "learning_rate": 4.922505069105995e-05, "loss": 0.0022, "step": 65 }, { "epoch": 0.09, "learning_rate": 4.9101988439366295e-05, "loss": 0.0101, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.897003882966866e-05, "loss": 0.0471, "step": 75 }, { "epoch": 0.1, "learning_rate": 4.8829250517015684e-05, "loss": 0.002, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.867967541563594e-05, "loss": 0.0109, "step": 85 }, { "epoch": 0.11, "learning_rate": 4.8521368679795154e-05, "loss": 0.0322, "step": 90 }, { "epoch": 0.12, "learning_rate": 4.835438868345858e-05, "loss": 0.0156, "step": 95 }, { "epoch": 0.12, "learning_rate": 4.817879699876623e-05, "loss": 0.0912, "step": 100 }, { "epoch": 0.13, "learning_rate": 4.7994658373328804e-05, "loss": 0.027, "step": 105 }, { "epoch": 0.13, "learning_rate": 4.780204070635266e-05, "loss": 0.027, "step": 110 }, { "epoch": 0.14, "learning_rate": 4.760101502360268e-05, "loss": 0.0206, "step": 115 }, { "epoch": 0.15, "learning_rate": 4.739165545121228e-05, "loss": 0.012, "step": 120 }, { "epoch": 0.15, "learning_rate": 4.717403918835017e-05, "loss": 0.002, "step": 125 }, { "epoch": 0.16, "learning_rate": 4.694824647875391e-05, "loss": 0.0052, "step": 130 }, { "epoch": 0.16, "learning_rate": 4.6714360581140935e-05, "loss": 0.0036, "step": 135 }, { "epoch": 0.17, "learning_rate": 4.647246773850773e-05, "loss": 0.0386, "step": 140 }, { "epoch": 0.18, "learning_rate": 4.6222657146328624e-05, "loss": 0.0546, "step": 145 }, { "epoch": 0.18, "learning_rate": 4.596502091966588e-05, "loss": 0.0472, "step": 150 }, { "epoch": 0.19, "learning_rate": 4.5699654059203225e-05, "loss": 0.0779, "step": 155 }, { "epoch": 0.2, "learning_rate": 4.542665441621537e-05, "loss": 0.0169, "step": 160 }, { "epoch": 0.2, "learning_rate": 4.51461226564863e-05, "loss": 0.0249, "step": 165 }, { "epoch": 0.21, "learning_rate": 4.485816222318986e-05, "loss": 0.0132, "step": 170 }, { "epoch": 0.21, "learning_rate": 4.4562879298746165e-05, "loss": 0.0163, "step": 175 }, { "epoch": 0.22, "learning_rate": 4.4260382765667875e-05, "loss": 0.0181, "step": 180 }, { "epoch": 0.23, "learning_rate": 4.395078416641099e-05, "loss": 0.0421, "step": 185 }, { "epoch": 0.23, "learning_rate": 4.363419766224464e-05, "loss": 0.0011, "step": 190 }, { "epoch": 0.24, "learning_rate": 4.3310739991155365e-05, "loss": 0.0062, "step": 195 }, { "epoch": 0.24, "learning_rate": 4.2980530424801146e-05, "loss": 0.0013, "step": 200 }, { "epoch": 0.25, "learning_rate": 4.264369072453126e-05, "loss": 0.0515, "step": 205 }, { "epoch": 0.26, "learning_rate": 4.230034509648803e-05, "loss": 0.009, "step": 210 }, { "epoch": 0.26, "learning_rate": 4.19506201458071e-05, "loss": 0.0132, "step": 215 }, { "epoch": 0.27, "learning_rate": 4.159464482993308e-05, "loss": 0.0092, "step": 220 }, { "epoch": 0.27, "learning_rate": 4.123255041106788e-05, "loss": 0.0429, "step": 225 }, { "epoch": 0.28, "learning_rate": 4.0864470407769114e-05, "loss": 0.0381, "step": 230 }, { "epoch": 0.29, "learning_rate": 4.049054054571648e-05, "loss": 0.062, "step": 235 }, { "epoch": 0.29, "learning_rate": 4.011089870766437e-05, "loss": 0.0134, "step": 240 }, { "epoch": 0.3, "learning_rate": 3.972568488259905e-05, "loss": 0.0167, "step": 245 }, { "epoch": 0.31, "learning_rate": 3.93350411141191e-05, "loss": 0.0143, "step": 250 }, { "epoch": 0.31, "learning_rate": 3.8939111448058404e-05, "loss": 0.0276, "step": 255 }, { "epoch": 0.32, "learning_rate": 3.853804187937066e-05, "loss": 0.0636, "step": 260 }, { "epoch": 0.32, "learning_rate": 3.813198029829532e-05, "loss": 0.0279, "step": 265 }, { "epoch": 0.33, "learning_rate": 3.772107643582458e-05, "loss": 0.0518, "step": 270 }, { "epoch": 0.34, "learning_rate": 3.730548180849161e-05, "loss": 0.0189, "step": 275 }, { "epoch": 0.34, "learning_rate": 3.688534966250042e-05, "loss": 0.0099, "step": 280 }, { "epoch": 0.35, "learning_rate": 3.646083491721794e-05, "loss": 0.0045, "step": 285 }, { "epoch": 0.35, "learning_rate": 3.603209410804906e-05, "loss": 0.003, "step": 290 }, { "epoch": 0.36, "learning_rate": 3.559928532871587e-05, "loss": 0.0229, "step": 295 }, { "epoch": 0.37, "learning_rate": 3.516256817296222e-05, "loss": 0.0148, "step": 300 }, { "epoch": 0.37, "learning_rate": 3.472210367570518e-05, "loss": 0.0131, "step": 305 }, { "epoch": 0.38, "learning_rate": 3.427805425365509e-05, "loss": 0.0104, "step": 310 }, { "epoch": 0.38, "learning_rate": 3.383058364542611e-05, "loss": 0.027, "step": 315 }, { "epoch": 0.39, "learning_rate": 3.3379856851159267e-05, "loss": 0.0023, "step": 320 }, { "epoch": 0.4, "learning_rate": 3.292604007168037e-05, "loss": 0.0019, "step": 325 }, { "epoch": 0.4, "learning_rate": 3.246930064721524e-05, "loss": 0.0023, "step": 330 }, { "epoch": 0.41, "learning_rate": 3.200980699568463e-05, "loss": 0.0248, "step": 335 }, { "epoch": 0.42, "learning_rate": 3.154772855060198e-05, "loss": 0.0134, "step": 340 }, { "epoch": 0.42, "learning_rate": 3.1083235698596505e-05, "loss": 0.0011, "step": 345 }, { "epoch": 0.43, "learning_rate": 3.061649971658488e-05, "loss": 0.0202, "step": 350 }, { "epoch": 0.43, "learning_rate": 3.01476927086147e-05, "loss": 0.001, "step": 355 }, { "epoch": 0.44, "learning_rate": 2.967698754240289e-05, "loss": 0.056, "step": 360 }, { "epoch": 0.45, "learning_rate": 2.92045577855925e-05, "loss": 0.0097, "step": 365 }, { "epoch": 0.45, "learning_rate": 2.8730577641751476e-05, "loss": 0.062, "step": 370 }, { "epoch": 0.46, "learning_rate": 2.825522188613686e-05, "loss": 0.0022, "step": 375 }, { "epoch": 0.46, "learning_rate": 2.7778665801248292e-05, "loss": 0.0012, "step": 380 }, { "epoch": 0.47, "learning_rate": 2.730108511219433e-05, "loss": 0.0058, "step": 385 }, { "epoch": 0.48, "learning_rate": 2.6822655921895695e-05, "loss": 0.0053, "step": 390 }, { "epoch": 0.48, "learning_rate": 2.6343554646149154e-05, "loss": 0.0012, "step": 395 }, { "epoch": 0.49, "learning_rate": 2.586395794857597e-05, "loss": 0.0206, "step": 400 }, { "epoch": 0.49, "learning_rate": 2.538404267547908e-05, "loss": 0.0009, "step": 405 }, { "epoch": 0.5, "learning_rate": 2.4903985790632835e-05, "loss": 0.018, "step": 410 }, { "epoch": 0.51, "learning_rate": 2.4423964310029458e-05, "loss": 0.0238, "step": 415 }, { "epoch": 0.51, "learning_rate": 2.3944155236606196e-05, "loss": 0.0064, "step": 420 }, { "epoch": 0.52, "learning_rate": 2.3464735494977392e-05, "loss": 0.0273, "step": 425 }, { "epoch": 0.53, "learning_rate": 2.2985881866195307e-05, "loss": 0.0045, "step": 430 }, { "epoch": 0.53, "learning_rate": 2.2507770922563966e-05, "loss": 0.0233, "step": 435 }, { "epoch": 0.54, "learning_rate": 2.2030578962529964e-05, "loss": 0.0019, "step": 440 }, { "epoch": 0.54, "learning_rate": 2.155448194567425e-05, "loss": 0.0027, "step": 445 }, { "epoch": 0.55, "learning_rate": 2.1079655427828807e-05, "loss": 0.0246, "step": 450 }, { "epoch": 0.56, "learning_rate": 2.060627449634234e-05, "loss": 0.0128, "step": 455 }, { "epoch": 0.56, "learning_rate": 2.0134513705518547e-05, "loss": 0.0205, "step": 460 }, { "epoch": 0.57, "learning_rate": 1.9664547012251122e-05, "loss": 0.0135, "step": 465 }, { "epoch": 0.57, "learning_rate": 1.9196547711878883e-05, "loss": 0.009, "step": 470 }, { "epoch": 0.58, "learning_rate": 1.873068837428497e-05, "loss": 0.0289, "step": 475 }, { "epoch": 0.59, "learning_rate": 1.8267140780263427e-05, "loss": 0.0587, "step": 480 }, { "epoch": 0.59, "learning_rate": 1.7806075858176903e-05, "loss": 0.0152, "step": 485 }, { "epoch": 0.6, "learning_rate": 1.7347663620928495e-05, "loss": 0.0148, "step": 490 }, { "epoch": 0.6, "learning_rate": 1.6892073103271355e-05, "loss": 0.0339, "step": 495 }, { "epoch": 0.61, "learning_rate": 1.6439472299478803e-05, "loss": 0.0227, "step": 500 }, { "epoch": 0.62, "learning_rate": 1.5990028101398234e-05, "loss": 0.0308, "step": 505 }, { "epoch": 0.62, "learning_rate": 1.5543906236911424e-05, "loss": 0.0014, "step": 510 }, { "epoch": 0.63, "learning_rate": 1.5101271208824168e-05, "loss": 0.0023, "step": 515 }, { "epoch": 0.64, "learning_rate": 1.4662286234207512e-05, "loss": 0.0098, "step": 520 }, { "epoch": 0.64, "learning_rate": 1.4227113184213198e-05, "loss": 0.0209, "step": 525 }, { "epoch": 0.65, "learning_rate": 1.3795912524385323e-05, "loss": 0.0026, "step": 530 }, { "epoch": 0.65, "learning_rate": 1.3368843255490383e-05, "loss": 0.0309, "step": 535 }, { "epoch": 0.66, "learning_rate": 1.2946062854887313e-05, "loss": 0.0016, "step": 540 }, { "epoch": 0.67, "learning_rate": 1.252772721845945e-05, "loss": 0.0083, "step": 545 }, { "epoch": 0.67, "learning_rate": 1.2113990603129433e-05, "loss": 0.0041, "step": 550 }, { "epoch": 0.68, "learning_rate": 1.1705005569978664e-05, "loss": 0.0195, "step": 555 }, { "epoch": 0.68, "learning_rate": 1.1300922927991913e-05, "loss": 0.0216, "step": 560 }, { "epoch": 0.69, "learning_rate": 1.0901891678448144e-05, "loss": 0.0304, "step": 565 }, { "epoch": 0.7, "learning_rate": 1.0508058959977757e-05, "loss": 0.0028, "step": 570 }, { "epoch": 0.7, "learning_rate": 1.0119569994306841e-05, "loss": 0.0037, "step": 575 }, { "epoch": 0.71, "learning_rate": 9.736568032708069e-06, "loss": 0.0062, "step": 580 }, { "epoch": 0.71, "learning_rate": 9.359194303178371e-06, "loss": 0.0018, "step": 585 }, { "epoch": 0.72, "learning_rate": 8.987587958362517e-06, "loss": 0.0052, "step": 590 }, { "epoch": 0.73, "learning_rate": 8.621886024242057e-06, "loss": 0.0419, "step": 595 }, { "epoch": 0.73, "learning_rate": 8.262223349608366e-06, "loss": 0.0037, "step": 600 }, { "epoch": 0.74, "learning_rate": 7.908732556338628e-06, "loss": 0.0027, "step": 605 }, { "epoch": 0.75, "learning_rate": 7.561543990492803e-06, "loss": 0.0571, "step": 610 }, { "epoch": 0.75, "learning_rate": 7.2207856742499695e-06, "loss": 0.0077, "step": 615 }, { "epoch": 0.76, "learning_rate": 6.886583258701382e-06, "loss": 0.0448, "step": 620 }, { "epoch": 0.76, "learning_rate": 6.559059977518017e-06, "loss": 0.0117, "step": 625 }, { "epoch": 0.77, "learning_rate": 6.238336601509365e-06, "loss": 0.0025, "step": 630 }, { "epoch": 0.78, "learning_rate": 5.92453139409051e-06, "loss": 0.0024, "step": 635 }, { "epoch": 0.78, "learning_rate": 5.617760067673666e-06, "loss": 0.0329, "step": 640 }, { "epoch": 0.79, "learning_rate": 5.318135741000488e-06, "loss": 0.0176, "step": 645 }, { "epoch": 0.79, "learning_rate": 5.025768897430644e-06, "loss": 0.0184, "step": 650 }, { "epoch": 0.8, "learning_rate": 4.740767344202282e-06, "loss": 0.0028, "step": 655 }, { "epoch": 0.81, "learning_rate": 4.463236172679192e-06, "loss": 0.0018, "step": 660 }, { "epoch": 0.81, "learning_rate": 4.193277719599481e-06, "loss": 0.0275, "step": 665 }, { "epoch": 0.82, "learning_rate": 3.9309915293399366e-06, "loss": 0.007, "step": 670 }, { "epoch": 0.82, "learning_rate": 3.676474317210099e-06, "loss": 0.0182, "step": 675 }, { "epoch": 0.83, "learning_rate": 3.4298199337894685e-06, "loss": 0.0015, "step": 680 }, { "epoch": 0.84, "learning_rate": 3.1911193303211185e-06, "loss": 0.0018, "step": 685 }, { "epoch": 0.84, "learning_rate": 2.9604605251743136e-06, "loss": 0.0024, "step": 690 }, { "epoch": 0.85, "learning_rate": 2.7379285713886954e-06, "loss": 0.0023, "step": 695 }, { "epoch": 0.85, "learning_rate": 2.5236055253118423e-06, "loss": 0.0048, "step": 700 }, { "epoch": 0.86, "learning_rate": 2.3175704163418353e-06, "loss": 0.0022, "step": 705 }, { "epoch": 0.87, "learning_rate": 2.119899217785995e-06, "loss": 0.0246, "step": 710 }, { "epoch": 0.87, "learning_rate": 1.9306648188465252e-06, "loss": 0.0014, "step": 715 }, { "epoch": 0.88, "learning_rate": 1.7499369977433456e-06, "loss": 0.0012, "step": 720 }, { "epoch": 0.89, "learning_rate": 1.577782395984126e-06, "loss": 0.0812, "step": 725 }, { "epoch": 0.89, "learning_rate": 1.4142644937909206e-06, "loss": 0.0427, "step": 730 }, { "epoch": 0.9, "learning_rate": 1.2594435866924686e-06, "loss": 0.0075, "step": 735 }, { "epoch": 0.9, "learning_rate": 1.113376763290877e-06, "loss": 0.0267, "step": 740 }, { "epoch": 0.91, "learning_rate": 9.761178842107699e-07, "loss": 0.0237, "step": 745 }, { "epoch": 0.92, "learning_rate": 8.477175622387562e-07, "loss": 0.0269, "step": 750 }, { "epoch": 0.92, "learning_rate": 7.282231436604698e-07, "loss": 0.0014, "step": 755 }, { "epoch": 0.93, "learning_rate": 6.176786908021453e-07, "loss": 0.0012, "step": 760 }, { "epoch": 0.93, "learning_rate": 5.161249657830686e-07, "loss": 0.0019, "step": 765 }, { "epoch": 0.94, "learning_rate": 4.2359941548499037e-07, "loss": 0.0036, "step": 770 }, { "epoch": 0.95, "learning_rate": 3.4013615774395323e-07, "loss": 0.0073, "step": 775 }, { "epoch": 0.95, "learning_rate": 2.657659687697156e-07, "loss": 0.0022, "step": 780 }, { "epoch": 0.96, "learning_rate": 2.0051627179733247e-07, "loss": 0.0021, "step": 785 }, { "epoch": 0.96, "learning_rate": 1.444111269751164e-07, "loss": 0.0007, "step": 790 }, { "epoch": 0.97, "learning_rate": 9.747122249273156e-08, "loss": 0.0016, "step": 795 }, { "epoch": 0.98, "learning_rate": 5.971386695260706e-08, "loss": 0.0051, "step": 800 }, { "epoch": 0.98, "learning_rate": 3.1152982987595056e-08, "loss": 0.0021, "step": 805 }, { "epoch": 0.99, "learning_rate": 1.1799102127130202e-08, "loss": 0.0054, "step": 810 }, { "epoch": 1.0, "learning_rate": 1.6593609138415567e-09, "loss": 0.0027, "step": 815 }, { "epoch": 1.0, "step": 818, "total_flos": 1.474472015387689e+17, "train_loss": 0.04709457870986094, "train_runtime": 3992.6154, "train_samples_per_second": 6.561, "train_steps_per_second": 0.205 } ], "logging_steps": 5, "max_steps": 818, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 1.474472015387689e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }