{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1268, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 3.937007874015748e-08, "loss": 5.1243, "step": 10 }, { "epoch": 0.06, "learning_rate": 7.874015748031496e-08, "loss": 4.9463, "step": 20 }, { "epoch": 0.09, "learning_rate": 1.1811023622047243e-07, "loss": 4.0023, "step": 30 }, { "epoch": 0.13, "learning_rate": 1.5748031496062992e-07, "loss": 2.1698, "step": 40 }, { "epoch": 0.16, "learning_rate": 1.968503937007874e-07, "loss": 0.7212, "step": 50 }, { "epoch": 0.19, "learning_rate": 2.3622047244094486e-07, "loss": 0.2513, "step": 60 }, { "epoch": 0.22, "learning_rate": 2.7559055118110235e-07, "loss": 0.1898, "step": 70 }, { "epoch": 0.25, "learning_rate": 3.1496062992125984e-07, "loss": 0.1545, "step": 80 }, { "epoch": 0.28, "learning_rate": 3.5433070866141727e-07, "loss": 0.1172, "step": 90 }, { "epoch": 0.32, "learning_rate": 3.937007874015748e-07, "loss": 0.0956, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.330708661417323e-07, "loss": 0.1053, "step": 110 }, { "epoch": 0.38, "learning_rate": 4.7244094488188973e-07, "loss": 0.0706, "step": 120 }, { "epoch": 0.41, "learning_rate": 4.999914713850033e-07, "loss": 0.0664, "step": 130 }, { "epoch": 0.44, "learning_rate": 4.998398677500815e-07, "loss": 0.0616, "step": 140 }, { "epoch": 0.47, "learning_rate": 4.994988716220417e-07, "loss": 0.0576, "step": 150 }, { "epoch": 0.5, "learning_rate": 4.989687414948845e-07, "loss": 0.0568, "step": 160 }, { "epoch": 0.54, "learning_rate": 4.982498792366836e-07, "loss": 0.058, "step": 170 }, { "epoch": 0.57, "learning_rate": 4.973428297849485e-07, "loss": 0.0586, "step": 180 }, { "epoch": 0.6, "learning_rate": 4.962482807335314e-07, "loss": 0.0677, "step": 190 }, { "epoch": 0.63, "learning_rate": 4.949670618113922e-07, "loss": 0.0617, "step": 200 }, { "epoch": 0.66, "learning_rate": 4.935001442536189e-07, "loss": 0.0571, "step": 210 }, { "epoch": 0.69, "learning_rate": 4.918486400651768e-07, "loss": 0.0584, "step": 220 }, { "epoch": 0.73, "learning_rate": 4.90013801177947e-07, "loss": 0.0596, "step": 230 }, { "epoch": 0.76, "learning_rate": 4.87997018501693e-07, "loss": 0.0536, "step": 240 }, { "epoch": 0.79, "learning_rate": 4.857998208696731e-07, "loss": 0.0569, "step": 250 }, { "epoch": 0.82, "learning_rate": 4.83423873879701e-07, "loss": 0.0594, "step": 260 }, { "epoch": 0.85, "learning_rate": 4.808709786315288e-07, "loss": 0.0572, "step": 270 }, { "epoch": 0.88, "learning_rate": 4.781430703615145e-07, "loss": 0.0522, "step": 280 }, { "epoch": 0.91, "learning_rate": 4.752422169756047e-07, "loss": 0.057, "step": 290 }, { "epoch": 0.95, "learning_rate": 4.721706174817478e-07, "loss": 0.0544, "step": 300 }, { "epoch": 0.98, "learning_rate": 4.6893060032292275e-07, "loss": 0.0524, "step": 310 }, { "epoch": 1.0, "eval_loss": 0.050160180777311325, "eval_runtime": 21.3363, "eval_samples_per_second": 9.655, "eval_steps_per_second": 0.844, "step": 317 }, { "epoch": 1.01, "learning_rate": 4.6552462161205106e-07, "loss": 0.0482, "step": 320 }, { "epoch": 1.04, "learning_rate": 4.6195526327012623e-07, "loss": 0.041, "step": 330 }, { "epoch": 1.07, "learning_rate": 4.582252310689744e-07, "loss": 0.0498, "step": 340 }, { "epoch": 1.1, "learning_rate": 4.5433735258012895e-07, "loss": 0.0522, "step": 350 }, { "epoch": 1.14, "learning_rate": 4.502945750313747e-07, "loss": 0.0469, "step": 360 }, { "epoch": 1.17, "learning_rate": 4.4609996307258503e-07, "loss": 0.0454, "step": 370 }, { "epoch": 1.2, "learning_rate": 4.417566964525473e-07, "loss": 0.0617, "step": 380 }, { "epoch": 1.23, "learning_rate": 4.37268067608537e-07, "loss": 0.0438, "step": 390 }, { "epoch": 1.26, "learning_rate": 4.3263747917046697e-07, "loss": 0.0507, "step": 400 }, { "epoch": 1.29, "learning_rate": 4.278684413815052e-07, "loss": 0.0453, "step": 410 }, { "epoch": 1.32, "learning_rate": 4.22964569437116e-07, "loss": 0.0401, "step": 420 }, { "epoch": 1.36, "learning_rate": 4.179295807445402e-07, "loss": 0.0452, "step": 430 }, { "epoch": 1.39, "learning_rate": 4.127672921047948e-07, "loss": 0.0448, "step": 440 }, { "epoch": 1.42, "learning_rate": 4.074816168193256e-07, "loss": 0.0459, "step": 450 }, { "epoch": 1.45, "learning_rate": 4.0207656172350736e-07, "loss": 0.04, "step": 460 }, { "epoch": 1.48, "learning_rate": 3.9655622414924007e-07, "loss": 0.0531, "step": 470 }, { "epoch": 1.51, "learning_rate": 3.90924788818944e-07, "loss": 0.048, "step": 480 }, { "epoch": 1.55, "learning_rate": 3.851865246733077e-07, "loss": 0.0423, "step": 490 }, { "epoch": 1.58, "learning_rate": 3.793457816351937e-07, "loss": 0.039, "step": 500 }, { "epoch": 1.61, "learning_rate": 3.734069873121555e-07, "loss": 0.045, "step": 510 }, { "epoch": 1.64, "learning_rate": 3.673746436400655e-07, "loss": 0.0425, "step": 520 }, { "epoch": 1.67, "learning_rate": 3.6125332347039733e-07, "loss": 0.0451, "step": 530 }, { "epoch": 1.7, "learning_rate": 3.5504766710375047e-07, "loss": 0.0496, "step": 540 }, { "epoch": 1.74, "learning_rate": 3.487623787722448e-07, "loss": 0.0435, "step": 550 }, { "epoch": 1.77, "learning_rate": 3.424022230734516e-07, "loss": 0.0375, "step": 560 }, { "epoch": 1.8, "learning_rate": 3.3597202135856375e-07, "loss": 0.0279, "step": 570 }, { "epoch": 1.83, "learning_rate": 3.2947664807754393e-07, "loss": 0.0512, "step": 580 }, { "epoch": 1.86, "learning_rate": 3.2292102708402085e-07, "loss": 0.0386, "step": 590 }, { "epoch": 1.89, "learning_rate": 3.1631012790273483e-07, "loss": 0.0461, "step": 600 }, { "epoch": 1.92, "learning_rate": 3.096489619623621e-07, "loss": 0.0417, "step": 610 }, { "epoch": 1.96, "learning_rate": 3.0294257879657347e-07, "loss": 0.0376, "step": 620 }, { "epoch": 1.99, "learning_rate": 2.9619606221620736e-07, "loss": 0.0466, "step": 630 }, { "epoch": 2.0, "eval_loss": 0.04205736145377159, "eval_runtime": 21.3091, "eval_samples_per_second": 9.667, "eval_steps_per_second": 0.845, "step": 634 }, { "epoch": 2.02, "learning_rate": 2.894145264554593e-07, "loss": 0.0378, "step": 640 }, { "epoch": 2.05, "learning_rate": 2.82603112295008e-07, "loss": 0.0223, "step": 650 }, { "epoch": 2.08, "learning_rate": 2.757669831650181e-07, "loss": 0.0323, "step": 660 }, { "epoch": 2.11, "learning_rate": 2.6891132123097383e-07, "loss": 0.0326, "step": 670 }, { "epoch": 2.15, "learning_rate": 2.620413234653093e-07, "loss": 0.0242, "step": 680 }, { "epoch": 2.18, "learning_rate": 2.551621977078152e-07, "loss": 0.0217, "step": 690 }, { "epoch": 2.21, "learning_rate": 2.4827915871780665e-07, "loss": 0.0236, "step": 700 }, { "epoch": 2.24, "learning_rate": 2.413974242210468e-07, "loss": 0.0211, "step": 710 }, { "epoch": 2.27, "learning_rate": 2.3452221095441987e-07, "loss": 0.0246, "step": 720 }, { "epoch": 2.3, "learning_rate": 2.2765873071135604e-07, "loss": 0.0284, "step": 730 }, { "epoch": 2.33, "learning_rate": 2.2081218639100186e-07, "loss": 0.0239, "step": 740 }, { "epoch": 2.37, "learning_rate": 2.1398776805413398e-07, "loss": 0.0226, "step": 750 }, { "epoch": 2.4, "learning_rate": 2.0719064898880494e-07, "loss": 0.0279, "step": 760 }, { "epoch": 2.43, "learning_rate": 2.0042598178870386e-07, "loss": 0.0276, "step": 770 }, { "epoch": 2.46, "learning_rate": 1.936988944472036e-07, "loss": 0.0255, "step": 780 }, { "epoch": 2.49, "learning_rate": 1.8701448647005768e-07, "loss": 0.0251, "step": 790 }, { "epoch": 2.52, "learning_rate": 1.8037782500969133e-07, "loss": 0.0302, "step": 800 }, { "epoch": 2.56, "learning_rate": 1.7379394102401884e-07, "loss": 0.0193, "step": 810 }, { "epoch": 2.59, "learning_rate": 1.672678254626979e-07, "loss": 0.0212, "step": 820 }, { "epoch": 2.62, "learning_rate": 1.60804425483713e-07, "loss": 0.0284, "step": 830 }, { "epoch": 2.65, "learning_rate": 1.5440864070315438e-07, "loss": 0.025, "step": 840 }, { "epoch": 2.68, "learning_rate": 1.480853194810373e-07, "loss": 0.0188, "step": 850 }, { "epoch": 2.71, "learning_rate": 1.4183925524597612e-07, "loss": 0.0266, "step": 860 }, { "epoch": 2.74, "learning_rate": 1.356751828614987e-07, "loss": 0.0243, "step": 870 }, { "epoch": 2.78, "learning_rate": 1.2959777503675736e-07, "loss": 0.0216, "step": 880 }, { "epoch": 2.81, "learning_rate": 1.2361163878435594e-07, "loss": 0.0219, "step": 890 }, { "epoch": 2.84, "learning_rate": 1.177213119279779e-07, "loss": 0.0232, "step": 900 }, { "epoch": 2.87, "learning_rate": 1.1193125966246453e-07, "loss": 0.019, "step": 910 }, { "epoch": 2.9, "learning_rate": 1.0624587116894887e-07, "loss": 0.0182, "step": 920 }, { "epoch": 2.93, "learning_rate": 1.006694562876133e-07, "loss": 0.0248, "step": 930 }, { "epoch": 2.97, "learning_rate": 9.52062422505906e-08, "loss": 0.0207, "step": 940 }, { "epoch": 3.0, "learning_rate": 8.986037047748768e-08, "loss": 0.0158, "step": 950 }, { "epoch": 3.0, "eval_loss": 0.050269413739442825, "eval_runtime": 21.2804, "eval_samples_per_second": 9.68, "eval_steps_per_second": 0.846, "step": 951 }, { "epoch": 3.03, "learning_rate": 8.463589343595976e-08, "loss": 0.0099, "step": 960 }, { "epoch": 3.06, "learning_rate": 7.953677156971433e-08, "loss": 0.0086, "step": 970 }, { "epoch": 3.09, "learning_rate": 7.456687029627526e-08, "loss": 0.0075, "step": 980 }, { "epoch": 3.12, "learning_rate": 6.972995707678153e-08, "loss": 0.0064, "step": 990 }, { "epoch": 3.15, "learning_rate": 6.502969856004251e-08, "loss": 0.0073, "step": 1000 }, { "epoch": 3.19, "learning_rate": 6.046965780301483e-08, "loss": 0.0114, "step": 1010 }, { "epoch": 3.22, "learning_rate": 5.605329156980679e-08, "loss": 0.0099, "step": 1020 }, { "epoch": 3.25, "learning_rate": 5.178394771125968e-08, "loss": 0.0052, "step": 1030 }, { "epoch": 3.28, "learning_rate": 4.7664862627090744e-08, "loss": 0.0138, "step": 1040 }, { "epoch": 3.31, "learning_rate": 4.369915881252259e-08, "loss": 0.0046, "step": 1050 }, { "epoch": 3.34, "learning_rate": 3.988984249125821e-08, "loss": 0.0021, "step": 1060 }, { "epoch": 3.38, "learning_rate": 3.623980133659685e-08, "loss": 0.009, "step": 1070 }, { "epoch": 3.41, "learning_rate": 3.2751802282416915e-08, "loss": 0.0032, "step": 1080 }, { "epoch": 3.44, "learning_rate": 2.9428489425686626e-08, "loss": 0.0074, "step": 1090 }, { "epoch": 3.47, "learning_rate": 2.62723820220917e-08, "loss": 0.0035, "step": 1100 }, { "epoch": 3.5, "learning_rate": 2.3285872576299604e-08, "loss": 0.0069, "step": 1110 }, { "epoch": 3.53, "learning_rate": 2.0471225028308226e-08, "loss": 0.0034, "step": 1120 }, { "epoch": 3.56, "learning_rate": 1.783057303725355e-08, "loss": 0.0171, "step": 1130 }, { "epoch": 3.6, "learning_rate": 1.536591836397763e-08, "loss": 0.0068, "step": 1140 }, { "epoch": 3.63, "learning_rate": 1.3079129353582658e-08, "loss": 0.0121, "step": 1150 }, { "epoch": 3.66, "learning_rate": 1.0971939519121648e-08, "loss": 0.0122, "step": 1160 }, { "epoch": 3.69, "learning_rate": 9.045946227499297e-09, "loss": 0.0079, "step": 1170 }, { "epoch": 3.72, "learning_rate": 7.302609488579248e-09, "loss": 0.0103, "step": 1180 }, { "epoch": 3.75, "learning_rate": 5.743250848415343e-09, "loss": 0.0051, "step": 1190 }, { "epoch": 3.79, "learning_rate": 4.369052387446476e-09, "loss": 0.0066, "step": 1200 }, { "epoch": 3.82, "learning_rate": 3.1810558244139695e-09, "loss": 0.0051, "step": 1210 }, { "epoch": 3.85, "learning_rate": 2.1801617266808957e-09, "loss": 0.0064, "step": 1220 }, { "epoch": 3.88, "learning_rate": 1.3671288275519666e-09, "loss": 0.0085, "step": 1230 }, { "epoch": 3.91, "learning_rate": 7.425734511116998e-10, "loss": 0.0056, "step": 1240 }, { "epoch": 3.94, "learning_rate": 3.06969045016342e-10, "loss": 0.0072, "step": 1250 }, { "epoch": 3.97, "learning_rate": 6.064582159442944e-11, "loss": 0.0047, "step": 1260 }, { "epoch": 4.0, "eval_loss": 0.06437445431947708, "eval_runtime": 21.3091, "eval_samples_per_second": 9.667, "eval_steps_per_second": 0.845, "step": 1268 }, { "epoch": 4.0, "step": 1268, "total_flos": 72924899573760.0, "train_loss": 0.16973004465752314, "train_runtime": 6166.9886, "train_samples_per_second": 2.463, "train_steps_per_second": 0.206 } ], "logging_steps": 10, "max_steps": 1268, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 72924899573760.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }