{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 724, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 0.00029997740600876073, "loss": 1.0216, "step": 5 }, { "epoch": 0.03, "learning_rate": 0.00029988562958468534, "loss": 2.5373, "step": 10 }, { "epoch": 0.04, "learning_rate": 0.0002997233017693859, "loss": 1.7185, "step": 15 }, { "epoch": 0.06, "learning_rate": 0.0002995426927466334, "loss": 2.0243, "step": 20 }, { "epoch": 0.07, "learning_rate": 0.0002992535872688052, "loss": 2.2346, "step": 25 }, { "epoch": 0.08, "learning_rate": 0.0002988942279030214, "loss": 1.9241, "step": 30 }, { "epoch": 0.1, "learning_rate": 0.00029855627017138267, "loss": 1.8485, "step": 35 }, { "epoch": 0.11, "learning_rate": 0.00029827342660583437, "loss": 2.1848, "step": 40 }, { "epoch": 0.12, "learning_rate": 0.0002979654571005535, "loss": 5.0877, "step": 45 }, { "epoch": 0.14, "learning_rate": 0.0002973964828376827, "loss": 1.7997, "step": 50 }, { "epoch": 0.15, "learning_rate": 0.00029675812882870453, "loss": 1.7931, "step": 55 }, { "epoch": 0.17, "learning_rate": 0.000296050695547811, "loss": 1.2798, "step": 60 }, { "epoch": 0.18, "learning_rate": 0.00029527451598491093, "loss": 1.6239, "step": 65 }, { "epoch": 0.19, "learning_rate": 0.0002944299554888919, "loss": 0.7037, "step": 70 }, { "epoch": 0.21, "learning_rate": 0.00029351741159564925, "loss": 1.9876, "step": 75 }, { "epoch": 0.22, "learning_rate": 0.00029253731384096574, "loss": 1.3241, "step": 80 }, { "epoch": 0.23, "learning_rate": 0.00029149012355832816, "loss": 1.7376, "step": 85 }, { "epoch": 0.25, "learning_rate": 0.00029037633366177726, "loss": 1.8011, "step": 90 }, { "epoch": 0.26, "learning_rate": 0.00028919646841389207, "loss": 1.0738, "step": 95 }, { "epoch": 0.28, "learning_rate": 0.0002879510831790189, "loss": 1.0997, "step": 100 }, { "epoch": 0.29, "learning_rate": 0.0002866407641618605, "loss": 1.7646, "step": 105 }, { "epoch": 0.3, "learning_rate": 0.0002852661281315485, "loss": 1.114, "step": 110 }, { "epoch": 0.32, "learning_rate": 0.00028382782213132933, "loss": 1.6206, "step": 115 }, { "epoch": 0.33, "learning_rate": 0.0002823265231740001, "loss": 1.9232, "step": 120 }, { "epoch": 0.35, "learning_rate": 0.00028076293792323744, "loss": 1.9719, "step": 125 }, { "epoch": 0.36, "learning_rate": 0.00027913780236097003, "loss": 1.5214, "step": 130 }, { "epoch": 0.37, "learning_rate": 0.0002774518814409507, "loss": 1.7809, "step": 135 }, { "epoch": 0.39, "learning_rate": 0.0002757059687286915, "loss": 1.7594, "step": 140 }, { "epoch": 0.4, "learning_rate": 0.0002739008860279314, "loss": 1.888, "step": 145 }, { "epoch": 0.41, "learning_rate": 0.0002720374829938119, "loss": 1.3209, "step": 150 }, { "epoch": 0.43, "learning_rate": 0.00027011663673294316, "loss": 1.668, "step": 155 }, { "epoch": 0.44, "learning_rate": 0.00026813925139054844, "loss": 1.5554, "step": 160 }, { "epoch": 0.46, "learning_rate": 0.0002661062577248818, "loss": 1.8724, "step": 165 }, { "epoch": 0.47, "learning_rate": 0.0002640186126691185, "loss": 1.1475, "step": 170 }, { "epoch": 0.48, "learning_rate": 0.0002618772988809255, "loss": 1.5851, "step": 175 }, { "epoch": 0.5, "learning_rate": 0.0002596833242799227, "loss": 1.1434, "step": 180 }, { "epoch": 0.51, "learning_rate": 0.0002574377215732538, "loss": 2.0189, "step": 185 }, { "epoch": 0.52, "learning_rate": 0.0002551415477694895, "loss": 1.1961, "step": 190 }, { "epoch": 0.54, "learning_rate": 0.000252795883681092, "loss": 1.626, "step": 195 }, { "epoch": 0.55, "learning_rate": 0.0002504018334156746, "loss": 1.6993, "step": 200 }, { "epoch": 0.57, "learning_rate": 0.00024796052385629735, "loss": 1.3994, "step": 205 }, { "epoch": 0.58, "learning_rate": 0.00024547310413104055, "loss": 1.0801, "step": 210 }, { "epoch": 0.59, "learning_rate": 0.0002429407450721093, "loss": 1.9244, "step": 215 }, { "epoch": 0.61, "learning_rate": 0.00024036463866472033, "loss": 1.3285, "step": 220 }, { "epoch": 0.62, "learning_rate": 0.00023774599748603331, "loss": 1.3708, "step": 225 }, { "epoch": 0.64, "learning_rate": 0.0002350860541343886, "loss": 1.2642, "step": 230 }, { "epoch": 0.65, "learning_rate": 0.0002323860606491215, "loss": 1.2535, "step": 235 }, { "epoch": 0.66, "learning_rate": 0.0002296472879212254, "loss": 1.7609, "step": 240 }, { "epoch": 0.68, "learning_rate": 0.00022687102509514156, "loss": 1.7628, "step": 245 }, { "epoch": 0.69, "learning_rate": 0.00022405857896195673, "loss": 1.3396, "step": 250 }, { "epoch": 0.7, "learning_rate": 0.0002212112733442949, "loss": 1.516, "step": 255 }, { "epoch": 0.72, "learning_rate": 0.00021833044847319164, "loss": 1.7484, "step": 260 }, { "epoch": 0.73, "learning_rate": 0.0002154174603572456, "loss": 0.7856, "step": 265 }, { "epoch": 0.75, "learning_rate": 0.00021247368014434316, "loss": 1.8457, "step": 270 }, { "epoch": 0.76, "learning_rate": 0.000209500493476257, "loss": 0.8756, "step": 275 }, { "epoch": 0.77, "learning_rate": 0.00020649929983642242, "loss": 1.5258, "step": 280 }, { "epoch": 0.79, "learning_rate": 0.00020347151189119853, "loss": 1.3208, "step": 285 }, { "epoch": 0.8, "learning_rate": 0.00020041855482492402, "loss": 1.3849, "step": 290 }, { "epoch": 0.81, "learning_rate": 0.00019734186566908047, "loss": 1.3585, "step": 295 }, { "epoch": 0.83, "learning_rate": 0.00019424289262587994, "loss": 2.0685, "step": 300 }, { "epoch": 0.84, "learning_rate": 0.00019112309438659375, "loss": 0.9159, "step": 305 }, { "epoch": 0.86, "learning_rate": 0.0001879839394449442, "loss": 1.5168, "step": 310 }, { "epoch": 0.87, "learning_rate": 0.0001848269054058826, "loss": 1.4267, "step": 315 }, { "epoch": 0.88, "learning_rate": 0.00018165347829007832, "loss": 1.4472, "step": 320 }, { "epoch": 0.9, "learning_rate": 0.00017846515183444648, "loss": 2.0622, "step": 325 }, { "epoch": 0.91, "learning_rate": 0.00017526342678904396, "loss": 1.1134, "step": 330 }, { "epoch": 0.93, "learning_rate": 0.00017204981021066401, "loss": 1.7789, "step": 335 }, { "epoch": 0.94, "learning_rate": 0.00016882581475346223, "loss": 0.9645, "step": 340 }, { "epoch": 0.95, "learning_rate": 0.00016559295795694825, "loss": 2.0169, "step": 345 }, { "epoch": 0.97, "learning_rate": 0.00016235276153167723, "loss": 1.1319, "step": 350 }, { "epoch": 0.98, "learning_rate": 0.00015910675064297877, "loss": 1.7811, "step": 355 }, { "epoch": 0.99, "learning_rate": 0.00015585645319305912, "loss": 1.1135, "step": 360 }, { "epoch": 1.01, "learning_rate": 0.00015260339910181528, "loss": 0.9838, "step": 365 }, { "epoch": 1.02, "learning_rate": 0.00014934911958669966, "loss": 2.0867, "step": 370 }, { "epoch": 1.04, "learning_rate": 0.0001460951464419732, "loss": 1.8194, "step": 375 }, { "epoch": 1.05, "learning_rate": 0.00014284301131768816, "loss": 0.7124, "step": 380 }, { "epoch": 1.06, "learning_rate": 0.00013959424499873757, "loss": 0.8251, "step": 385 }, { "epoch": 1.08, "learning_rate": 0.00013635037668431298, "loss": 1.6568, "step": 390 }, { "epoch": 1.09, "learning_rate": 0.00013311293326810758, "loss": 0.9041, "step": 395 }, { "epoch": 1.1, "learning_rate": 0.00012988343861960544, "loss": 0.5734, "step": 400 }, { "epoch": 1.12, "learning_rate": 0.00012666341286679376, "loss": 1.5422, "step": 405 }, { "epoch": 1.13, "learning_rate": 0.00012345437168063614, "loss": 1.0433, "step": 410 }, { "epoch": 1.15, "learning_rate": 0.00012025782556164437, "loss": 0.6305, "step": 415 }, { "epoch": 1.16, "learning_rate": 0.00011707527912888316, "loss": 0.7021, "step": 420 }, { "epoch": 1.17, "learning_rate": 0.00011390823041174384, "loss": 1.7926, "step": 425 }, { "epoch": 1.19, "learning_rate": 0.00011075817014481984, "loss": 0.7126, "step": 430 }, { "epoch": 1.2, "learning_rate": 0.00010762658106621541, "loss": 0.9281, "step": 435 }, { "epoch": 1.22, "learning_rate": 0.00010451493721961883, "loss": 1.2706, "step": 440 }, { "epoch": 1.23, "learning_rate": 0.00010142470326046745, "loss": 0.7369, "step": 445 }, { "epoch": 1.24, "learning_rate": 9.835733376653254e-05, "loss": 1.3282, "step": 450 }, { "epoch": 1.26, "learning_rate": 9.531427255324734e-05, "loss": 1.5328, "step": 455 }, { "epoch": 1.27, "learning_rate": 9.229695199410055e-05, "loss": 0.81, "step": 460 }, { "epoch": 1.28, "learning_rate": 8.930679234641647e-05, "loss": 1.4783, "step": 465 }, { "epoch": 1.3, "learning_rate": 8.634520108283753e-05, "loss": 0.9337, "step": 470 }, { "epoch": 1.31, "learning_rate": 8.341357222882468e-05, "loss": 0.7309, "step": 475 }, { "epoch": 1.33, "learning_rate": 8.051328570648772e-05, "loss": 0.5978, "step": 480 }, { "epoch": 1.34, "learning_rate": 7.764570668505381e-05, "loss": 0.7613, "step": 485 }, { "epoch": 1.35, "learning_rate": 7.481218493828025e-05, "loss": 0.9437, "step": 490 }, { "epoch": 1.37, "learning_rate": 7.201405420911382e-05, "loss": 1.6472, "step": 495 }, { "epoch": 1.38, "learning_rate": 6.925263158189641e-05, "loss": 0.5782, "step": 500 }, { "epoch": 1.4, "learning_rate": 6.652921686241072e-05, "loss": 0.6908, "step": 505 }, { "epoch": 1.41, "learning_rate": 6.384509196606036e-05, "loss": 0.912, "step": 510 }, { "epoch": 1.42, "learning_rate": 6.120152031447023e-05, "loss": 1.3509, "step": 515 }, { "epoch": 1.44, "learning_rate": 5.859974624079157e-05, "loss": 1.2476, "step": 520 }, { "epoch": 1.45, "learning_rate": 5.604099440399325e-05, "loss": 1.3992, "step": 525 }, { "epoch": 1.46, "learning_rate": 5.352646921241234e-05, "loss": 0.8079, "step": 530 }, { "epoch": 1.48, "learning_rate": 5.1057354256837416e-05, "loss": 1.3463, "step": 535 }, { "epoch": 1.49, "learning_rate": 4.8634811753390915e-05, "loss": 0.5269, "step": 540 }, { "epoch": 1.51, "learning_rate": 4.6259981996471896e-05, "loss": 0.9968, "step": 545 }, { "epoch": 1.52, "learning_rate": 4.3933982822017876e-05, "loss": 0.4794, "step": 550 }, { "epoch": 1.53, "learning_rate": 4.165790908133783e-05, "loss": 1.77, "step": 555 }, { "epoch": 1.55, "learning_rate": 3.943283212576392e-05, "loss": 1.4066, "step": 560 }, { "epoch": 1.56, "learning_rate": 3.725979930236496e-05, "loss": 1.0347, "step": 565 }, { "epoch": 1.57, "learning_rate": 3.513983346095838e-05, "loss": 0.3513, "step": 570 }, { "epoch": 1.59, "learning_rate": 3.307393247265362e-05, "loss": 0.387, "step": 575 }, { "epoch": 1.6, "learning_rate": 3.106306876015212e-05, "loss": 0.9932, "step": 580 }, { "epoch": 1.62, "learning_rate": 2.9108188840027074e-05, "loss": 0.7613, "step": 585 }, { "epoch": 1.63, "learning_rate": 2.72102128771965e-05, "loss": 1.3139, "step": 590 }, { "epoch": 1.64, "learning_rate": 2.5370034251799865e-05, "loss": 1.3463, "step": 595 }, { "epoch": 1.66, "learning_rate": 2.3588519138683276e-05, "loss": 1.3703, "step": 600 }, { "epoch": 1.67, "learning_rate": 2.18665060996893e-05, "loss": 0.3998, "step": 605 }, { "epoch": 1.69, "learning_rate": 2.020480568894461e-05, "loss": 1.1889, "step": 610 }, { "epoch": 1.7, "learning_rate": 1.860420007133131e-05, "loss": 0.8979, "step": 615 }, { "epoch": 1.71, "learning_rate": 1.7065442654320334e-05, "loss": 1.1826, "step": 620 }, { "epoch": 1.73, "learning_rate": 1.5589257733341694e-05, "loss": 0.5617, "step": 625 }, { "epoch": 1.74, "learning_rate": 1.417634015085734e-05, "loss": 0.8612, "step": 630 }, { "epoch": 1.75, "learning_rate": 1.2827354969297953e-05, "loss": 1.152, "step": 635 }, { "epoch": 1.77, "learning_rate": 1.1542937158016895e-05, "loss": 1.5859, "step": 640 }, { "epoch": 1.78, "learning_rate": 1.032369129440928e-05, "loss": 1.3262, "step": 645 }, { "epoch": 1.8, "learning_rate": 9.170191279336458e-06, "loss": 0.7804, "step": 650 }, { "epoch": 1.81, "learning_rate": 8.082980066989987e-06, "loss": 0.6581, "step": 655 }, { "epoch": 1.82, "learning_rate": 7.062569409322394e-06, "loss": 0.7062, "step": 660 }, { "epoch": 1.84, "learning_rate": 6.109439615164524e-06, "loss": 0.5249, "step": 665 }, { "epoch": 1.85, "learning_rate": 5.224039324143692e-06, "loss": 0.6158, "step": 670 }, { "epoch": 1.86, "learning_rate": 4.406785295508275e-06, "loss": 1.0754, "step": 675 }, { "epoch": 1.88, "learning_rate": 3.6580622119583847e-06, "loss": 1.3947, "step": 680 }, { "epoch": 1.89, "learning_rate": 2.9782224985753224e-06, "loss": 1.1482, "step": 685 }, { "epoch": 1.91, "learning_rate": 2.367586156934409e-06, "loss": 0.7185, "step": 690 }, { "epoch": 1.92, "learning_rate": 1.8264406144798238e-06, "loss": 0.2661, "step": 695 }, { "epoch": 1.93, "learning_rate": 1.3550405892321591e-06, "loss": 1.0511, "step": 700 }, { "epoch": 1.95, "learning_rate": 9.536079698920951e-07, "loss": 0.7304, "step": 705 }, { "epoch": 1.96, "learning_rate": 6.223317113971948e-07, "loss": 1.0044, "step": 710 }, { "epoch": 1.98, "learning_rate": 3.6136774598043427e-07, "loss": 0.4553, "step": 715 }, { "epoch": 1.99, "learning_rate": 1.7083890977278027e-07, "loss": 0.9209, "step": 720 }, { "epoch": 2.0, "step": 724, "total_flos": 2638293095350272.0, "train_loss": 1.3001856657352238, "train_runtime": 3307.5677, "train_samples_per_second": 0.219, "train_steps_per_second": 0.219 } ], "logging_steps": 5, "max_steps": 724, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 2638293095350272.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }