{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.6410923276983094, "eval_steps": 50, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06501950585175553, "grad_norm": 5.187023639678955, "learning_rate": 1.3003901170351107e-06, "loss": 1.7674, "step": 50 }, { "epoch": 0.06501950585175553, "eval_loss": 0.9484056234359741, "eval_runtime": 12.4787, "eval_samples_per_second": 12.501, "eval_steps_per_second": 0.641, "step": 50 }, { "epoch": 0.13003901170351106, "grad_norm": 8.135283470153809, "learning_rate": 2.6007802340702214e-06, "loss": 0.7724, "step": 100 }, { "epoch": 0.13003901170351106, "eval_loss": 0.7867242693901062, "eval_runtime": 12.7026, "eval_samples_per_second": 12.281, "eval_steps_per_second": 0.63, "step": 100 }, { "epoch": 0.19505851755526657, "grad_norm": 4.912701606750488, "learning_rate": 3.901170351105332e-06, "loss": 0.6915, "step": 150 }, { "epoch": 0.19505851755526657, "eval_loss": 0.7557621598243713, "eval_runtime": 12.7456, "eval_samples_per_second": 12.24, "eval_steps_per_second": 0.628, "step": 150 }, { "epoch": 0.26007802340702213, "grad_norm": 3.022888660430908, "learning_rate": 5.201560468140443e-06, "loss": 0.7283, "step": 200 }, { "epoch": 0.26007802340702213, "eval_loss": 0.74037104845047, "eval_runtime": 12.7356, "eval_samples_per_second": 12.249, "eval_steps_per_second": 0.628, "step": 200 }, { "epoch": 0.3250975292587776, "grad_norm": 2.986654043197632, "learning_rate": 6.501950585175553e-06, "loss": 0.6971, "step": 250 }, { "epoch": 0.3250975292587776, "eval_loss": 0.7387701869010925, "eval_runtime": 12.6845, "eval_samples_per_second": 12.299, "eval_steps_per_second": 0.631, "step": 250 }, { "epoch": 0.39011703511053314, "grad_norm": 1.9750109910964966, "learning_rate": 7.802340702210663e-06, "loss": 0.6829, "step": 300 }, { "epoch": 0.39011703511053314, "eval_loss": 0.7368441820144653, "eval_runtime": 12.732, "eval_samples_per_second": 12.253, "eval_steps_per_second": 0.628, "step": 300 }, { "epoch": 0.45513654096228867, "grad_norm": 1.951280117034912, "learning_rate": 9.102730819245774e-06, "loss": 0.6675, "step": 350 }, { "epoch": 0.45513654096228867, "eval_loss": 0.7451388835906982, "eval_runtime": 12.7327, "eval_samples_per_second": 12.252, "eval_steps_per_second": 0.628, "step": 350 }, { "epoch": 0.5201560468140443, "grad_norm": 1.559377670288086, "learning_rate": 1.0403120936280886e-05, "loss": 0.7137, "step": 400 }, { "epoch": 0.5201560468140443, "eval_loss": 0.7458119988441467, "eval_runtime": 12.7092, "eval_samples_per_second": 12.275, "eval_steps_per_second": 0.629, "step": 400 }, { "epoch": 0.5851755526657998, "grad_norm": 2.116973400115967, "learning_rate": 1.1703511053315997e-05, "loss": 0.682, "step": 450 }, { "epoch": 0.5851755526657998, "eval_loss": 0.7493389844894409, "eval_runtime": 12.4729, "eval_samples_per_second": 12.507, "eval_steps_per_second": 0.641, "step": 450 }, { "epoch": 0.6501950585175552, "grad_norm": 3.4941141605377197, "learning_rate": 1.3003901170351106e-05, "loss": 0.7098, "step": 500 }, { "epoch": 0.6501950585175552, "eval_loss": 0.749406635761261, "eval_runtime": 12.7237, "eval_samples_per_second": 12.261, "eval_steps_per_second": 0.629, "step": 500 }, { "epoch": 0.7152145643693107, "grad_norm": 1.5601381063461304, "learning_rate": 1.4304291287386216e-05, "loss": 0.7018, "step": 550 }, { "epoch": 0.7152145643693107, "eval_loss": 0.7575533390045166, "eval_runtime": 12.6265, "eval_samples_per_second": 12.355, "eval_steps_per_second": 0.634, "step": 550 }, { "epoch": 0.7802340702210663, "grad_norm": 1.6351755857467651, "learning_rate": 1.5604681404421327e-05, "loss": 0.6858, "step": 600 }, { "epoch": 0.7802340702210663, "eval_loss": 0.7638020515441895, "eval_runtime": 12.728, "eval_samples_per_second": 12.256, "eval_steps_per_second": 0.629, "step": 600 }, { "epoch": 0.8452535760728218, "grad_norm": 2.4388427734375, "learning_rate": 1.6905071521456436e-05, "loss": 0.7211, "step": 650 }, { "epoch": 0.8452535760728218, "eval_loss": 0.7665178179740906, "eval_runtime": 12.7306, "eval_samples_per_second": 12.254, "eval_steps_per_second": 0.628, "step": 650 }, { "epoch": 0.9102730819245773, "grad_norm": 1.3347229957580566, "learning_rate": 1.820546163849155e-05, "loss": 0.7241, "step": 700 }, { "epoch": 0.9102730819245773, "eval_loss": 0.7774284482002258, "eval_runtime": 12.7002, "eval_samples_per_second": 12.283, "eval_steps_per_second": 0.63, "step": 700 }, { "epoch": 0.9752925877763329, "grad_norm": 1.5335527658462524, "learning_rate": 1.9505851755526658e-05, "loss": 0.728, "step": 750 }, { "epoch": 0.9752925877763329, "eval_loss": 0.7831318378448486, "eval_runtime": 12.525, "eval_samples_per_second": 12.455, "eval_steps_per_second": 0.639, "step": 750 }, { "epoch": 1.0403120936280885, "grad_norm": 1.68842613697052, "learning_rate": 1.999900997018723e-05, "loss": 0.5466, "step": 800 }, { "epoch": 1.0403120936280885, "eval_loss": 0.8039405941963196, "eval_runtime": 12.7007, "eval_samples_per_second": 12.283, "eval_steps_per_second": 0.63, "step": 800 }, { "epoch": 1.105331599479844, "grad_norm": 2.300739288330078, "learning_rate": 1.9993241455728505e-05, "loss": 0.4615, "step": 850 }, { "epoch": 1.105331599479844, "eval_loss": 0.8171147108078003, "eval_runtime": 12.6326, "eval_samples_per_second": 12.349, "eval_steps_per_second": 0.633, "step": 850 }, { "epoch": 1.1703511053315996, "grad_norm": 1.7798463106155396, "learning_rate": 1.998232551903873e-05, "loss": 0.483, "step": 900 }, { "epoch": 1.1703511053315996, "eval_loss": 0.8221042156219482, "eval_runtime": 12.659, "eval_samples_per_second": 12.323, "eval_steps_per_second": 0.632, "step": 900 }, { "epoch": 1.2353706111833551, "grad_norm": 2.231555938720703, "learning_rate": 1.9966267782811538e-05, "loss": 0.5017, "step": 950 }, { "epoch": 1.2353706111833551, "eval_loss": 0.8124644160270691, "eval_runtime": 12.7557, "eval_samples_per_second": 12.23, "eval_steps_per_second": 0.627, "step": 950 }, { "epoch": 1.3003901170351106, "grad_norm": 2.012153148651123, "learning_rate": 1.99450765182319e-05, "loss": 0.4791, "step": 1000 }, { "epoch": 1.3003901170351106, "eval_loss": 0.8135426640510559, "eval_runtime": 12.6835, "eval_samples_per_second": 12.299, "eval_steps_per_second": 0.631, "step": 1000 }, { "epoch": 1.3654096228868662, "grad_norm": 1.9976961612701416, "learning_rate": 1.991876264071568e-05, "loss": 0.458, "step": 1050 }, { "epoch": 1.3654096228868662, "eval_loss": 0.8159711956977844, "eval_runtime": 12.6632, "eval_samples_per_second": 12.319, "eval_steps_per_second": 0.632, "step": 1050 }, { "epoch": 1.4304291287386217, "grad_norm": 1.917141318321228, "learning_rate": 1.988733970428724e-05, "loss": 0.4767, "step": 1100 }, { "epoch": 1.4304291287386217, "eval_loss": 0.8092829585075378, "eval_runtime": 12.7003, "eval_samples_per_second": 12.283, "eval_steps_per_second": 0.63, "step": 1100 }, { "epoch": 1.4954486345903772, "grad_norm": 2.5106847286224365, "learning_rate": 1.9850823894597882e-05, "loss": 0.4908, "step": 1150 }, { "epoch": 1.4954486345903772, "eval_loss": 0.807929277420044, "eval_runtime": 12.6245, "eval_samples_per_second": 12.357, "eval_steps_per_second": 0.634, "step": 1150 }, { "epoch": 1.5604681404421328, "grad_norm": 1.7505854368209839, "learning_rate": 1.9809234020588762e-05, "loss": 0.4809, "step": 1200 }, { "epoch": 1.5604681404421328, "eval_loss": 0.8194348216056824, "eval_runtime": 12.6755, "eval_samples_per_second": 12.307, "eval_steps_per_second": 0.631, "step": 1200 }, { "epoch": 1.6254876462938883, "grad_norm": 1.2360202074050903, "learning_rate": 1.9762591504802615e-05, "loss": 0.4732, "step": 1250 }, { "epoch": 1.6254876462938883, "eval_loss": 0.8178196549415588, "eval_runtime": 12.6982, "eval_samples_per_second": 12.285, "eval_steps_per_second": 0.63, "step": 1250 }, { "epoch": 1.6905071521456438, "grad_norm": 1.7972296476364136, "learning_rate": 1.9710920372349174e-05, "loss": 0.4658, "step": 1300 }, { "epoch": 1.6905071521456438, "eval_loss": 0.8226543068885803, "eval_runtime": 12.6834, "eval_samples_per_second": 12.3, "eval_steps_per_second": 0.631, "step": 1300 }, { "epoch": 1.7555266579973994, "grad_norm": 2.1342735290527344, "learning_rate": 1.965424723853011e-05, "loss": 0.4874, "step": 1350 }, { "epoch": 1.7555266579973994, "eval_loss": 0.8139074444770813, "eval_runtime": 12.6712, "eval_samples_per_second": 12.311, "eval_steps_per_second": 0.631, "step": 1350 }, { "epoch": 1.820546163849155, "grad_norm": 2.0552141666412354, "learning_rate": 1.9592601295129703e-05, "loss": 0.5181, "step": 1400 }, { "epoch": 1.820546163849155, "eval_loss": 0.8143725395202637, "eval_runtime": 12.7032, "eval_samples_per_second": 12.28, "eval_steps_per_second": 0.63, "step": 1400 }, { "epoch": 1.8855656697009102, "grad_norm": 1.9728556871414185, "learning_rate": 1.9526014295378472e-05, "loss": 0.4663, "step": 1450 }, { "epoch": 1.8855656697009102, "eval_loss": 0.8104857206344604, "eval_runtime": 12.6152, "eval_samples_per_second": 12.366, "eval_steps_per_second": 0.634, "step": 1450 }, { "epoch": 1.9505851755526658, "grad_norm": 2.6176164150238037, "learning_rate": 1.9454520537597364e-05, "loss": 0.5003, "step": 1500 }, { "epoch": 1.9505851755526658, "eval_loss": 0.8162646293640137, "eval_runtime": 12.7145, "eval_samples_per_second": 12.269, "eval_steps_per_second": 0.629, "step": 1500 }, { "epoch": 2.0156046814044215, "grad_norm": 1.6983990669250488, "learning_rate": 1.9378156847531047e-05, "loss": 0.4377, "step": 1550 }, { "epoch": 2.0156046814044215, "eval_loss": 0.8717073202133179, "eval_runtime": 12.636, "eval_samples_per_second": 12.346, "eval_steps_per_second": 0.633, "step": 1550 }, { "epoch": 2.080624187256177, "grad_norm": 1.206516146659851, "learning_rate": 1.9296962559379296e-05, "loss": 0.251, "step": 1600 }, { "epoch": 2.080624187256177, "eval_loss": 0.9452024698257446, "eval_runtime": 12.7171, "eval_samples_per_second": 12.267, "eval_steps_per_second": 0.629, "step": 1600 }, { "epoch": 2.1456436931079326, "grad_norm": 1.4168466329574585, "learning_rate": 1.9210979495536353e-05, "loss": 0.2465, "step": 1650 }, { "epoch": 2.1456436931079326, "eval_loss": 0.9140231013298035, "eval_runtime": 12.6828, "eval_samples_per_second": 12.3, "eval_steps_per_second": 0.631, "step": 1650 }, { "epoch": 2.210663198959688, "grad_norm": 1.882591724395752, "learning_rate": 1.91202519450486e-05, "loss": 0.2494, "step": 1700 }, { "epoch": 2.210663198959688, "eval_loss": 0.9127895832061768, "eval_runtime": 12.6517, "eval_samples_per_second": 12.33, "eval_steps_per_second": 0.632, "step": 1700 }, { "epoch": 2.2756827048114436, "grad_norm": 1.1632663011550903, "learning_rate": 1.9024826640801694e-05, "loss": 0.2564, "step": 1750 }, { "epoch": 2.2756827048114436, "eval_loss": 0.9155619740486145, "eval_runtime": 12.7147, "eval_samples_per_second": 12.269, "eval_steps_per_second": 0.629, "step": 1750 }, { "epoch": 2.340702210663199, "grad_norm": 1.2240418195724487, "learning_rate": 1.8924752735448927e-05, "loss": 0.2565, "step": 1800 }, { "epoch": 2.340702210663199, "eval_loss": 0.9181879758834839, "eval_runtime": 12.7403, "eval_samples_per_second": 12.245, "eval_steps_per_second": 0.628, "step": 1800 }, { "epoch": 2.4057217165149547, "grad_norm": 1.44245183467865, "learning_rate": 1.882008177609315e-05, "loss": 0.2587, "step": 1850 }, { "epoch": 2.4057217165149547, "eval_loss": 0.9207939505577087, "eval_runtime": 12.7382, "eval_samples_per_second": 12.247, "eval_steps_per_second": 0.628, "step": 1850 }, { "epoch": 2.4707412223667102, "grad_norm": 1.4404542446136475, "learning_rate": 1.8710867677735368e-05, "loss": 0.2725, "step": 1900 }, { "epoch": 2.4707412223667102, "eval_loss": 0.9286116361618042, "eval_runtime": 12.6909, "eval_samples_per_second": 12.292, "eval_steps_per_second": 0.63, "step": 1900 }, { "epoch": 2.5357607282184658, "grad_norm": 1.5826677083969116, "learning_rate": 1.859716669550364e-05, "loss": 0.2582, "step": 1950 }, { "epoch": 2.5357607282184658, "eval_loss": 0.9147523045539856, "eval_runtime": 12.8088, "eval_samples_per_second": 12.179, "eval_steps_per_second": 0.625, "step": 1950 }, { "epoch": 2.6007802340702213, "grad_norm": 1.4913737773895264, "learning_rate": 1.8479037395676613e-05, "loss": 0.2558, "step": 2000 }, { "epoch": 2.6007802340702213, "eval_loss": 0.913769543170929, "eval_runtime": 12.7464, "eval_samples_per_second": 12.239, "eval_steps_per_second": 0.628, "step": 2000 }, { "epoch": 2.665799739921977, "grad_norm": 1.355419397354126, "learning_rate": 1.835654062551658e-05, "loss": 0.2625, "step": 2050 }, { "epoch": 2.665799739921977, "eval_loss": 0.9320564270019531, "eval_runtime": 12.7189, "eval_samples_per_second": 12.265, "eval_steps_per_second": 0.629, "step": 2050 }, { "epoch": 2.7308192457737324, "grad_norm": 1.2386664152145386, "learning_rate": 1.8229739481927665e-05, "loss": 0.2601, "step": 2100 }, { "epoch": 2.7308192457737324, "eval_loss": 0.932084321975708, "eval_runtime": 12.7032, "eval_samples_per_second": 12.28, "eval_steps_per_second": 0.63, "step": 2100 }, { "epoch": 2.795838751625488, "grad_norm": 1.9765559434890747, "learning_rate": 1.809869927895519e-05, "loss": 0.2644, "step": 2150 }, { "epoch": 2.795838751625488, "eval_loss": 0.9155560731887817, "eval_runtime": 12.7374, "eval_samples_per_second": 12.247, "eval_steps_per_second": 0.628, "step": 2150 }, { "epoch": 2.8608582574772434, "grad_norm": 1.5704264640808105, "learning_rate": 1.7963487514143073e-05, "loss": 0.2724, "step": 2200 }, { "epoch": 2.8608582574772434, "eval_loss": 0.9207773208618164, "eval_runtime": 12.6727, "eval_samples_per_second": 12.31, "eval_steps_per_second": 0.631, "step": 2200 }, { "epoch": 2.925877763328999, "grad_norm": 2.2694146633148193, "learning_rate": 1.7824173833766457e-05, "loss": 0.2649, "step": 2250 }, { "epoch": 2.925877763328999, "eval_loss": 0.9216504096984863, "eval_runtime": 12.761, "eval_samples_per_second": 12.225, "eval_steps_per_second": 0.627, "step": 2250 }, { "epoch": 2.9908972691807545, "grad_norm": 0.9763782024383545, "learning_rate": 1.7680829996957617e-05, "loss": 0.2701, "step": 2300 }, { "epoch": 2.9908972691807545, "eval_loss": 0.9425987601280212, "eval_runtime": 12.7408, "eval_samples_per_second": 12.244, "eval_steps_per_second": 0.628, "step": 2300 }, { "epoch": 3.0559167750325096, "grad_norm": 1.0670485496520996, "learning_rate": 1.7533529838743514e-05, "loss": 0.1548, "step": 2350 }, { "epoch": 3.0559167750325096, "eval_loss": 0.9931546449661255, "eval_runtime": 12.6597, "eval_samples_per_second": 12.323, "eval_steps_per_second": 0.632, "step": 2350 }, { "epoch": 3.120936280884265, "grad_norm": 1.126699686050415, "learning_rate": 1.7382349232014083e-05, "loss": 0.1425, "step": 2400 }, { "epoch": 3.120936280884265, "eval_loss": 1.0121082067489624, "eval_runtime": 12.7889, "eval_samples_per_second": 12.198, "eval_steps_per_second": 0.626, "step": 2400 }, { "epoch": 3.1859557867360206, "grad_norm": 1.1538535356521606, "learning_rate": 1.7227366048440822e-05, "loss": 0.148, "step": 2450 }, { "epoch": 3.1859557867360206, "eval_loss": 1.010581374168396, "eval_runtime": 12.6419, "eval_samples_per_second": 12.34, "eval_steps_per_second": 0.633, "step": 2450 }, { "epoch": 3.250975292587776, "grad_norm": 0.9063160419464111, "learning_rate": 1.7068660118365894e-05, "loss": 0.1425, "step": 2500 }, { "epoch": 3.250975292587776, "eval_loss": 1.010384440422058, "eval_runtime": 12.7187, "eval_samples_per_second": 12.265, "eval_steps_per_second": 0.629, "step": 2500 }, { "epoch": 3.3159947984395317, "grad_norm": 1.3215436935424805, "learning_rate": 1.6906313189682227e-05, "loss": 0.1433, "step": 2550 }, { "epoch": 3.3159947984395317, "eval_loss": 1.0235826969146729, "eval_runtime": 12.7063, "eval_samples_per_second": 12.277, "eval_steps_per_second": 0.63, "step": 2550 }, { "epoch": 3.3810143042912872, "grad_norm": 1.1784850358963013, "learning_rate": 1.674040888572602e-05, "loss": 0.1501, "step": 2600 }, { "epoch": 3.3810143042912872, "eval_loss": 1.029763102531433, "eval_runtime": 12.7431, "eval_samples_per_second": 12.242, "eval_steps_per_second": 0.628, "step": 2600 }, { "epoch": 3.4460338101430428, "grad_norm": 1.062929391860962, "learning_rate": 1.6571032662203126e-05, "loss": 0.1553, "step": 2650 }, { "epoch": 3.4460338101430428, "eval_loss": 1.0215131044387817, "eval_runtime": 12.6931, "eval_samples_per_second": 12.29, "eval_steps_per_second": 0.63, "step": 2650 }, { "epoch": 3.5110533159947983, "grad_norm": 1.3030365705490112, "learning_rate": 1.6398271763171663e-05, "loss": 0.1521, "step": 2700 }, { "epoch": 3.5110533159947983, "eval_loss": 1.002717137336731, "eval_runtime": 12.6796, "eval_samples_per_second": 12.303, "eval_steps_per_second": 0.631, "step": 2700 }, { "epoch": 3.576072821846554, "grad_norm": 1.0211737155914307, "learning_rate": 1.6222215176103445e-05, "loss": 0.1456, "step": 2750 }, { "epoch": 3.576072821846554, "eval_loss": 1.0083057880401611, "eval_runtime": 12.9744, "eval_samples_per_second": 12.024, "eval_steps_per_second": 0.617, "step": 2750 }, { "epoch": 3.6410923276983094, "grad_norm": 1.2589364051818848, "learning_rate": 1.6042953586047382e-05, "loss": 0.1449, "step": 2800 }, { "epoch": 3.6410923276983094, "eval_loss": 1.0153865814208984, "eval_runtime": 12.6224, "eval_samples_per_second": 12.359, "eval_steps_per_second": 0.634, "step": 2800 } ], "logging_steps": 50, "max_steps": 7690, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 400, "total_flos": 2.549607587339305e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }