{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.6, "eval_steps": 50, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 10.53576374053955, "learning_rate": 2.0000000000000003e-06, "loss": 1.6618, "step": 50 }, { "epoch": 0.1, "eval_loss": 0.7731789350509644, "eval_runtime": 2.2494, "eval_samples_per_second": 69.353, "eval_steps_per_second": 3.557, "step": 50 }, { "epoch": 0.2, "grad_norm": 5.800010681152344, "learning_rate": 4.000000000000001e-06, "loss": 0.7629, "step": 100 }, { "epoch": 0.2, "eval_loss": 0.6901325583457947, "eval_runtime": 2.2539, "eval_samples_per_second": 69.213, "eval_steps_per_second": 3.549, "step": 100 }, { "epoch": 0.3, "grad_norm": 4.960265636444092, "learning_rate": 6e-06, "loss": 0.7256, "step": 150 }, { "epoch": 0.3, "eval_loss": 0.6716309785842896, "eval_runtime": 2.2526, "eval_samples_per_second": 69.254, "eval_steps_per_second": 3.551, "step": 150 }, { "epoch": 0.4, "grad_norm": 5.574848651885986, "learning_rate": 8.000000000000001e-06, "loss": 0.7243, "step": 200 }, { "epoch": 0.4, "eval_loss": 0.6644517779350281, "eval_runtime": 2.2546, "eval_samples_per_second": 69.193, "eval_steps_per_second": 3.548, "step": 200 }, { "epoch": 0.5, "grad_norm": 3.0581891536712646, "learning_rate": 1e-05, "loss": 0.6918, "step": 250 }, { "epoch": 0.5, "eval_loss": 0.6718080043792725, "eval_runtime": 2.255, "eval_samples_per_second": 69.18, "eval_steps_per_second": 3.548, "step": 250 }, { "epoch": 0.6, "grad_norm": 3.797400712966919, "learning_rate": 1.2e-05, "loss": 0.7433, "step": 300 }, { "epoch": 0.6, "eval_loss": 0.67710280418396, "eval_runtime": 2.2558, "eval_samples_per_second": 69.155, "eval_steps_per_second": 3.546, "step": 300 }, { "epoch": 0.7, "grad_norm": 8.121636390686035, "learning_rate": 1.4e-05, "loss": 0.7523, "step": 350 }, { "epoch": 0.7, "eval_loss": 0.680716335773468, "eval_runtime": 2.2562, "eval_samples_per_second": 69.144, "eval_steps_per_second": 3.546, "step": 350 }, { "epoch": 0.8, "grad_norm": 2.615454912185669, "learning_rate": 1.6000000000000003e-05, "loss": 0.7322, "step": 400 }, { "epoch": 0.8, "eval_loss": 0.6906686425209045, "eval_runtime": 2.2633, "eval_samples_per_second": 68.926, "eval_steps_per_second": 3.535, "step": 400 }, { "epoch": 0.9, "grad_norm": 2.9651033878326416, "learning_rate": 1.8e-05, "loss": 0.7497, "step": 450 }, { "epoch": 0.9, "eval_loss": 0.6827173233032227, "eval_runtime": 2.5909, "eval_samples_per_second": 60.21, "eval_steps_per_second": 3.088, "step": 450 }, { "epoch": 1.0, "grad_norm": 3.7542426586151123, "learning_rate": 2e-05, "loss": 0.7622, "step": 500 }, { "epoch": 1.0, "eval_loss": 0.6903170347213745, "eval_runtime": 2.4721, "eval_samples_per_second": 63.105, "eval_steps_per_second": 3.236, "step": 500 }, { "epoch": 1.1, "grad_norm": 2.384434938430786, "learning_rate": 1.999390827019096e-05, "loss": 0.484, "step": 550 }, { "epoch": 1.1, "eval_loss": 0.7237842679023743, "eval_runtime": 2.9344, "eval_samples_per_second": 53.162, "eval_steps_per_second": 2.726, "step": 550 }, { "epoch": 1.2, "grad_norm": 3.1198794841766357, "learning_rate": 1.9975640502598243e-05, "loss": 0.5145, "step": 600 }, { "epoch": 1.2, "eval_loss": 0.7352678179740906, "eval_runtime": 3.8017, "eval_samples_per_second": 41.034, "eval_steps_per_second": 2.104, "step": 600 }, { "epoch": 1.3, "grad_norm": 6.234444618225098, "learning_rate": 1.9945218953682736e-05, "loss": 0.5093, "step": 650 }, { "epoch": 1.3, "eval_loss": 0.7311124801635742, "eval_runtime": 2.2672, "eval_samples_per_second": 68.808, "eval_steps_per_second": 3.529, "step": 650 }, { "epoch": 1.4, "grad_norm": 2.112931489944458, "learning_rate": 1.9902680687415704e-05, "loss": 0.5248, "step": 700 }, { "epoch": 1.4, "eval_loss": 0.734488844871521, "eval_runtime": 2.2746, "eval_samples_per_second": 68.582, "eval_steps_per_second": 3.517, "step": 700 }, { "epoch": 1.5, "grad_norm": 3.4556541442871094, "learning_rate": 1.9848077530122083e-05, "loss": 0.5107, "step": 750 }, { "epoch": 1.5, "eval_loss": 0.723623514175415, "eval_runtime": 2.256, "eval_samples_per_second": 69.148, "eval_steps_per_second": 3.546, "step": 750 }, { "epoch": 1.6, "grad_norm": 3.025707960128784, "learning_rate": 1.9781476007338058e-05, "loss": 0.5171, "step": 800 }, { "epoch": 1.6, "eval_loss": 0.7228586077690125, "eval_runtime": 2.2603, "eval_samples_per_second": 69.016, "eval_steps_per_second": 3.539, "step": 800 }, { "epoch": 1.7, "grad_norm": 2.2873287200927734, "learning_rate": 1.9702957262759964e-05, "loss": 0.5391, "step": 850 }, { "epoch": 1.7, "eval_loss": 0.7198938727378845, "eval_runtime": 2.4311, "eval_samples_per_second": 64.168, "eval_steps_per_second": 3.291, "step": 850 }, { "epoch": 1.8, "grad_norm": 3.1473968029022217, "learning_rate": 1.961261695938319e-05, "loss": 0.5244, "step": 900 }, { "epoch": 1.8, "eval_loss": 0.7222604751586914, "eval_runtime": 2.6131, "eval_samples_per_second": 59.699, "eval_steps_per_second": 3.061, "step": 900 }, { "epoch": 1.9, "grad_norm": 2.5658185482025146, "learning_rate": 1.9510565162951538e-05, "loss": 0.5435, "step": 950 }, { "epoch": 1.9, "eval_loss": 0.7172784209251404, "eval_runtime": 3.0626, "eval_samples_per_second": 50.937, "eval_steps_per_second": 2.612, "step": 950 }, { "epoch": 2.0, "grad_norm": 3.090545415878296, "learning_rate": 1.9396926207859085e-05, "loss": 0.5197, "step": 1000 }, { "epoch": 2.0, "eval_loss": 0.7204703092575073, "eval_runtime": 3.4963, "eval_samples_per_second": 44.619, "eval_steps_per_second": 2.288, "step": 1000 }, { "epoch": 2.1, "grad_norm": 1.921531081199646, "learning_rate": 1.9271838545667876e-05, "loss": 0.2538, "step": 1050 }, { "epoch": 2.1, "eval_loss": 0.791098952293396, "eval_runtime": 2.2604, "eval_samples_per_second": 69.014, "eval_steps_per_second": 3.539, "step": 1050 }, { "epoch": 2.2, "grad_norm": 1.807320475578308, "learning_rate": 1.913545457642601e-05, "loss": 0.2521, "step": 1100 }, { "epoch": 2.2, "eval_loss": 0.8204991221427917, "eval_runtime": 2.2623, "eval_samples_per_second": 68.956, "eval_steps_per_second": 3.536, "step": 1100 }, { "epoch": 2.3, "grad_norm": 2.746616840362549, "learning_rate": 1.8987940462991673e-05, "loss": 0.2687, "step": 1150 }, { "epoch": 2.3, "eval_loss": 0.8025296330451965, "eval_runtime": 2.2565, "eval_samples_per_second": 69.132, "eval_steps_per_second": 3.545, "step": 1150 }, { "epoch": 2.4, "grad_norm": 2.3170738220214844, "learning_rate": 1.8829475928589272e-05, "loss": 0.2689, "step": 1200 }, { "epoch": 2.4, "eval_loss": 0.8150458931922913, "eval_runtime": 2.2607, "eval_samples_per_second": 69.005, "eval_steps_per_second": 3.539, "step": 1200 }, { "epoch": 2.5, "grad_norm": 1.9649097919464111, "learning_rate": 1.866025403784439e-05, "loss": 0.2772, "step": 1250 }, { "epoch": 2.5, "eval_loss": 0.7988224625587463, "eval_runtime": 2.5979, "eval_samples_per_second": 60.048, "eval_steps_per_second": 3.079, "step": 1250 }, { "epoch": 2.6, "grad_norm": 2.264338970184326, "learning_rate": 1.848048096156426e-05, "loss": 0.2788, "step": 1300 }, { "epoch": 2.6, "eval_loss": 0.8175423741340637, "eval_runtime": 3.4025, "eval_samples_per_second": 45.849, "eval_steps_per_second": 2.351, "step": 1300 }, { "epoch": 2.7, "grad_norm": 2.027390241622925, "learning_rate": 1.8290375725550417e-05, "loss": 0.2742, "step": 1350 }, { "epoch": 2.7, "eval_loss": 0.8078347444534302, "eval_runtime": 2.7124, "eval_samples_per_second": 57.513, "eval_steps_per_second": 2.949, "step": 1350 }, { "epoch": 2.8, "grad_norm": 1.8391352891921997, "learning_rate": 1.8090169943749477e-05, "loss": 0.2749, "step": 1400 }, { "epoch": 2.8, "eval_loss": 0.804284393787384, "eval_runtime": 2.9467, "eval_samples_per_second": 52.94, "eval_steps_per_second": 2.715, "step": 1400 }, { "epoch": 2.9, "grad_norm": 1.9982004165649414, "learning_rate": 1.788010753606722e-05, "loss": 0.2717, "step": 1450 }, { "epoch": 2.9, "eval_loss": 0.7994141578674316, "eval_runtime": 2.2711, "eval_samples_per_second": 68.688, "eval_steps_per_second": 3.522, "step": 1450 }, { "epoch": 3.0, "grad_norm": 1.782399296760559, "learning_rate": 1.766044443118978e-05, "loss": 0.2715, "step": 1500 }, { "epoch": 3.0, "eval_loss": 0.804834246635437, "eval_runtime": 2.2867, "eval_samples_per_second": 68.222, "eval_steps_per_second": 3.499, "step": 1500 }, { "epoch": 3.1, "grad_norm": 1.8651448488235474, "learning_rate": 1.7431448254773943e-05, "loss": 0.1627, "step": 1550 }, { "epoch": 3.1, "eval_loss": 0.859173595905304, "eval_runtime": 2.2588, "eval_samples_per_second": 69.062, "eval_steps_per_second": 3.542, "step": 1550 }, { "epoch": 3.2, "grad_norm": 1.4768388271331787, "learning_rate": 1.7193398003386514e-05, "loss": 0.1651, "step": 1600 }, { "epoch": 3.2, "eval_loss": 0.868316650390625, "eval_runtime": 2.259, "eval_samples_per_second": 69.058, "eval_steps_per_second": 3.541, "step": 1600 }, { "epoch": 3.3, "grad_norm": 1.4704113006591797, "learning_rate": 1.6946583704589973e-05, "loss": 0.1702, "step": 1650 }, { "epoch": 3.3, "eval_loss": 0.872775137424469, "eval_runtime": 2.8294, "eval_samples_per_second": 55.136, "eval_steps_per_second": 2.827, "step": 1650 }, { "epoch": 3.4, "grad_norm": 1.082715630531311, "learning_rate": 1.6691306063588583e-05, "loss": 0.1734, "step": 1700 }, { "epoch": 3.4, "eval_loss": 0.8728486895561218, "eval_runtime": 3.3787, "eval_samples_per_second": 46.171, "eval_steps_per_second": 2.368, "step": 1700 }, { "epoch": 3.5, "grad_norm": 2.210588216781616, "learning_rate": 1.6427876096865394e-05, "loss": 0.1752, "step": 1750 }, { "epoch": 3.5, "eval_loss": 0.8705567717552185, "eval_runtime": 3.1278, "eval_samples_per_second": 49.875, "eval_steps_per_second": 2.558, "step": 1750 }, { "epoch": 3.6, "grad_norm": 1.4183433055877686, "learning_rate": 1.6156614753256583e-05, "loss": 0.1706, "step": 1800 }, { "epoch": 3.6, "eval_loss": 0.8853814601898193, "eval_runtime": 3.6433, "eval_samples_per_second": 42.818, "eval_steps_per_second": 2.196, "step": 1800 }, { "epoch": 3.7, "grad_norm": 1.4250963926315308, "learning_rate": 1.5877852522924733e-05, "loss": 0.1784, "step": 1850 }, { "epoch": 3.7, "eval_loss": 0.884819507598877, "eval_runtime": 2.2666, "eval_samples_per_second": 68.827, "eval_steps_per_second": 3.53, "step": 1850 }, { "epoch": 3.8, "grad_norm": 1.252785563468933, "learning_rate": 1.5591929034707468e-05, "loss": 0.1729, "step": 1900 }, { "epoch": 3.8, "eval_loss": 0.8708668351173401, "eval_runtime": 2.2648, "eval_samples_per_second": 68.88, "eval_steps_per_second": 3.532, "step": 1900 }, { "epoch": 3.9, "grad_norm": 1.4024217128753662, "learning_rate": 1.529919264233205e-05, "loss": 0.174, "step": 1950 }, { "epoch": 3.9, "eval_loss": 0.8670658469200134, "eval_runtime": 2.2608, "eval_samples_per_second": 69.003, "eval_steps_per_second": 3.539, "step": 1950 }, { "epoch": 4.0, "grad_norm": 1.6221123933792114, "learning_rate": 1.5000000000000002e-05, "loss": 0.174, "step": 2000 }, { "epoch": 4.0, "eval_loss": 0.8709214925765991, "eval_runtime": 2.2598, "eval_samples_per_second": 69.033, "eval_steps_per_second": 3.54, "step": 2000 }, { "epoch": 4.1, "grad_norm": 1.5479576587677002, "learning_rate": 1.469471562785891e-05, "loss": 0.1167, "step": 2050 }, { "epoch": 4.1, "eval_loss": 0.9011654853820801, "eval_runtime": 2.738, "eval_samples_per_second": 56.976, "eval_steps_per_second": 2.922, "step": 2050 }, { "epoch": 4.2, "grad_norm": 1.3002970218658447, "learning_rate": 1.4383711467890776e-05, "loss": 0.1186, "step": 2100 }, { "epoch": 4.2, "eval_loss": 0.9147914052009583, "eval_runtime": 3.018, "eval_samples_per_second": 51.69, "eval_steps_per_second": 2.651, "step": 2100 }, { "epoch": 4.3, "grad_norm": 1.7996995449066162, "learning_rate": 1.4067366430758004e-05, "loss": 0.1153, "step": 2150 }, { "epoch": 4.3, "eval_loss": 0.9160046577453613, "eval_runtime": 3.6692, "eval_samples_per_second": 42.516, "eval_steps_per_second": 2.18, "step": 2150 }, { "epoch": 4.4, "grad_norm": 1.1670547723770142, "learning_rate": 1.3746065934159123e-05, "loss": 0.1214, "step": 2200 }, { "epoch": 4.4, "eval_loss": 0.9355931282043457, "eval_runtime": 2.337, "eval_samples_per_second": 66.753, "eval_steps_per_second": 3.423, "step": 2200 }, { "epoch": 4.5, "grad_norm": 1.1401852369308472, "learning_rate": 1.342020143325669e-05, "loss": 0.1193, "step": 2250 }, { "epoch": 4.5, "eval_loss": 0.9175124764442444, "eval_runtime": 2.2626, "eval_samples_per_second": 68.947, "eval_steps_per_second": 3.536, "step": 2250 }, { "epoch": 4.6, "grad_norm": 0.8389841914176941, "learning_rate": 1.3090169943749475e-05, "loss": 0.1186, "step": 2300 }, { "epoch": 4.6, "eval_loss": 0.9386661052703857, "eval_runtime": 2.2532, "eval_samples_per_second": 69.235, "eval_steps_per_second": 3.55, "step": 2300 }, { "epoch": 4.7, "grad_norm": 1.2419942617416382, "learning_rate": 1.2756373558169992e-05, "loss": 0.1187, "step": 2350 }, { "epoch": 4.7, "eval_loss": 0.9336636662483215, "eval_runtime": 2.2535, "eval_samples_per_second": 69.225, "eval_steps_per_second": 3.55, "step": 2350 }, { "epoch": 4.8, "grad_norm": 1.0060522556304932, "learning_rate": 1.2419218955996677e-05, "loss": 0.1245, "step": 2400 }, { "epoch": 4.8, "eval_loss": 0.9188296794891357, "eval_runtime": 2.2614, "eval_samples_per_second": 68.983, "eval_steps_per_second": 3.538, "step": 2400 }, { "epoch": 4.9, "grad_norm": 0.7993331551551819, "learning_rate": 1.2079116908177592e-05, "loss": 0.1222, "step": 2450 }, { "epoch": 4.9, "eval_loss": 0.9250988364219666, "eval_runtime": 2.4444, "eval_samples_per_second": 63.82, "eval_steps_per_second": 3.273, "step": 2450 }, { "epoch": 5.0, "grad_norm": 1.1892589330673218, "learning_rate": 1.1736481776669307e-05, "loss": 0.1186, "step": 2500 }, { "epoch": 5.0, "eval_loss": 0.9481778144836426, "eval_runtime": 3.3935, "eval_samples_per_second": 45.97, "eval_steps_per_second": 2.357, "step": 2500 }, { "epoch": 5.1, "grad_norm": 0.7223986983299255, "learning_rate": 1.1391731009600655e-05, "loss": 0.0726, "step": 2550 }, { "epoch": 5.1, "eval_loss": 0.974181056022644, "eval_runtime": 2.9499, "eval_samples_per_second": 52.883, "eval_steps_per_second": 2.712, "step": 2550 }, { "epoch": 5.2, "grad_norm": 0.7545835971832275, "learning_rate": 1.1045284632676535e-05, "loss": 0.0717, "step": 2600 }, { "epoch": 5.2, "eval_loss": 0.9890027046203613, "eval_runtime": 2.7635, "eval_samples_per_second": 56.449, "eval_steps_per_second": 2.895, "step": 2600 }, { "epoch": 5.3, "grad_norm": 1.2251814603805542, "learning_rate": 1.0697564737441254e-05, "loss": 0.072, "step": 2650 }, { "epoch": 5.3, "eval_loss": 0.9911813735961914, "eval_runtime": 2.2537, "eval_samples_per_second": 69.22, "eval_steps_per_second": 3.55, "step": 2650 }, { "epoch": 5.4, "grad_norm": 0.45753681659698486, "learning_rate": 1.0348994967025012e-05, "loss": 0.0718, "step": 2700 }, { "epoch": 5.4, "eval_loss": 0.9854485988616943, "eval_runtime": 2.2539, "eval_samples_per_second": 69.212, "eval_steps_per_second": 3.549, "step": 2700 }, { "epoch": 5.5, "grad_norm": 1.0563805103302002, "learning_rate": 1e-05, "loss": 0.072, "step": 2750 }, { "epoch": 5.5, "eval_loss": 0.9962345957756042, "eval_runtime": 2.2507, "eval_samples_per_second": 69.313, "eval_steps_per_second": 3.555, "step": 2750 }, { "epoch": 5.6, "grad_norm": 1.6450284719467163, "learning_rate": 9.651005032974994e-06, "loss": 0.0699, "step": 2800 }, { "epoch": 5.6, "eval_loss": 0.9950909614562988, "eval_runtime": 2.2532, "eval_samples_per_second": 69.235, "eval_steps_per_second": 3.551, "step": 2800 } ], "logging_steps": 50, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 400, "total_flos": 1.3524716052545536e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }