{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 50, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 6.376349449157715, "learning_rate": 2.5e-06, "loss": 1.4789, "step": 50 }, { "epoch": 0.02, "eval_loss": 1.0021060705184937, "eval_runtime": 2.0714, "eval_samples_per_second": 55.034, "eval_steps_per_second": 2.897, "step": 50 }, { "epoch": 0.04, "grad_norm": 2.7532732486724854, "learning_rate": 5e-06, "loss": 1.125, "step": 100 }, { "epoch": 0.04, "eval_loss": 0.8670538067817688, "eval_runtime": 2.0412, "eval_samples_per_second": 55.848, "eval_steps_per_second": 2.939, "step": 100 }, { "epoch": 0.06, "grad_norm": 2.5737497806549072, "learning_rate": 7.5e-06, "loss": 0.9982, "step": 150 }, { "epoch": 0.06, "eval_loss": 0.85986328125, "eval_runtime": 2.066, "eval_samples_per_second": 55.18, "eval_steps_per_second": 2.904, "step": 150 }, { "epoch": 0.08, "grad_norm": 3.524716377258301, "learning_rate": 1e-05, "loss": 0.9746, "step": 200 }, { "epoch": 0.08, "eval_loss": 0.8621886968612671, "eval_runtime": 2.0577, "eval_samples_per_second": 55.402, "eval_steps_per_second": 2.916, "step": 200 }, { "epoch": 0.1, "grad_norm": 2.9173636436462402, "learning_rate": 1.25e-05, "loss": 0.9435, "step": 250 }, { "epoch": 0.1, "eval_loss": 0.8596158623695374, "eval_runtime": 2.0562, "eval_samples_per_second": 55.441, "eval_steps_per_second": 2.918, "step": 250 }, { "epoch": 0.12, "grad_norm": 3.21992564201355, "learning_rate": 1.5e-05, "loss": 0.9561, "step": 300 }, { "epoch": 0.12, "eval_loss": 0.8649560809135437, "eval_runtime": 2.0472, "eval_samples_per_second": 55.687, "eval_steps_per_second": 2.931, "step": 300 }, { "epoch": 0.14, "grad_norm": 2.842764139175415, "learning_rate": 1.75e-05, "loss": 0.9625, "step": 350 }, { "epoch": 0.14, "eval_loss": 0.8620312213897705, "eval_runtime": 2.0407, "eval_samples_per_second": 55.863, "eval_steps_per_second": 2.94, "step": 350 }, { "epoch": 0.16, "grad_norm": 3.064265012741089, "learning_rate": 2e-05, "loss": 0.9561, "step": 400 }, { "epoch": 0.16, "eval_loss": 0.8754067420959473, "eval_runtime": 2.0491, "eval_samples_per_second": 55.634, "eval_steps_per_second": 2.928, "step": 400 }, { "epoch": 0.18, "grad_norm": 3.680624008178711, "learning_rate": 2.25e-05, "loss": 0.9811, "step": 450 }, { "epoch": 0.18, "eval_loss": 0.8749663829803467, "eval_runtime": 2.0344, "eval_samples_per_second": 56.036, "eval_steps_per_second": 2.949, "step": 450 }, { "epoch": 0.2, "grad_norm": 2.928382396697998, "learning_rate": 2.5e-05, "loss": 0.9841, "step": 500 }, { "epoch": 0.2, "eval_loss": 0.8785499930381775, "eval_runtime": 2.0435, "eval_samples_per_second": 55.786, "eval_steps_per_second": 2.936, "step": 500 }, { "epoch": 0.22, "grad_norm": 3.388023853302002, "learning_rate": 2.7500000000000004e-05, "loss": 1.0304, "step": 550 }, { "epoch": 0.22, "eval_loss": 0.8839182257652283, "eval_runtime": 2.0365, "eval_samples_per_second": 55.979, "eval_steps_per_second": 2.946, "step": 550 }, { "epoch": 0.24, "grad_norm": 3.6457326412200928, "learning_rate": 3e-05, "loss": 1.0091, "step": 600 }, { "epoch": 0.24, "eval_loss": 0.8980669975280762, "eval_runtime": 2.0434, "eval_samples_per_second": 55.79, "eval_steps_per_second": 2.936, "step": 600 }, { "epoch": 0.26, "grad_norm": 2.8867459297180176, "learning_rate": 3.2500000000000004e-05, "loss": 1.0373, "step": 650 }, { "epoch": 0.26, "eval_loss": 0.8973696231842041, "eval_runtime": 2.0404, "eval_samples_per_second": 55.871, "eval_steps_per_second": 2.941, "step": 650 }, { "epoch": 0.28, "grad_norm": 2.924246311187744, "learning_rate": 3.5e-05, "loss": 1.042, "step": 700 }, { "epoch": 0.28, "eval_loss": 0.9169337153434753, "eval_runtime": 2.0757, "eval_samples_per_second": 54.922, "eval_steps_per_second": 2.891, "step": 700 }, { "epoch": 0.3, "grad_norm": 3.8138821125030518, "learning_rate": 3.7500000000000003e-05, "loss": 1.0676, "step": 750 }, { "epoch": 0.3, "eval_loss": 0.9358024001121521, "eval_runtime": 2.0481, "eval_samples_per_second": 55.661, "eval_steps_per_second": 2.93, "step": 750 }, { "epoch": 0.32, "grad_norm": 2.544848918914795, "learning_rate": 4e-05, "loss": 1.1405, "step": 800 }, { "epoch": 0.32, "eval_loss": 0.9551197290420532, "eval_runtime": 2.0608, "eval_samples_per_second": 55.319, "eval_steps_per_second": 2.912, "step": 800 }, { "epoch": 0.34, "grad_norm": 3.607945203781128, "learning_rate": 4.25e-05, "loss": 1.1238, "step": 850 }, { "epoch": 0.34, "eval_loss": 0.9666525721549988, "eval_runtime": 2.0401, "eval_samples_per_second": 55.879, "eval_steps_per_second": 2.941, "step": 850 }, { "epoch": 0.36, "grad_norm": 2.847774028778076, "learning_rate": 4.5e-05, "loss": 1.134, "step": 900 }, { "epoch": 0.36, "eval_loss": 1.0043387413024902, "eval_runtime": 2.0654, "eval_samples_per_second": 55.196, "eval_steps_per_second": 2.905, "step": 900 }, { "epoch": 0.38, "grad_norm": 3.035200595855713, "learning_rate": 4.75e-05, "loss": 1.1589, "step": 950 }, { "epoch": 0.38, "eval_loss": 1.0095112323760986, "eval_runtime": 2.103, "eval_samples_per_second": 54.208, "eval_steps_per_second": 2.853, "step": 950 }, { "epoch": 0.4, "grad_norm": 2.9163053035736084, "learning_rate": 5e-05, "loss": 1.1372, "step": 1000 }, { "epoch": 0.4, "eval_loss": 1.0114259719848633, "eval_runtime": 2.0471, "eval_samples_per_second": 55.688, "eval_steps_per_second": 2.931, "step": 1000 }, { "epoch": 0.42, "grad_norm": 2.7075846195220947, "learning_rate": 4.9996192378909786e-05, "loss": 1.2279, "step": 1050 }, { "epoch": 0.42, "eval_loss": 1.0414971113204956, "eval_runtime": 2.0438, "eval_samples_per_second": 55.778, "eval_steps_per_second": 2.936, "step": 1050 }, { "epoch": 0.44, "grad_norm": 3.5521507263183594, "learning_rate": 4.99847706754774e-05, "loss": 1.2282, "step": 1100 }, { "epoch": 0.44, "eval_loss": 1.0306421518325806, "eval_runtime": 2.0353, "eval_samples_per_second": 56.013, "eval_steps_per_second": 2.948, "step": 1100 }, { "epoch": 0.46, "grad_norm": 2.973623037338257, "learning_rate": 4.996573836886435e-05, "loss": 1.2439, "step": 1150 }, { "epoch": 0.46, "eval_loss": 1.0502970218658447, "eval_runtime": 2.0477, "eval_samples_per_second": 55.673, "eval_steps_per_second": 2.93, "step": 1150 }, { "epoch": 0.48, "grad_norm": 3.2443981170654297, "learning_rate": 4.993910125649561e-05, "loss": 1.2674, "step": 1200 }, { "epoch": 0.48, "eval_loss": 1.052767038345337, "eval_runtime": 2.0476, "eval_samples_per_second": 55.676, "eval_steps_per_second": 2.93, "step": 1200 }, { "epoch": 0.5, "grad_norm": 2.528109073638916, "learning_rate": 4.990486745229364e-05, "loss": 1.2429, "step": 1250 }, { "epoch": 0.5, "eval_loss": 1.0944527387619019, "eval_runtime": 2.0463, "eval_samples_per_second": 55.711, "eval_steps_per_second": 2.932, "step": 1250 }, { "epoch": 0.52, "grad_norm": 3.5652294158935547, "learning_rate": 4.9863047384206835e-05, "loss": 1.2405, "step": 1300 }, { "epoch": 0.52, "eval_loss": 1.0788123607635498, "eval_runtime": 2.0584, "eval_samples_per_second": 55.383, "eval_steps_per_second": 2.915, "step": 1300 }, { "epoch": 0.54, "grad_norm": 3.1418027877807617, "learning_rate": 4.9813653791033057e-05, "loss": 1.2664, "step": 1350 }, { "epoch": 0.54, "eval_loss": 1.077215552330017, "eval_runtime": 2.0417, "eval_samples_per_second": 55.836, "eval_steps_per_second": 2.939, "step": 1350 }, { "epoch": 0.56, "grad_norm": 3.247063159942627, "learning_rate": 4.975670171853926e-05, "loss": 1.2368, "step": 1400 }, { "epoch": 0.56, "eval_loss": 1.0988303422927856, "eval_runtime": 2.0525, "eval_samples_per_second": 55.543, "eval_steps_per_second": 2.923, "step": 1400 }, { "epoch": 0.58, "grad_norm": 2.791402816772461, "learning_rate": 4.9692208514878444e-05, "loss": 1.214, "step": 1450 }, { "epoch": 0.58, "eval_loss": 1.093959093093872, "eval_runtime": 2.0478, "eval_samples_per_second": 55.67, "eval_steps_per_second": 2.93, "step": 1450 }, { "epoch": 0.6, "grad_norm": 3.39119815826416, "learning_rate": 4.962019382530521e-05, "loss": 1.2605, "step": 1500 }, { "epoch": 0.6, "eval_loss": 1.0913000106811523, "eval_runtime": 2.0609, "eval_samples_per_second": 55.317, "eval_steps_per_second": 2.911, "step": 1500 }, { "epoch": 0.62, "grad_norm": 2.8593010902404785, "learning_rate": 4.9540679586191605e-05, "loss": 1.2856, "step": 1550 }, { "epoch": 0.62, "eval_loss": 1.1060646772384644, "eval_runtime": 2.0505, "eval_samples_per_second": 55.597, "eval_steps_per_second": 2.926, "step": 1550 }, { "epoch": 0.64, "grad_norm": 3.9253203868865967, "learning_rate": 4.9453690018345144e-05, "loss": 1.2385, "step": 1600 }, { "epoch": 0.64, "eval_loss": 1.1065127849578857, "eval_runtime": 2.0451, "eval_samples_per_second": 55.743, "eval_steps_per_second": 2.934, "step": 1600 }, { "epoch": 0.66, "grad_norm": 3.433211326599121, "learning_rate": 4.9359251619630886e-05, "loss": 1.2696, "step": 1650 }, { "epoch": 0.66, "eval_loss": 1.1171408891677856, "eval_runtime": 2.0491, "eval_samples_per_second": 55.635, "eval_steps_per_second": 2.928, "step": 1650 }, { "epoch": 0.68, "grad_norm": 2.958655595779419, "learning_rate": 4.925739315689991e-05, "loss": 1.2774, "step": 1700 }, { "epoch": 0.68, "eval_loss": 1.1090198755264282, "eval_runtime": 2.0549, "eval_samples_per_second": 55.476, "eval_steps_per_second": 2.92, "step": 1700 }, { "epoch": 0.7, "grad_norm": 2.845395565032959, "learning_rate": 4.914814565722671e-05, "loss": 1.2598, "step": 1750 }, { "epoch": 0.7, "eval_loss": 1.1252377033233643, "eval_runtime": 2.0564, "eval_samples_per_second": 55.437, "eval_steps_per_second": 2.918, "step": 1750 }, { "epoch": 0.72, "grad_norm": 3.3043181896209717, "learning_rate": 4.9031542398457974e-05, "loss": 1.2897, "step": 1800 }, { "epoch": 0.72, "eval_loss": 1.1197612285614014, "eval_runtime": 2.0511, "eval_samples_per_second": 55.58, "eval_steps_per_second": 2.925, "step": 1800 }, { "epoch": 0.74, "grad_norm": 2.8986546993255615, "learning_rate": 4.890761889907589e-05, "loss": 1.2801, "step": 1850 }, { "epoch": 0.74, "eval_loss": 1.0936975479125977, "eval_runtime": 2.0514, "eval_samples_per_second": 55.57, "eval_steps_per_second": 2.925, "step": 1850 }, { "epoch": 0.76, "grad_norm": 2.980234384536743, "learning_rate": 4.877641290737884e-05, "loss": 1.2732, "step": 1900 }, { "epoch": 0.76, "eval_loss": 1.1040586233139038, "eval_runtime": 2.0664, "eval_samples_per_second": 55.169, "eval_steps_per_second": 2.904, "step": 1900 }, { "epoch": 0.78, "grad_norm": 2.337164878845215, "learning_rate": 4.8637964389982926e-05, "loss": 1.2395, "step": 1950 }, { "epoch": 0.78, "eval_loss": 1.1020458936691284, "eval_runtime": 2.0426, "eval_samples_per_second": 55.811, "eval_steps_per_second": 2.937, "step": 1950 }, { "epoch": 0.8, "grad_norm": 2.535869836807251, "learning_rate": 4.849231551964771e-05, "loss": 1.2581, "step": 2000 }, { "epoch": 0.8, "eval_loss": 1.106950283050537, "eval_runtime": 2.0587, "eval_samples_per_second": 55.375, "eval_steps_per_second": 2.914, "step": 2000 } ], "logging_steps": 50, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2000, "total_flos": 7.650574067145114e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }