{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9869158878504672, "eval_steps": 17, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 8.601767539978027, "learning_rate": 1.0000000000000002e-06, "loss": 2.5188, "step": 1 }, { "epoch": 0.01, "eval_loss": 2.5281636714935303, "eval_runtime": 136.1276, "eval_samples_per_second": 6.795, "eval_steps_per_second": 0.852, "step": 1 }, { "epoch": 0.03, "grad_norm": 8.70827865600586, "learning_rate": 2.0000000000000003e-06, "loss": 2.561, "step": 2 }, { "epoch": 0.04, "grad_norm": 8.504729270935059, "learning_rate": 3e-06, "loss": 2.5392, "step": 3 }, { "epoch": 0.06, "grad_norm": 8.70252513885498, "learning_rate": 4.000000000000001e-06, "loss": 2.5358, "step": 4 }, { "epoch": 0.07, "grad_norm": 8.545235633850098, "learning_rate": 5e-06, "loss": 2.4953, "step": 5 }, { "epoch": 0.09, "grad_norm": 8.594281196594238, "learning_rate": 6e-06, "loss": 2.5025, "step": 6 }, { "epoch": 0.1, "grad_norm": 8.570807456970215, "learning_rate": 7e-06, "loss": 2.4483, "step": 7 }, { "epoch": 0.12, "grad_norm": 8.201045036315918, "learning_rate": 8.000000000000001e-06, "loss": 2.3844, "step": 8 }, { "epoch": 0.13, "grad_norm": 8.294769287109375, "learning_rate": 9e-06, "loss": 2.3132, "step": 9 }, { "epoch": 0.15, "grad_norm": 7.825733661651611, "learning_rate": 1e-05, "loss": 2.2073, "step": 10 }, { "epoch": 0.16, "grad_norm": 7.359045028686523, "learning_rate": 9.999301905929286e-06, "loss": 2.0469, "step": 11 }, { "epoch": 0.18, "grad_norm": 6.876375198364258, "learning_rate": 9.997207818651273e-06, "loss": 1.8699, "step": 12 }, { "epoch": 0.19, "grad_norm": 6.270627021789551, "learning_rate": 9.99371832291393e-06, "loss": 1.6649, "step": 13 }, { "epoch": 0.21, "grad_norm": 5.742035388946533, "learning_rate": 9.988834393115768e-06, "loss": 1.4954, "step": 14 }, { "epoch": 0.22, "grad_norm": 5.249215602874756, "learning_rate": 9.982557393033758e-06, "loss": 1.3214, "step": 15 }, { "epoch": 0.24, "grad_norm": 4.818938732147217, "learning_rate": 9.97488907544252e-06, "loss": 1.1535, "step": 16 }, { "epoch": 0.25, "grad_norm": 4.380295753479004, "learning_rate": 9.965831581624872e-06, "loss": 1.0047, "step": 17 }, { "epoch": 0.25, "eval_loss": 0.8628284335136414, "eval_runtime": 133.5823, "eval_samples_per_second": 6.925, "eval_steps_per_second": 0.868, "step": 17 }, { "epoch": 0.27, "grad_norm": 3.99696683883667, "learning_rate": 9.955387440773902e-06, "loss": 0.8632, "step": 18 }, { "epoch": 0.28, "grad_norm": 3.7503676414489746, "learning_rate": 9.943559569286731e-06, "loss": 0.7559, "step": 19 }, { "epoch": 0.3, "grad_norm": 3.430863618850708, "learning_rate": 9.930351269950144e-06, "loss": 0.6444, "step": 20 }, { "epoch": 0.31, "grad_norm": 3.25299072265625, "learning_rate": 9.915766231018317e-06, "loss": 0.5364, "step": 21 }, { "epoch": 0.33, "grad_norm": 3.217674970626831, "learning_rate": 9.899808525182935e-06, "loss": 0.4766, "step": 22 }, { "epoch": 0.34, "grad_norm": 3.2001354694366455, "learning_rate": 9.882482608435924e-06, "loss": 0.4038, "step": 23 }, { "epoch": 0.36, "grad_norm": 3.011241912841797, "learning_rate": 9.863793318825186e-06, "loss": 0.3333, "step": 24 }, { "epoch": 0.37, "grad_norm": 2.758089303970337, "learning_rate": 9.843745875103628e-06, "loss": 0.2752, "step": 25 }, { "epoch": 0.39, "grad_norm": 1.914292812347412, "learning_rate": 9.822345875271884e-06, "loss": 0.2229, "step": 26 }, { "epoch": 0.4, "grad_norm": 
2.1877121925354004, "learning_rate": 9.799599295015154e-06, "loss": 0.1846, "step": 27 }, { "epoch": 0.42, "grad_norm": 2.3617541790008545, "learning_rate": 9.775512486034564e-06, "loss": 0.1556, "step": 28 }, { "epoch": 0.43, "grad_norm": 1.431768774986267, "learning_rate": 9.75009217427352e-06, "loss": 0.1279, "step": 29 }, { "epoch": 0.45, "grad_norm": 0.7117260098457336, "learning_rate": 9.723345458039595e-06, "loss": 0.1061, "step": 30 }, { "epoch": 0.46, "grad_norm": 0.45769569277763367, "learning_rate": 9.695279806022391e-06, "loss": 0.0987, "step": 31 }, { "epoch": 0.48, "grad_norm": 0.2953682243824005, "learning_rate": 9.665903055208013e-06, "loss": 0.0919, "step": 32 }, { "epoch": 0.49, "grad_norm": 0.17375054955482483, "learning_rate": 9.635223408690688e-06, "loss": 0.0854, "step": 33 }, { "epoch": 0.51, "grad_norm": 0.11500216275453568, "learning_rate": 9.603249433382145e-06, "loss": 0.086, "step": 34 }, { "epoch": 0.51, "eval_loss": 0.08624568581581116, "eval_runtime": 132.1775, "eval_samples_per_second": 6.998, "eval_steps_per_second": 0.878, "step": 34 }, { "epoch": 0.52, "grad_norm": 0.12646625936031342, "learning_rate": 9.569990057619414e-06, "loss": 0.0851, "step": 35 }, { "epoch": 0.54, "grad_norm": 0.11912114918231964, "learning_rate": 9.535454568671705e-06, "loss": 0.0834, "step": 36 }, { "epoch": 0.55, "grad_norm": 0.24543708562850952, "learning_rate": 9.49965261014704e-06, "loss": 0.0836, "step": 37 }, { "epoch": 0.57, "grad_norm": 0.0936342254281044, "learning_rate": 9.462594179299408e-06, "loss": 0.0844, "step": 38 }, { "epoch": 0.58, "grad_norm": 0.18341362476348877, "learning_rate": 9.424289624237143e-06, "loss": 0.0805, "step": 39 }, { "epoch": 0.6, "grad_norm": 0.21815626323223114, "learning_rate": 9.384749641033358e-06, "loss": 0.0811, "step": 40 }, { "epoch": 0.61, "grad_norm": 0.21489091217517853, "learning_rate": 9.343985270739184e-06, "loss": 0.0793, "step": 41 }, { "epoch": 0.63, "grad_norm": 0.23281769454479218, "learning_rate": 9.302007896300697e-06, "loss": 0.0775, "step": 42 }, { "epoch": 0.64, "grad_norm": 0.6911139488220215, "learning_rate": 9.25882923938038e-06, "loss": 0.0812, "step": 43 }, { "epoch": 0.66, "grad_norm": 0.6087940335273743, "learning_rate": 9.214461357083986e-06, "loss": 0.0801, "step": 44 }, { "epoch": 0.67, "grad_norm": 0.5599693059921265, "learning_rate": 9.168916638593736e-06, "loss": 0.0822, "step": 45 }, { "epoch": 0.69, "grad_norm": 0.23726361989974976, "learning_rate": 9.122207801708802e-06, "loss": 0.0744, "step": 46 }, { "epoch": 0.7, "grad_norm": 1.075922966003418, "learning_rate": 9.074347889294017e-06, "loss": 0.0824, "step": 47 }, { "epoch": 0.72, "grad_norm": 0.49190425872802734, "learning_rate": 9.025350265637816e-06, "loss": 0.0784, "step": 48 }, { "epoch": 0.73, "grad_norm": 0.8844243288040161, "learning_rate": 8.975228612720415e-06, "loss": 0.0722, "step": 49 }, { "epoch": 0.75, "grad_norm": 1.2909235954284668, "learning_rate": 8.923996926393306e-06, "loss": 0.075, "step": 50 }, { "epoch": 0.76, "grad_norm": 0.3661656975746155, "learning_rate": 8.871669512471068e-06, "loss": 0.0732, "step": 51 }, { "epoch": 0.76, "eval_loss": 0.07530223578214645, "eval_runtime": 133.0044, "eval_samples_per_second": 6.955, "eval_steps_per_second": 0.872, "step": 51 }, { "epoch": 0.78, "grad_norm": 0.49425163865089417, "learning_rate": 8.818260982736662e-06, "loss": 0.07, "step": 52 }, { "epoch": 0.79, "grad_norm": 0.9367341995239258, "learning_rate": 8.763786250861258e-06, "loss": 0.0819, "step": 53 }, { "epoch": 0.81, 
"grad_norm": 0.9425927400588989, "learning_rate": 8.708260528239788e-06, "loss": 0.0754, "step": 54 }, { "epoch": 0.82, "grad_norm": 0.3247818052768707, "learning_rate": 8.651699319743348e-06, "loss": 0.0739, "step": 55 }, { "epoch": 0.84, "grad_norm": 0.8091337084770203, "learning_rate": 8.594118419389648e-06, "loss": 0.0724, "step": 56 }, { "epoch": 0.85, "grad_norm": 0.556105375289917, "learning_rate": 8.535533905932739e-06, "loss": 0.0733, "step": 57 }, { "epoch": 0.87, "grad_norm": 0.9925010800361633, "learning_rate": 8.475962138373212e-06, "loss": 0.0761, "step": 58 }, { "epoch": 0.88, "grad_norm": 0.32729703187942505, "learning_rate": 8.415419751390155e-06, "loss": 0.0694, "step": 59 }, { "epoch": 0.9, "grad_norm": 0.2549174129962921, "learning_rate": 8.353923650696119e-06, "loss": 0.0672, "step": 60 }, { "epoch": 0.91, "grad_norm": 0.2688353657722473, "learning_rate": 8.291491008316409e-06, "loss": 0.0694, "step": 61 }, { "epoch": 0.93, "grad_norm": 0.38362765312194824, "learning_rate": 8.228139257794012e-06, "loss": 0.0671, "step": 62 }, { "epoch": 0.94, "grad_norm": 0.8581087589263916, "learning_rate": 8.163886089321493e-06, "loss": 0.0745, "step": 63 }, { "epoch": 0.96, "grad_norm": 0.5643619894981384, "learning_rate": 8.098749444801226e-06, "loss": 0.0681, "step": 64 }, { "epoch": 0.97, "grad_norm": 0.637834906578064, "learning_rate": 8.032747512835338e-06, "loss": 0.0773, "step": 65 }, { "epoch": 0.99, "grad_norm": 0.2533693313598633, "learning_rate": 7.965898723646777e-06, "loss": 0.0663, "step": 66 } ], "logging_steps": 1, "max_steps": 198, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 66, "total_flos": 4.667867604084326e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }