{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.018703824932198635, "eval_steps": 4, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00037407649864397267, "grad_norm": 9.773622512817383, "learning_rate": 2e-05, "loss": 4.8339, "step": 1 }, { "epoch": 0.00037407649864397267, "eval_loss": NaN, "eval_runtime": 22.0999, "eval_samples_per_second": 25.475, "eval_steps_per_second": 25.475, "step": 1 }, { "epoch": 0.0007481529972879453, "grad_norm": 8.580421447753906, "learning_rate": 4e-05, "loss": 4.5196, "step": 2 }, { "epoch": 0.0011222294959319181, "grad_norm": 8.180211067199707, "learning_rate": 6e-05, "loss": 4.344, "step": 3 }, { "epoch": 0.0014963059945758907, "grad_norm": 10.246196746826172, "learning_rate": 8e-05, "loss": 4.743, "step": 4 }, { "epoch": 0.0014963059945758907, "eval_loss": NaN, "eval_runtime": 21.684, "eval_samples_per_second": 25.964, "eval_steps_per_second": 25.964, "step": 4 }, { "epoch": 0.0018703824932198635, "grad_norm": 9.82059383392334, "learning_rate": 0.0001, "loss": 4.1244, "step": 5 }, { "epoch": 0.0022444589918638362, "grad_norm": 10.244694709777832, "learning_rate": 0.00012, "loss": 3.5537, "step": 6 }, { "epoch": 0.002618535490507809, "grad_norm": 9.037696838378906, "learning_rate": 0.00014, "loss": 2.7798, "step": 7 }, { "epoch": 0.0029926119891517814, "grad_norm": 10.064749717712402, "learning_rate": 0.00016, "loss": 2.0157, "step": 8 }, { "epoch": 0.0029926119891517814, "eval_loss": NaN, "eval_runtime": 21.7463, "eval_samples_per_second": 25.889, "eval_steps_per_second": 25.889, "step": 8 }, { "epoch": 0.0033666884877957544, "grad_norm": 10.570599555969238, "learning_rate": 0.00018, "loss": 1.5668, "step": 9 }, { "epoch": 0.003740764986439727, "grad_norm": 14.205665588378906, "learning_rate": 0.0002, "loss": 1.1739, "step": 10 }, { "epoch": 0.0041148414850837, "grad_norm": 11.37263011932373, "learning_rate": 0.0001996917333733128, "loss": 1.1603, "step": 11 }, { "epoch": 0.0044889179837276725, "grad_norm": 5.627699851989746, "learning_rate": 0.00019876883405951377, "loss": 0.8112, "step": 12 }, { "epoch": 0.0044889179837276725, "eval_loss": NaN, "eval_runtime": 21.8678, "eval_samples_per_second": 25.746, "eval_steps_per_second": 25.746, "step": 12 }, { "epoch": 0.004862994482371645, "grad_norm": 12.806142807006836, "learning_rate": 0.00019723699203976766, "loss": 0.7784, "step": 13 }, { "epoch": 0.005237070981015618, "grad_norm": 8.642436981201172, "learning_rate": 0.00019510565162951537, "loss": 0.8493, "step": 14 }, { "epoch": 0.00561114747965959, "grad_norm": 6.148828506469727, "learning_rate": 0.0001923879532511287, "loss": 0.656, "step": 15 }, { "epoch": 0.005985223978303563, "grad_norm": 8.592019081115723, "learning_rate": 0.0001891006524188368, "loss": 1.0952, "step": 16 }, { "epoch": 0.005985223978303563, "eval_loss": NaN, "eval_runtime": 21.7489, "eval_samples_per_second": 25.886, "eval_steps_per_second": 25.886, "step": 16 }, { "epoch": 0.006359300476947536, "grad_norm": 2.953862190246582, "learning_rate": 0.00018526401643540922, "loss": 0.7376, "step": 17 }, { "epoch": 0.006733376975591509, "grad_norm": 10.415821075439453, "learning_rate": 0.00018090169943749476, "loss": 0.9074, "step": 18 }, { "epoch": 0.007107453474235481, "grad_norm": 8.800667762756348, "learning_rate": 0.0001760405965600031, "loss": 0.9933, "step": 19 }, { "epoch": 0.007481529972879454, "grad_norm": 5.8638386726379395, "learning_rate": 0.00017071067811865476, 
"loss": 0.8282, "step": 20 }, { "epoch": 0.007481529972879454, "eval_loss": NaN, "eval_runtime": 21.7567, "eval_samples_per_second": 25.877, "eval_steps_per_second": 25.877, "step": 20 }, { "epoch": 0.007855606471523427, "grad_norm": 6.939398765563965, "learning_rate": 0.00016494480483301836, "loss": 0.9589, "step": 21 }, { "epoch": 0.0082296829701674, "grad_norm": 5.484954357147217, "learning_rate": 0.00015877852522924732, "loss": 0.7741, "step": 22 }, { "epoch": 0.008603759468811372, "grad_norm": 2.173372983932495, "learning_rate": 0.0001522498564715949, "loss": 0.7695, "step": 23 }, { "epoch": 0.008977835967455345, "grad_norm": 2.9671242237091064, "learning_rate": 0.00014539904997395468, "loss": 0.7306, "step": 24 }, { "epoch": 0.008977835967455345, "eval_loss": NaN, "eval_runtime": 21.7415, "eval_samples_per_second": 25.895, "eval_steps_per_second": 25.895, "step": 24 }, { "epoch": 0.009351912466099318, "grad_norm": 4.0381903648376465, "learning_rate": 0.000138268343236509, "loss": 0.7789, "step": 25 }, { "epoch": 0.00972598896474329, "grad_norm": 4.545593738555908, "learning_rate": 0.00013090169943749476, "loss": 0.769, "step": 26 }, { "epoch": 0.010100065463387263, "grad_norm": 2.968888521194458, "learning_rate": 0.00012334453638559057, "loss": 0.8867, "step": 27 }, { "epoch": 0.010474141962031235, "grad_norm": 2.718158006668091, "learning_rate": 0.0001156434465040231, "loss": 0.8066, "step": 28 }, { "epoch": 0.010474141962031235, "eval_loss": NaN, "eval_runtime": 21.8902, "eval_samples_per_second": 25.719, "eval_steps_per_second": 25.719, "step": 28 }, { "epoch": 0.010848218460675208, "grad_norm": 4.827681541442871, "learning_rate": 0.0001078459095727845, "loss": 0.713, "step": 29 }, { "epoch": 0.01122229495931918, "grad_norm": 2.4585800170898438, "learning_rate": 0.0001, "loss": 0.82, "step": 30 }, { "epoch": 0.011596371457963153, "grad_norm": 3.3518271446228027, "learning_rate": 9.215409042721552e-05, "loss": 0.6706, "step": 31 }, { "epoch": 0.011970447956607125, "grad_norm": 1.3497822284698486, "learning_rate": 8.435655349597689e-05, "loss": 0.7892, "step": 32 }, { "epoch": 0.011970447956607125, "eval_loss": NaN, "eval_runtime": 22.3464, "eval_samples_per_second": 25.194, "eval_steps_per_second": 25.194, "step": 32 }, { "epoch": 0.012344524455251098, "grad_norm": 2.0146560668945312, "learning_rate": 7.66554636144095e-05, "loss": 0.8602, "step": 33 }, { "epoch": 0.012718600953895072, "grad_norm": 2.6399552822113037, "learning_rate": 6.909830056250527e-05, "loss": 0.6844, "step": 34 }, { "epoch": 0.013092677452539045, "grad_norm": 2.6146562099456787, "learning_rate": 6.173165676349103e-05, "loss": 0.6531, "step": 35 }, { "epoch": 0.013466753951183017, "grad_norm": 3.0831737518310547, "learning_rate": 5.4600950026045326e-05, "loss": 0.8027, "step": 36 }, { "epoch": 0.013466753951183017, "eval_loss": NaN, "eval_runtime": 21.7733, "eval_samples_per_second": 25.857, "eval_steps_per_second": 25.857, "step": 36 }, { "epoch": 0.01384083044982699, "grad_norm": 2.011878252029419, "learning_rate": 4.7750143528405126e-05, "loss": 0.6398, "step": 37 }, { "epoch": 0.014214906948470963, "grad_norm": 2.133044958114624, "learning_rate": 4.12214747707527e-05, "loss": 0.7345, "step": 38 }, { "epoch": 0.014588983447114935, "grad_norm": 3.275799036026001, "learning_rate": 3.5055195166981645e-05, "loss": 0.5697, "step": 39 }, { "epoch": 0.014963059945758908, "grad_norm": 1.622467279434204, "learning_rate": 2.9289321881345254e-05, "loss": 0.5799, "step": 40 }, { "epoch": 0.014963059945758908, 
"eval_loss": NaN, "eval_runtime": 21.7692, "eval_samples_per_second": 25.862, "eval_steps_per_second": 25.862, "step": 40 }, { "epoch": 0.01533713644440288, "grad_norm": 3.586545944213867, "learning_rate": 2.3959403439996907e-05, "loss": 0.7168, "step": 41 }, { "epoch": 0.015711212943046855, "grad_norm": 3.455106496810913, "learning_rate": 1.9098300562505266e-05, "loss": 0.6883, "step": 42 }, { "epoch": 0.016085289441690825, "grad_norm": 1.5402703285217285, "learning_rate": 1.4735983564590783e-05, "loss": 0.6855, "step": 43 }, { "epoch": 0.0164593659403348, "grad_norm": 3.069215774536133, "learning_rate": 1.0899347581163221e-05, "loss": 0.4914, "step": 44 }, { "epoch": 0.0164593659403348, "eval_loss": NaN, "eval_runtime": 21.7546, "eval_samples_per_second": 25.88, "eval_steps_per_second": 25.88, "step": 44 }, { "epoch": 0.01683344243897877, "grad_norm": 3.156932830810547, "learning_rate": 7.612046748871327e-06, "loss": 0.7092, "step": 45 }, { "epoch": 0.017207518937622745, "grad_norm": 2.049065589904785, "learning_rate": 4.8943483704846475e-06, "loss": 0.572, "step": 46 }, { "epoch": 0.017581595436266716, "grad_norm": 2.168691396713257, "learning_rate": 2.7630079602323442e-06, "loss": 0.6909, "step": 47 }, { "epoch": 0.01795567193491069, "grad_norm": 3.115194320678711, "learning_rate": 1.231165940486234e-06, "loss": 0.6867, "step": 48 }, { "epoch": 0.01795567193491069, "eval_loss": NaN, "eval_runtime": 21.7292, "eval_samples_per_second": 25.91, "eval_steps_per_second": 25.91, "step": 48 }, { "epoch": 0.01832974843355466, "grad_norm": 1.8398240804672241, "learning_rate": 3.0826662668720364e-07, "loss": 0.7688, "step": 49 }, { "epoch": 0.018703824932198635, "grad_norm": 2.1569676399230957, "learning_rate": 0.0, "loss": 0.636, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2419306423910400.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }