{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.945945945945946, "eval_steps": 500, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21621621621621623, "grad_norm": 0.1806640625, "learning_rate": 0.00019976687691905393, "loss": 0.7107, "step": 10 }, { "epoch": 0.43243243243243246, "grad_norm": 0.235595703125, "learning_rate": 0.00019906859460363307, "loss": 0.6952, "step": 20 }, { "epoch": 0.6486486486486487, "grad_norm": 0.423583984375, "learning_rate": 0.00019790840876823232, "loss": 0.6967, "step": 30 }, { "epoch": 0.8648648648648649, "grad_norm": 0.3974609375, "learning_rate": 0.00019629172873477995, "loss": 0.7649, "step": 40 }, { "epoch": 1.0810810810810811, "grad_norm": 0.327880859375, "learning_rate": 0.00019422609221188207, "loss": 0.7627, "step": 50 }, { "epoch": 1.2972972972972974, "grad_norm": 0.31689453125, "learning_rate": 0.00019172113015054532, "loss": 0.7527, "step": 60 }, { "epoch": 1.5135135135135136, "grad_norm": 0.4111328125, "learning_rate": 0.0001887885218402375, "loss": 0.7536, "step": 70 }, { "epoch": 1.7297297297297298, "grad_norm": 1.376953125, "learning_rate": 0.00018544194045464886, "loss": 0.7241, "step": 80 }, { "epoch": 1.945945945945946, "grad_norm": 0.387939453125, "learning_rate": 0.0001816969893010442, "loss": 0.757, "step": 90 }, { "epoch": 2.1621621621621623, "grad_norm": 0.366455078125, "learning_rate": 0.000177571129070442, "loss": 0.7339, "step": 100 }, { "epoch": 2.3783783783783785, "grad_norm": 0.419677734375, "learning_rate": 0.00017308359642781242, "loss": 0.7005, "step": 110 }, { "epoch": 2.5945945945945947, "grad_norm": 0.38818359375, "learning_rate": 0.00016825531432186543, "loss": 0.7167, "step": 120 }, { "epoch": 2.810810810810811, "grad_norm": 0.452880859375, "learning_rate": 0.00016310879443260528, "loss": 0.7403, "step": 130 }, { "epoch": 3.027027027027027, "grad_norm": 0.412841796875, "learning_rate": 0.00015766803221148673, "loss": 0.7248, "step": 140 }, { "epoch": 3.2432432432432434, "grad_norm": 0.410400390625, "learning_rate": 0.00015195839500354335, "loss": 0.7208, "step": 150 }, { "epoch": 3.4594594594594597, "grad_norm": 0.484619140625, "learning_rate": 0.00014600650377311522, "loss": 0.7079, "step": 160 }, { "epoch": 3.6756756756756754, "grad_norm": 0.429443359375, "learning_rate": 0.00013984010898462416, "loss": 0.6847, "step": 170 }, { "epoch": 3.891891891891892, "grad_norm": 0.437744140625, "learning_rate": 0.00013348796121709862, "loss": 0.7104, "step": 180 }, { "epoch": 4.108108108108108, "grad_norm": 0.4140625, "learning_rate": 0.00012697967711570242, "loss": 0.682, "step": 190 }, { "epoch": 4.324324324324325, "grad_norm": 0.484130859375, "learning_rate": 0.0001203456013052634, "loss": 0.6754, "step": 200 }, { "epoch": 4.54054054054054, "grad_norm": 0.474853515625, "learning_rate": 0.00011361666490962468, "loss": 0.6671, "step": 210 }, { "epoch": 4.756756756756757, "grad_norm": 0.474365234375, "learning_rate": 0.0001068242413364671, "loss": 0.6956, "step": 220 }, { "epoch": 4.972972972972973, "grad_norm": 0.484130859375, "learning_rate": 0.0001, "loss": 0.6679, "step": 230 }, { "epoch": 5.1891891891891895, "grad_norm": 0.47265625, "learning_rate": 9.317575866353292e-05, "loss": 0.6496, "step": 240 }, { "epoch": 5.405405405405405, "grad_norm": 0.44482421875, "learning_rate": 8.638333509037536e-05, "loss": 0.6547, "step": 250 }, { "epoch": 5.621621621621622, "grad_norm": 0.488037109375, "learning_rate": 7.965439869473664e-05, "loss": 0.6739, "step": 260 }, { "epoch": 5.837837837837838, "grad_norm": 0.4658203125, "learning_rate": 7.302032288429756e-05, "loss": 0.6501, "step": 270 }, { "epoch": 6.054054054054054, "grad_norm": 0.48974609375, "learning_rate": 6.651203878290139e-05, "loss": 0.6773, "step": 280 }, { "epoch": 6.27027027027027, "grad_norm": 0.4775390625, "learning_rate": 6.015989101537586e-05, "loss": 0.6298, "step": 290 }, { "epoch": 6.486486486486487, "grad_norm": 0.521484375, "learning_rate": 5.399349622688479e-05, "loss": 0.6464, "step": 300 }, { "epoch": 6.702702702702703, "grad_norm": 0.5234375, "learning_rate": 4.804160499645667e-05, "loss": 0.6369, "step": 310 }, { "epoch": 6.918918918918919, "grad_norm": 0.59765625, "learning_rate": 4.2331967788513295e-05, "loss": 0.6734, "step": 320 }, { "epoch": 7.135135135135135, "grad_norm": 0.54736328125, "learning_rate": 3.689120556739475e-05, "loss": 0.6321, "step": 330 }, { "epoch": 7.351351351351352, "grad_norm": 0.57080078125, "learning_rate": 3.174468567813461e-05, "loss": 0.6261, "step": 340 }, { "epoch": 7.5675675675675675, "grad_norm": 0.47705078125, "learning_rate": 2.691640357218759e-05, "loss": 0.6196, "step": 350 }, { "epoch": 7.783783783783784, "grad_norm": 0.5712890625, "learning_rate": 2.242887092955801e-05, "loss": 0.6505, "step": 360 }, { "epoch": 8.0, "grad_norm": 0.50732421875, "learning_rate": 1.8303010698955804e-05, "loss": 0.6331, "step": 370 }, { "epoch": 8.216216216216216, "grad_norm": 0.48046875, "learning_rate": 1.4558059545351143e-05, "loss": 0.6366, "step": 380 }, { "epoch": 8.432432432432432, "grad_norm": 0.49951171875, "learning_rate": 1.1211478159762478e-05, "loss": 0.641, "step": 390 }, { "epoch": 8.64864864864865, "grad_norm": 0.5078125, "learning_rate": 8.278869849454718e-06, "loss": 0.6233, "step": 400 }, { "epoch": 8.864864864864865, "grad_norm": 0.50634765625, "learning_rate": 5.77390778811796e-06, "loss": 0.6185, "step": 410 }, { "epoch": 9.08108108108108, "grad_norm": 0.53125, "learning_rate": 3.7082712652200867e-06, "loss": 0.6224, "step": 420 }, { "epoch": 9.297297297297296, "grad_norm": 0.487548828125, "learning_rate": 2.091591231767709e-06, "loss": 0.6054, "step": 430 }, { "epoch": 9.513513513513514, "grad_norm": 0.5166015625, "learning_rate": 9.314053963669245e-07, "loss": 0.6299, "step": 440 }, { "epoch": 9.72972972972973, "grad_norm": 0.4990234375, "learning_rate": 2.3312308094607382e-07, "loss": 0.6193, "step": 450 }, { "epoch": 9.945945945945946, "grad_norm": 0.53271484375, "learning_rate": 0.0, "loss": 0.6231, "step": 460 }, { "epoch": 9.945945945945946, "step": 460, "total_flos": 2.244679865204736e+16, "train_loss": 0.6764944273492565, "train_runtime": 411.2593, "train_samples_per_second": 4.498, "train_steps_per_second": 1.119 } ], "logging_steps": 10, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.244679865204736e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }