{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.777777777777779, "eval_steps": 500, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2962962962962963, "grad_norm": 0.83837890625, "learning_rate": 0.00019954719225730847, "loss": 1.6268, "step": 10 }, { "epoch": 0.5925925925925926, "grad_norm": 0.63134765625, "learning_rate": 0.00019819286972627066, "loss": 1.0878, "step": 20 }, { "epoch": 0.8888888888888888, "grad_norm": 0.63623046875, "learning_rate": 0.00019594929736144976, "loss": 0.9758, "step": 30 }, { "epoch": 1.1851851851851851, "grad_norm": 0.54345703125, "learning_rate": 0.00019283679330160726, "loss": 0.8484, "step": 40 }, { "epoch": 1.4814814814814814, "grad_norm": 0.355712890625, "learning_rate": 0.00018888354486549237, "loss": 0.7642, "step": 50 }, { "epoch": 1.7777777777777777, "grad_norm": 0.90869140625, "learning_rate": 0.00018412535328311814, "loss": 0.7435, "step": 60 }, { "epoch": 2.074074074074074, "grad_norm": 0.35498046875, "learning_rate": 0.00017860530947427875, "loss": 0.7328, "step": 70 }, { "epoch": 2.3703703703703702, "grad_norm": 0.37548828125, "learning_rate": 0.00017237340381050703, "loss": 0.7266, "step": 80 }, { "epoch": 2.6666666666666665, "grad_norm": 0.319091796875, "learning_rate": 0.00016548607339452853, "loss": 0.7161, "step": 90 }, { "epoch": 2.962962962962963, "grad_norm": 0.446044921875, "learning_rate": 0.00015800569095711982, "loss": 0.6811, "step": 100 }, { "epoch": 3.259259259259259, "grad_norm": 0.3486328125, "learning_rate": 0.00015000000000000001, "loss": 0.6885, "step": 110 }, { "epoch": 3.5555555555555554, "grad_norm": 0.312255859375, "learning_rate": 0.00014154150130018866, "loss": 0.6989, "step": 120 }, { "epoch": 3.851851851851852, "grad_norm": 0.402099609375, "learning_rate": 0.00013270679633174218, "loss": 0.6812, "step": 130 }, { "epoch": 4.148148148148148, "grad_norm": 0.3740234375, "learning_rate": 0.00012357589355094275, "loss": 0.6903, "step": 140 }, { "epoch": 4.444444444444445, "grad_norm": 0.400390625, "learning_rate": 0.00011423148382732853, "loss": 0.6746, "step": 150 }, { "epoch": 4.7407407407407405, "grad_norm": 0.3525390625, "learning_rate": 0.00010475819158237425, "loss": 0.6626, "step": 160 }, { "epoch": 5.037037037037037, "grad_norm": 0.328857421875, "learning_rate": 9.524180841762577e-05, "loss": 0.6716, "step": 170 }, { "epoch": 5.333333333333333, "grad_norm": 0.336669921875, "learning_rate": 8.57685161726715e-05, "loss": 0.6314, "step": 180 }, { "epoch": 5.62962962962963, "grad_norm": 0.336669921875, "learning_rate": 7.642410644905726e-05, "loss": 0.649, "step": 190 }, { "epoch": 5.925925925925926, "grad_norm": 1.4248046875, "learning_rate": 6.729320366825784e-05, "loss": 0.6906, "step": 200 }, { "epoch": 6.222222222222222, "grad_norm": 0.349365234375, "learning_rate": 5.845849869981137e-05, "loss": 0.644, "step": 210 }, { "epoch": 6.518518518518518, "grad_norm": 0.36474609375, "learning_rate": 5.000000000000002e-05, "loss": 0.6315, "step": 220 }, { "epoch": 6.814814814814815, "grad_norm": 0.377197265625, "learning_rate": 4.19943090428802e-05, "loss": 0.6505, "step": 230 }, { "epoch": 7.111111111111111, "grad_norm": 0.349853515625, "learning_rate": 3.45139266054715e-05, "loss": 0.6518, "step": 240 }, { "epoch": 7.407407407407407, "grad_norm": 0.40966796875, "learning_rate": 2.7626596189492983e-05, "loss": 0.6243, "step": 250 }, { "epoch": 7.703703703703704, "grad_norm": 0.359619140625, "learning_rate": 
2.139469052572127e-05, "loss": 0.6406, "step": 260 }, { "epoch": 8.0, "grad_norm": 0.385498046875, "learning_rate": 1.587464671688187e-05, "loss": 0.655, "step": 270 }, { "epoch": 8.296296296296296, "grad_norm": 0.386474609375, "learning_rate": 1.1116455134507664e-05, "loss": 0.6513, "step": 280 }, { "epoch": 8.592592592592592, "grad_norm": 0.384765625, "learning_rate": 7.163206698392744e-06, "loss": 0.6311, "step": 290 }, { "epoch": 8.88888888888889, "grad_norm": 0.341064453125, "learning_rate": 4.050702638550275e-06, "loss": 0.6223, "step": 300 }, { "epoch": 9.185185185185185, "grad_norm": 0.36572265625, "learning_rate": 1.8071302737293295e-06, "loss": 0.6275, "step": 310 }, { "epoch": 9.481481481481481, "grad_norm": 0.380859375, "learning_rate": 4.5280774269154115e-07, "loss": 0.6198, "step": 320 }, { "epoch": 9.777777777777779, "grad_norm": 0.39892578125, "learning_rate": 0.0, "loss": 0.6337, "step": 330 }, { "epoch": 9.777777777777779, "step": 330, "total_flos": 1.610313816342528e+16, "train_loss": 0.7250095107338645, "train_runtime": 297.4234, "train_samples_per_second": 4.539, "train_steps_per_second": 1.11 } ], "logging_steps": 10, "max_steps": 330, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.610313816342528e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }