{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.62962962962963, "eval_steps": 500, "global_step": 130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07407407407407407, "grad_norm": 5.0625, "learning_rate": 1.5384615384615387e-05, "loss": 3.0102, "step": 1 }, { "epoch": 0.37037037037037035, "grad_norm": 2.390625, "learning_rate": 7.692307692307693e-05, "loss": 3.0215, "step": 5 }, { "epoch": 0.7407407407407407, "grad_norm": 11.5, "learning_rate": 0.00015384615384615385, "loss": 2.7544, "step": 10 }, { "epoch": 0.9629629629629629, "eval_loss": 2.8721797466278076, "eval_runtime": 0.652, "eval_samples_per_second": 15.338, "eval_steps_per_second": 1.534, "step": 13 }, { "epoch": 1.1111111111111112, "grad_norm": 1.3671875, "learning_rate": 0.00019985583705641418, "loss": 2.379, "step": 15 }, { "epoch": 1.4814814814814814, "grad_norm": 2.609375, "learning_rate": 0.00019823877374156647, "loss": 2.0489, "step": 20 }, { "epoch": 1.8518518518518519, "grad_norm": 1.5234375, "learning_rate": 0.00019485364419471454, "loss": 1.7723, "step": 25 }, { "epoch": 2.0, "eval_loss": 2.60640549659729, "eval_runtime": 0.5469, "eval_samples_per_second": 18.283, "eval_steps_per_second": 1.828, "step": 27 }, { "epoch": 2.2222222222222223, "grad_norm": 0.66015625, "learning_rate": 0.0001897613727639014, "loss": 1.6022, "step": 30 }, { "epoch": 2.5925925925925926, "grad_norm": 0.65234375, "learning_rate": 0.00018305360832480117, "loss": 1.4855, "step": 35 }, { "epoch": 2.962962962962963, "grad_norm": 0.58203125, "learning_rate": 0.00017485107481711012, "loss": 1.4023, "step": 40 }, { "epoch": 2.962962962962963, "eval_loss": 2.5709500312805176, "eval_runtime": 0.7217, "eval_samples_per_second": 13.857, "eval_steps_per_second": 1.386, "step": 40 }, { "epoch": 3.3333333333333335, "grad_norm": 0.458984375, "learning_rate": 0.0001653013984983585, "loss": 1.3253, "step": 45 }, { "epoch": 3.7037037037037037, "grad_norm": 0.50390625, "learning_rate": 0.00015457645101945046, "loss": 1.2778, "step": 50 }, { "epoch": 4.0, "eval_loss": 2.5349316596984863, "eval_runtime": 0.547, "eval_samples_per_second": 18.283, "eval_steps_per_second": 1.828, "step": 54 }, { "epoch": 4.074074074074074, "grad_norm": 0.376953125, "learning_rate": 0.00014286925614030542, "loss": 1.2498, "step": 55 }, { "epoch": 4.444444444444445, "grad_norm": 0.267578125, "learning_rate": 0.0001303905157574247, "loss": 1.2221, "step": 60 }, { "epoch": 4.814814814814815, "grad_norm": 0.296875, "learning_rate": 0.00011736481776669306, "loss": 1.1848, "step": 65 }, { "epoch": 4.962962962962963, "eval_loss": 2.5175788402557373, "eval_runtime": 0.6693, "eval_samples_per_second": 14.942, "eval_steps_per_second": 1.494, "step": 67 }, { "epoch": 5.185185185185185, "grad_norm": 0.291015625, "learning_rate": 0.00010402659401094152, "loss": 1.1814, "step": 70 }, { "epoch": 5.555555555555555, "grad_norm": 0.33203125, "learning_rate": 9.061590105968208e-05, "loss": 1.1574, "step": 75 }, { "epoch": 5.925925925925926, "grad_norm": 0.349609375, "learning_rate": 7.73740997570278e-05, "loss": 1.1522, "step": 80 }, { "epoch": 6.0, "eval_loss": 2.5044538974761963, "eval_runtime": 0.5444, "eval_samples_per_second": 18.369, "eval_steps_per_second": 1.837, "step": 81 }, { "epoch": 6.296296296296296, "grad_norm": 0.2412109375, "learning_rate": 6.453951129574644e-05, "loss": 1.1367, "step": 85 }, { "epoch": 6.666666666666667, "grad_norm": 0.28515625, "learning_rate": 5.234312799786921e-05, "loss": 1.1305, "step": 90 }, { "epoch": 6.962962962962963, "eval_loss": 2.506514310836792, "eval_runtime": 0.685, "eval_samples_per_second": 14.598, "eval_steps_per_second": 1.46, "step": 94 }, { "epoch": 7.037037037037037, "grad_norm": 0.2265625, "learning_rate": 4.100445599768774e-05, "loss": 1.1188, "step": 95 }, { "epoch": 7.407407407407407, "grad_norm": 0.2099609375, "learning_rate": 3.072756464904006e-05, "loss": 1.1222, "step": 100 }, { "epoch": 7.777777777777778, "grad_norm": 0.248046875, "learning_rate": 2.1697413758237784e-05, "loss": 1.1075, "step": 105 }, { "epoch": 8.0, "eval_loss": 2.5136494636535645, "eval_runtime": 0.5462, "eval_samples_per_second": 18.307, "eval_steps_per_second": 1.831, "step": 108 }, { "epoch": 8.148148148148149, "grad_norm": 0.2470703125, "learning_rate": 1.4076524743778319e-05, "loss": 1.1126, "step": 110 }, { "epoch": 8.518518518518519, "grad_norm": 0.236328125, "learning_rate": 8.002055634117578e-06, "loss": 1.1118, "step": 115 }, { "epoch": 8.88888888888889, "grad_norm": 0.2275390625, "learning_rate": 3.5833325466437694e-06, "loss": 1.1049, "step": 120 }, { "epoch": 8.962962962962964, "eval_loss": 2.512882709503174, "eval_runtime": 0.6584, "eval_samples_per_second": 15.188, "eval_steps_per_second": 1.519, "step": 121 }, { "epoch": 9.25925925925926, "grad_norm": 0.2216796875, "learning_rate": 8.998820754091531e-07, "loss": 1.1116, "step": 125 }, { "epoch": 9.62962962962963, "grad_norm": 0.20703125, "learning_rate": 0.0, "loss": 1.1048, "step": 130 }, { "epoch": 9.62962962962963, "eval_loss": 2.513336658477783, "eval_runtime": 0.5375, "eval_samples_per_second": 18.606, "eval_steps_per_second": 1.861, "step": 130 }, { "epoch": 9.62962962962963, "step": 130, "total_flos": 1.018113810235392e+17, "train_loss": 1.4375356710874116, "train_runtime": 456.268, "train_samples_per_second": 18.301, "train_steps_per_second": 0.285 } ], "logging_steps": 5, "max_steps": 130, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.018113810235392e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }