{ "best_metric": null, "best_model_checkpoint": null, "epoch": null, "eval_steps": 500, "global_step": 28, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1e-05, "loss": 0.0299, "reward": -5.125, "step": 0 }, { "epoch": 0.07, "learning_rate": 1e-05, "loss": 0.0401, "reward": -6.8318, "step": 1 }, { "epoch": 0.14, "learning_rate": 1e-05, "loss": 0.029, "reward": -5.9028, "step": 2 }, { "epoch": 0.21, "learning_rate": 1e-05, "loss": 0.0381, "reward": -6.75, "step": 3 }, { "epoch": 0.29, "learning_rate": 1e-05, "loss": 0.0364, "reward": -5.7639, "step": 4 }, { "epoch": 0.36, "learning_rate": 1e-05, "loss": 0.0378, "reward": -6.25, "step": 5 }, { "epoch": 0.43, "learning_rate": 1e-05, "loss": 0.0352, "reward": -6.6071, "step": 6 }, { "epoch": 0.5, "learning_rate": 1e-05, "loss": 0.0275, "reward": -3.9205, "step": 7 }, { "epoch": 0.57, "learning_rate": 1e-05, "loss": 0.0313, "reward": -7.0625, "step": 8 }, { "epoch": 0.64, "learning_rate": 1e-05, "loss": 0.0312, "reward": -5.8546, "step": 9 }, { "epoch": 0.71, "learning_rate": 1e-05, "loss": 0.0261, "reward": -1.9861, "step": 10 }, { "epoch": 0.79, "learning_rate": 1e-05, "loss": 0.0349, "reward": -7.5, "step": 11 }, { "epoch": 0.86, "learning_rate": 1e-05, "loss": 0.0346, "reward": -6.3542, "step": 12 }, { "epoch": 0.93, "learning_rate": 1e-05, "loss": 0.0346, "reward": -4.8214, "step": 13 }, { "epoch": 1.0, "learning_rate": 1e-05, "loss": 0.0299, "reward": -6.0794, "step": 14 }, { "epoch": 1.07, "learning_rate": 1e-05, "loss": 0.0365, "reward": -8.75, "step": 15 }, { "epoch": 1.14, "learning_rate": 1e-05, "loss": 0.0273, "reward": -6.6927, "step": 16 }, { "epoch": 1.21, "learning_rate": 1e-05, "loss": 0.0266, "reward": -2.7678, "step": 17 }, { "epoch": 1.29, "learning_rate": 1e-05, "loss": 0.0211, "reward": -3.5, "step": 18 }, { "epoch": 1.36, "learning_rate": 1e-05, "loss": 0.0271, "reward": -5.3509, "step": 19 }, { "epoch": 1.43, "learning_rate": 1e-05, "loss": 0.025, "reward": -1.309, "step": 20 }, { "epoch": 1.5, "learning_rate": 1e-05, "loss": 0.0215, "reward": -4.0697, "step": 21 }, { "epoch": 1.57, "learning_rate": 1e-05, "loss": 0.018, "reward": -5.4167, "step": 22 }, { "epoch": 1.64, "learning_rate": 1e-05, "loss": 0.0293, "reward": -4.5805, "step": 23 }, { "epoch": 1.71, "learning_rate": 1e-05, "loss": 0.0249, "reward": -5.7969, "step": 24 }, { "epoch": 1.79, "learning_rate": 1e-05, "loss": 0.0167, "reward": -4.0278, "step": 25 }, { "epoch": 1.86, "learning_rate": 1e-05, "loss": 0.026, "reward": -4.25, "step": 26 }, { "epoch": 1.93, "learning_rate": 1e-05, "loss": 0.0181, "reward": -3.6364, "step": 27 } ], "logging_steps": 500, "max_steps": 28, "num_input_tokens_seen": 0, "num_train_epochs": 2.0, "save_steps": 500, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }