{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 10000, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage": 2.050441083312035, "completion_length": 549.2318519592285, "epoch": 0.064, "grad_norm": 0.3377276062965393, "kl": 0.00016334056854248047, "learning_rate": 1.875e-06, "loss": 0.0, "reward": 1.4327381163835526, "reward_std": 0.2913255948573351, "rewards/accuracy_reward": 0.43273810148239134, "rewards/format_reward": 1.0, "step": 5 }, { "advantage": 1.8731657922267915, "completion_length": 535.2622138977051, "epoch": 0.128, "grad_norm": 0.2055761218070984, "kl": 0.001010751724243164, "learning_rate": 2.993961440992859e-06, "loss": 0.0, "reward": 1.500892886519432, "reward_std": 0.25598252601921556, "rewards/accuracy_reward": 0.5008928671479225, "rewards/format_reward": 1.0, "step": 10 }, { "advantage": 1.8132987678050996, "completion_length": 546.9449508666992, "epoch": 0.192, "grad_norm": 0.22043277323246002, "kl": 0.00526275634765625, "learning_rate": 2.9265847744427307e-06, "loss": 0.0002, "reward": 1.535119077563286, "reward_std": 0.23049755450338125, "rewards/accuracy_reward": 0.5351190570741892, "rewards/format_reward": 1.0, "step": 15 }, { "advantage": 1.9496307998895646, "completion_length": 559.6901908874512, "epoch": 0.256, "grad_norm": 0.18047218024730682, "kl": 0.006356048583984375, "learning_rate": 2.7876731904027993e-06, "loss": 0.0003, "reward": 1.5279762238264083, "reward_std": 0.21949078943580388, "rewards/accuracy_reward": 0.5279762007296085, "rewards/format_reward": 1.0, "step": 20 }, { "advantage": 1.9445811256766319, "completion_length": 561.3247123718262, "epoch": 0.32, "grad_norm": 0.17815344035625458, "kl": 0.00570068359375, "learning_rate": 2.584192295741087e-06, "loss": 0.0002, "reward": 1.5446428924798965, "reward_std": 0.23059090217575431, "rewards/accuracy_reward": 0.5446428753435612, "rewards/format_reward": 1.0, "step": 25 }, { "advantage": 1.7600619524717331, "completion_length": 547.6345359802247, "epoch": 0.384, "grad_norm": 0.18666820228099823, "kl": 0.00573883056640625, "learning_rate": 2.3263454721781537e-06, "loss": 0.0002, "reward": 1.5431547909975052, "reward_std": 0.21657919492572547, "rewards/accuracy_reward": 0.5431547746062279, "rewards/format_reward": 1.0, "step": 30 }, { "advantage": 1.7993007361888886, "completion_length": 546.9157859802247, "epoch": 0.448, "grad_norm": 0.1700781136751175, "kl": 0.00593109130859375, "learning_rate": 2.027062236122014e-06, "loss": 0.0002, "reward": 1.5273809880018234, "reward_std": 0.2096662785857916, "rewards/accuracy_reward": 0.5273809637874365, "rewards/format_reward": 1.0, "step": 35 }, { "advantage": 1.896444058418274, "completion_length": 552.3208389282227, "epoch": 0.512, "grad_norm": 0.1920849233865738, "kl": 0.005233001708984375, "learning_rate": 1.7013498987264833e-06, "loss": 0.0002, "reward": 1.5434524178504945, "reward_std": 0.22238524220883846, "rewards/accuracy_reward": 0.5434523917734623, "rewards/format_reward": 1.0, "step": 40 }, { "advantage": 1.870384192466736, "completion_length": 558.927098083496, "epoch": 0.576, "grad_norm": 0.18718139827251434, "kl": 0.0053081512451171875, "learning_rate": 1.3655410366448499e-06, "loss": 0.0002, "reward": 1.5154762089252471, "reward_std": 0.21234574727714062, "rewards/accuracy_reward": 0.5154761992394924, "rewards/format_reward": 1.0, "step": 45 }, { "advantage": 1.9293582290410995, "completion_length": 554.5389984130859, "epoch": 0.64, "grad_norm": 0.2384500652551651, "kl": 0.0057861328125, "learning_rate": 1.036474508437579e-06, "loss": 0.0002, "reward": 1.5339285999536514, "reward_std": 0.2257133638486266, "rewards/accuracy_reward": 0.5339285805821419, "rewards/format_reward": 1.0, "step": 50 }, { "advantage": 1.9427187949419022, "completion_length": 556.450008392334, "epoch": 0.704, "grad_norm": 0.19755816459655762, "kl": 0.005307769775390625, "learning_rate": 7.30651083891141e-07, "loss": 0.0002, "reward": 1.5452381312847137, "reward_std": 0.2197699649259448, "rewards/accuracy_reward": 0.545238108932972, "rewards/format_reward": 1.0, "step": 55 }, { "advantage": 1.8897637754678727, "completion_length": 560.4640045166016, "epoch": 0.768, "grad_norm": 0.24272561073303223, "kl": 0.005287933349609375, "learning_rate": 4.63406026519703e-07, "loss": 0.0002, "reward": 1.525297647714615, "reward_std": 0.24526160284876825, "rewards/accuracy_reward": 0.5252976305782795, "rewards/format_reward": 1.0, "step": 60 }, { "advantage": 1.9503421396017075, "completion_length": 557.6199478149414, "epoch": 0.832, "grad_norm": 0.18535475432872772, "kl": 0.005422210693359375, "learning_rate": 2.48140119418046e-07, "loss": 0.0002, "reward": 1.517559552192688, "reward_std": 0.21927062328904867, "rewards/accuracy_reward": 0.5175595372915268, "rewards/format_reward": 1.0, "step": 65 }, { "advantage": 1.9777270182967186, "completion_length": 550.9339363098145, "epoch": 0.896, "grad_norm": 0.23573845624923706, "kl": 0.00623931884765625, "learning_rate": 9.564769404039419e-08, "loss": 0.0002, "reward": 1.5324405014514924, "reward_std": 0.22246364038437605, "rewards/accuracy_reward": 0.5324404895305633, "rewards/format_reward": 1.0, "step": 70 }, { "advantage": 1.8454515248537064, "completion_length": 566.992268371582, "epoch": 0.96, "grad_norm": 0.18028907477855682, "kl": 0.00513458251953125, "learning_rate": 1.357535734809795e-08, "loss": 0.0002, "reward": 1.515476217865944, "reward_std": 0.22508260188624263, "rewards/accuracy_reward": 0.5154761977493763, "rewards/format_reward": 1.0, "step": 75 }, { "advantage": 2.0446028262376785, "completion_length": 550.128978729248, "epoch": 0.9984, "kl": 0.005794525146484375, "reward": 1.536210338274638, "reward_std": 0.22843041829764843, "rewards/accuracy_reward": 0.5362103283405304, "rewards/format_reward": 1.0, "step": 78, "total_flos": 0.0, "train_loss": 0.0001987359993598567, "train_runtime": 7692.2558, "train_samples_per_second": 0.975, "train_steps_per_second": 0.01 } ], "logging_steps": 5, "max_steps": 78, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }