{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0064, "grad_norm": 1341.8496030875679, "learning_rate": 6.25e-10, "logits/chosen": -3.9499800205230713, "logits/rejected": -4.237819194793701, "logps/chosen": -300.693115234375, "logps/rejected": -249.96307373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.064, "grad_norm": 1351.1067467304115, "learning_rate": 6.25e-09, "logits/chosen": -4.128900527954102, "logits/rejected": -4.351526260375977, "logps/chosen": -351.4300537109375, "logps/rejected": -308.8679504394531, "loss": 0.7229, "rewards/accuracies": 0.4340277910232544, "rewards/chosen": -0.0018261770019307733, "rewards/margins": -0.04775632172822952, "rewards/rejected": 0.04593014344573021, "step": 10 }, { "epoch": 0.128, "grad_norm": 1408.8095936894558, "learning_rate": 9.979871469976195e-09, "logits/chosen": -4.194854736328125, "logits/rejected": -4.3817548751831055, "logps/chosen": -335.3293762207031, "logps/rejected": -294.04248046875, "loss": 0.7269, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.01136251911520958, "rewards/margins": 0.004810346756130457, "rewards/rejected": 0.0065521723590791225, "step": 20 }, { "epoch": 0.192, "grad_norm": 1432.0458755805519, "learning_rate": 9.755282581475768e-09, "logits/chosen": -4.23565149307251, "logits/rejected": -4.369490623474121, "logps/chosen": -329.5267028808594, "logps/rejected": -296.1650390625, "loss": 0.7136, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 0.0647984966635704, "rewards/margins": 0.020466070622205734, "rewards/rejected": 0.04433242976665497, "step": 30 }, { "epoch": 0.256, "grad_norm": 1362.637677953038, "learning_rate": 9.29224396800933e-09, "logits/chosen": -4.142593860626221, "logits/rejected": -4.344474792480469, "logps/chosen": -333.652587890625, "logps/rejected": -289.78851318359375, "loss": 0.691, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.16199079155921936, "rewards/margins": 0.10949220508337021, "rewards/rejected": 0.052498579025268555, "step": 40 }, { "epoch": 0.32, "grad_norm": 1293.8956896680802, "learning_rate": 8.613974319136958e-09, "logits/chosen": -4.226416110992432, "logits/rejected": -4.406065940856934, "logps/chosen": -334.3558044433594, "logps/rejected": -293.1966552734375, "loss": 0.6734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24848651885986328, "rewards/margins": 0.16572698950767517, "rewards/rejected": 0.08275953680276871, "step": 50 }, { "epoch": 0.384, "grad_norm": 1213.937252280571, "learning_rate": 7.754484907260514e-09, "logits/chosen": -4.241747856140137, "logits/rejected": -4.412692546844482, "logps/chosen": -326.20147705078125, "logps/rejected": -293.2193908691406, "loss": 0.6501, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.28125494718551636, "rewards/margins": 0.12699946761131287, "rewards/rejected": 0.1542554497718811, "step": 60 }, { "epoch": 0.448, "grad_norm": 1168.8702151248158, "learning_rate": 6.756874120406714e-09, "logits/chosen": -4.1678466796875, "logits/rejected": -4.357397556304932, "logps/chosen": -326.0350036621094, "logps/rejected": -290.5421447753906, "loss": 0.6267, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.4029604494571686, "rewards/margins": 0.24430949985980988, "rewards/rejected": 0.1586509495973587, "step": 70 }, { "epoch": 0.512, "grad_norm": 1195.264190588224, "learning_rate": 5.671166329088278e-09, "logits/chosen": -4.038235187530518, "logits/rejected": -4.326010227203369, "logps/chosen": -352.18646240234375, "logps/rejected": -309.32562255859375, "loss": 0.6092, "rewards/accuracies": 0.690625011920929, "rewards/chosen": 0.5486255288124084, "rewards/margins": 0.3041314482688904, "rewards/rejected": 0.24449411034584045, "step": 80 }, { "epoch": 0.576, "grad_norm": 1097.5673117468077, "learning_rate": 4.551803455482833e-09, "logits/chosen": -4.168010711669922, "logits/rejected": -4.375750541687012, "logps/chosen": -338.2205505371094, "logps/rejected": -296.5308532714844, "loss": 0.59, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5563652515411377, "rewards/margins": 0.29324790835380554, "rewards/rejected": 0.263117253780365, "step": 90 }, { "epoch": 0.64, "grad_norm": 1066.1810496477938, "learning_rate": 3.4549150281252633e-09, "logits/chosen": -4.156978130340576, "logits/rejected": -4.374584197998047, "logps/chosen": -335.9981384277344, "logps/rejected": -287.0412902832031, "loss": 0.5812, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 0.6475387811660767, "rewards/margins": 0.36960989236831665, "rewards/rejected": 0.2779288589954376, "step": 100 }, { "epoch": 0.704, "grad_norm": 1155.1395500395697, "learning_rate": 2.43550361297047e-09, "logits/chosen": -4.1374359130859375, "logits/rejected": -4.378481864929199, "logps/chosen": -317.46600341796875, "logps/rejected": -277.5682067871094, "loss": 0.5759, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7310987710952759, "rewards/margins": 0.3804031014442444, "rewards/rejected": 0.3506956100463867, "step": 110 }, { "epoch": 0.768, "grad_norm": 1066.5080189058133, "learning_rate": 1.5446867550656768e-09, "logits/chosen": -4.136859893798828, "logits/rejected": -4.3448615074157715, "logps/chosen": -331.464111328125, "logps/rejected": -281.9703674316406, "loss": 0.5683, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.7297540903091431, "rewards/margins": 0.37383073568344116, "rewards/rejected": 0.35592326521873474, "step": 120 }, { "epoch": 0.832, "grad_norm": 1131.6322549220279, "learning_rate": 8.271337313934869e-10, "logits/chosen": -4.222386360168457, "logits/rejected": -4.382724761962891, "logps/chosen": -336.8995666503906, "logps/rejected": -288.167236328125, "loss": 0.5682, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": 0.7898508310317993, "rewards/margins": 0.4281063973903656, "rewards/rejected": 0.3617444634437561, "step": 130 }, { "epoch": 0.896, "grad_norm": 1132.1867619059146, "learning_rate": 3.18825646801314e-10, "logits/chosen": -4.176682472229004, "logits/rejected": -4.3904242515563965, "logps/chosen": -338.28924560546875, "logps/rejected": -304.8387451171875, "loss": 0.5706, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6995974183082581, "rewards/margins": 0.34457093477249146, "rewards/rejected": 0.3550264835357666, "step": 140 }, { "epoch": 0.96, "grad_norm": 1203.6386117758473, "learning_rate": 4.52511911603265e-11, "logits/chosen": -4.113102912902832, "logits/rejected": -4.341179370880127, "logps/chosen": -344.94573974609375, "logps/rejected": -296.61328125, "loss": 0.5703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7801700830459595, "rewards/margins": 0.40149813890457153, "rewards/rejected": 0.37867194414138794, "step": 150 }, { "epoch": 0.9984, "step": 156, "total_flos": 0.0, "train_loss": 0.6263951460520426, "train_runtime": 5142.9133, "train_samples_per_second": 7.766, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }