{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99581589958159, "eval_steps": 500, "global_step": 119, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 4.166666666666666e-08, "logits/chosen": -2.7076048851013184, "logits/rejected": -2.5675482749938965, "logps/chosen": -287.144287109375, "logps/pi_response": -67.09939575195312, "logps/ref_response": -67.09939575195312, "logps/rejected": -200.97291564941406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.7251973152160645, "logits/rejected": -2.6897215843200684, "logps/chosen": -235.24375915527344, "logps/pi_response": -75.80421447753906, "logps/ref_response": -75.67623901367188, "logps/rejected": -202.0631866455078, "loss": 0.6923, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 0.0013986360281705856, "rewards/margins": 0.001505931606516242, "rewards/rejected": -0.0001072955783456564, "step": 10 }, { "epoch": 0.17, "learning_rate": 4.931352528237397e-07, "logits/chosen": -2.745739459991455, "logits/rejected": -2.7031664848327637, "logps/chosen": -247.6968231201172, "logps/pi_response": -77.08773803710938, "logps/ref_response": -72.46954345703125, "logps/rejected": -176.19552612304688, "loss": 0.674, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.020640889182686806, "rewards/margins": 0.0381900891661644, "rewards/rejected": -0.01754920184612274, "step": 20 }, { "epoch": 0.25, "learning_rate": 4.658920803689553e-07, "logits/chosen": -2.660264253616333, "logits/rejected": -2.6261754035949707, "logps/chosen": -234.98886108398438, "logps/pi_response": -99.96369934082031, "logps/ref_response": -74.0744400024414, "logps/rejected": -171.56405639648438, "loss": 0.636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07018943876028061, "rewards/margins": 0.134169340133667, "rewards/rejected": -0.2043587863445282, "step": 30 }, { "epoch": 0.33, "learning_rate": 4.201712553872657e-07, "logits/chosen": -2.635835886001587, "logits/rejected": -2.5933120250701904, "logps/chosen": -263.5890197753906, "logps/pi_response": -136.6029052734375, "logps/ref_response": -74.10218048095703, "logps/rejected": -225.02127075195312, "loss": 0.5955, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2223609983921051, "rewards/margins": 0.23141320049762726, "rewards/rejected": -0.45377421379089355, "step": 40 }, { "epoch": 0.42, "learning_rate": 3.598859066780754e-07, "logits/chosen": -2.6245293617248535, "logits/rejected": -2.589780330657959, "logps/chosen": -307.409912109375, "logps/pi_response": -178.45059204101562, "logps/ref_response": -79.56159973144531, "logps/rejected": -244.45114135742188, "loss": 0.5611, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.32991307973861694, "rewards/margins": 0.45631542801856995, "rewards/rejected": -0.7862285375595093, "step": 50 }, { "epoch": 0.5, "learning_rate": 2.9019570347986706e-07, "logits/chosen": -2.598756790161133, "logits/rejected": -2.57039213180542, "logps/chosen": -301.62554931640625, "logps/pi_response": -193.3199005126953, "logps/ref_response": -82.40287780761719, "logps/rejected": -272.43170166015625, "loss": 0.531, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.4208662509918213, "rewards/margins": 0.6576918363571167, "rewards/rejected": -1.0785582065582275, "step": 60 }, { "epoch": 0.59, "learning_rate": 2.1706525253979534e-07, "logits/chosen": -2.6376967430114746, "logits/rejected": -2.598003387451172, "logps/chosen": -294.4837951660156, "logps/pi_response": -184.1026153564453, "logps/ref_response": -68.73023986816406, "logps/rejected": -269.95458984375, "loss": 0.5174, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5717201232910156, "rewards/margins": 0.47639116644859314, "rewards/rejected": -1.0481112003326416, "step": 70 }, { "epoch": 0.67, "learning_rate": 1.4675360263490295e-07, "logits/chosen": -2.596078395843506, "logits/rejected": -2.5703680515289307, "logps/chosen": -276.83074951171875, "logps/pi_response": -204.7767791748047, "logps/ref_response": -68.83003997802734, "logps/rejected": -300.37249755859375, "loss": 0.4957, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7744797468185425, "rewards/margins": 0.5705304145812988, "rewards/rejected": -1.3450102806091309, "step": 80 }, { "epoch": 0.75, "learning_rate": 8.527854855097224e-08, "logits/chosen": -2.6634445190429688, "logits/rejected": -2.6276421546936035, "logps/chosen": -322.80712890625, "logps/pi_response": -227.09469604492188, "logps/ref_response": -69.44734191894531, "logps/rejected": -320.1347961425781, "loss": 0.4846, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8887150883674622, "rewards/margins": 0.6986933946609497, "rewards/rejected": -1.587408423423767, "step": 90 }, { "epoch": 0.84, "learning_rate": 3.790158337517127e-08, "logits/chosen": -2.5817208290100098, "logits/rejected": -2.548147678375244, "logps/chosen": -379.9378356933594, "logps/pi_response": -262.61004638671875, "logps/ref_response": -70.62646484375, "logps/rejected": -357.0428771972656, "loss": 0.4681, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1695524454116821, "rewards/margins": 0.6535626649856567, "rewards/rejected": -1.8231151103973389, "step": 100 }, { "epoch": 0.92, "learning_rate": 8.677580722139671e-09, "logits/chosen": -2.6654255390167236, "logits/rejected": -2.5850634574890137, "logps/chosen": -381.2176818847656, "logps/pi_response": -263.3302307128906, "logps/ref_response": -80.19813537597656, "logps/rejected": -368.27734375, "loss": 0.4833, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.045925498008728, "rewards/margins": 0.8829472661018372, "rewards/rejected": -1.92887282371521, "step": 110 }, { "epoch": 1.0, "step": 119, "total_flos": 0.0, "train_loss": 0.5490188137823794, "train_runtime": 3582.4753, "train_samples_per_second": 4.266, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 119, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }