{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968652037617555, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3.125e-08, "logits/chosen": -2.072941541671753, "logits/rejected": -2.0026817321777344, "logps/chosen": -474.7008361816406, "logps/pi_response": -295.3243408203125, "logps/ref_response": -295.3243408203125, "logps/rejected": -399.129638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.0674960613250732, "logits/rejected": -2.0091359615325928, "logps/chosen": -293.94415283203125, "logps/pi_response": -184.32736206054688, "logps/ref_response": -184.39166259765625, "logps/rejected": -331.1109924316406, "loss": 0.6873, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": -0.005304105579853058, "rewards/margins": 0.014647711999714375, "rewards/rejected": -0.019951816648244858, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-07, "logits/chosen": -1.976543664932251, "logits/rejected": -1.9456901550292969, "logps/chosen": -287.80316162109375, "logps/pi_response": -200.89390563964844, "logps/ref_response": -194.6990509033203, "logps/rejected": -415.5035095214844, "loss": 0.6721, "rewards/accuracies": 0.59375, "rewards/chosen": -0.28011927008628845, "rewards/margins": 0.24019071459770203, "rewards/rejected": -0.5203099846839905, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.882681251368548e-07, "logits/chosen": -1.948282241821289, "logits/rejected": -1.9048038721084595, "logps/chosen": -322.1227111816406, "logps/pi_response": -202.88558959960938, "logps/ref_response": -198.0297088623047, "logps/rejected": -393.98382568359375, "loss": 0.6566, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.31841763854026794, "rewards/margins": 0.2520221173763275, "rewards/rejected": -0.5704396963119507, "step": 30 }, { "epoch": 0.25, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -1.99917471408844, "logits/rejected": -1.9424034357070923, "logps/chosen": -306.33917236328125, "logps/pi_response": -221.41934204101562, "logps/ref_response": -196.04620361328125, "logps/rejected": -473.10430908203125, "loss": 0.6508, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4164486825466156, "rewards/margins": 0.34302154183387756, "rewards/rejected": -0.7594702243804932, "step": 40 }, { "epoch": 0.31, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -2.022153615951538, "logits/rejected": -1.8780553340911865, "logps/chosen": -300.8150634765625, "logps/pi_response": -213.9830322265625, "logps/ref_response": -212.6190185546875, "logps/rejected": -455.1544494628906, "loss": 0.6199, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07085980474948883, "rewards/margins": 0.4123227000236511, "rewards/rejected": -0.48318248987197876, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-07, "logits/chosen": -1.623453140258789, "logits/rejected": -1.5108482837677002, "logps/chosen": -327.77496337890625, "logps/pi_response": -236.1122589111328, "logps/ref_response": -207.8074493408203, "logps/rejected": -478.13677978515625, "loss": 0.6015, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4635746479034424, "rewards/margins": 0.47717157006263733, "rewards/rejected": -0.9407461285591125, "step": 60 }, { "epoch": 0.44, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -1.3422645330429077, "logits/rejected": -1.035585641860962, "logps/chosen": -311.18182373046875, "logps/pi_response": -238.19351196289062, "logps/ref_response": -195.33450317382812, "logps/rejected": -481.33416748046875, "loss": 0.5778, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.45635801553726196, "rewards/margins": 0.5854658484458923, "rewards/rejected": -1.0418239831924438, "step": 70 }, { "epoch": 0.5, "learning_rate": 2.910060778827554e-07, "logits/chosen": -1.1403189897537231, "logits/rejected": -0.7611511945724487, "logps/chosen": -358.22308349609375, "logps/pi_response": -249.77072143554688, "logps/ref_response": -202.05001831054688, "logps/rejected": -496.70916748046875, "loss": 0.5643, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.564284086227417, "rewards/margins": 0.598731517791748, "rewards/rejected": -1.163015604019165, "step": 80 }, { "epoch": 0.56, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -0.8958919644355774, "logits/rejected": -0.4801406264305115, "logps/chosen": -365.9770202636719, "logps/pi_response": -254.03115844726562, "logps/ref_response": -202.2326202392578, "logps/rejected": -469.5523986816406, "loss": 0.582, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6682159900665283, "rewards/margins": 0.44698458909988403, "rewards/rejected": -1.1152006387710571, "step": 90 }, { "epoch": 0.63, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -0.8831208944320679, "logits/rejected": -0.5018015503883362, "logps/chosen": -358.91448974609375, "logps/pi_response": -286.5412292480469, "logps/ref_response": -215.90811157226562, "logps/rejected": -526.8685302734375, "loss": 0.5697, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6926028728485107, "rewards/margins": 0.6388980150222778, "rewards/rejected": -1.331500768661499, "step": 100 }, { "epoch": 0.69, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -0.8882027864456177, "logits/rejected": -0.3719862103462219, "logps/chosen": -382.29156494140625, "logps/pi_response": -279.39959716796875, "logps/ref_response": -194.23623657226562, "logps/rejected": -500.37554931640625, "loss": 0.5822, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8814951181411743, "rewards/margins": 0.4973115026950836, "rewards/rejected": -1.3788065910339355, "step": 110 }, { "epoch": 0.75, "learning_rate": 8.628481651367875e-08, "logits/chosen": -0.7373358011245728, "logits/rejected": -0.3627234399318695, "logps/chosen": -384.5245666503906, "logps/pi_response": -285.951171875, "logps/ref_response": -195.06085205078125, "logps/rejected": -552.2709350585938, "loss": 0.5429, "rewards/accuracies": 0.6875, "rewards/chosen": -1.024389624595642, "rewards/margins": 0.5978730916976929, "rewards/rejected": -1.6222625970840454, "step": 120 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-08, "logits/chosen": -0.8074722290039062, "logits/rejected": -0.4555937349796295, "logps/chosen": -363.5116271972656, "logps/pi_response": -276.3110046386719, "logps/ref_response": -198.39134216308594, "logps/rejected": -539.1277465820312, "loss": 0.5798, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7972265481948853, "rewards/margins": 0.6627088785171509, "rewards/rejected": -1.4599354267120361, "step": 130 }, { "epoch": 0.88, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -0.7802283763885498, "logits/rejected": -0.35213881731033325, "logps/chosen": -357.5544128417969, "logps/pi_response": -271.56719970703125, "logps/ref_response": -197.29266357421875, "logps/rejected": -542.0772094726562, "loss": 0.5593, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7836874723434448, "rewards/margins": 0.6558796167373657, "rewards/rejected": -1.4395670890808105, "step": 140 }, { "epoch": 0.94, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -0.9855061769485474, "logits/rejected": -0.5764757990837097, "logps/chosen": -366.1041564941406, "logps/pi_response": -283.294921875, "logps/ref_response": -215.0994873046875, "logps/rejected": -516.2839965820312, "loss": 0.5699, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7308257818222046, "rewards/margins": 0.46794968843460083, "rewards/rejected": -1.1987755298614502, "step": 150 }, { "epoch": 1.0, "step": 159, "total_flos": 0.0, "train_loss": 0.5937920936248587, "train_runtime": 4171.3796, "train_samples_per_second": 4.885, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }