{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9960474308300395, "eval_steps": 100, "global_step": 126, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.6983913280928786, "learning_rate": 3.846153846153846e-08, "logits/chosen": 0.07181362062692642, "logits/rejected": -0.5422722101211548, "logps/chosen": -667.6278076171875, "logps/rejected": -895.8421020507812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "grad_norm": 4.097480178742397, "learning_rate": 3.8461538461538463e-07, "logits/chosen": 0.04356582462787628, "logits/rejected": -0.7095460295677185, "logps/chosen": -583.3101196289062, "logps/rejected": -874.3485107421875, "loss": 0.693, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": 0.0002644425549078733, "rewards/margins": 0.0005916350055485964, "rewards/rejected": -0.000327192508848384, "step": 10 }, { "epoch": 0.16, "grad_norm": 4.056926958975919, "learning_rate": 4.952806974561517e-07, "logits/chosen": 0.13229021430015564, "logits/rejected": -0.5041828751564026, "logps/chosen": -534.6011962890625, "logps/rejected": -831.9937744140625, "loss": 0.691, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0032804287038743496, "rewards/margins": 0.0038477883208543062, "rewards/rejected": -0.0005673595005646348, "step": 20 }, { "epoch": 0.24, "grad_norm": 4.357718501847275, "learning_rate": 4.725936445085709e-07, "logits/chosen": -0.0280394796282053, "logits/rejected": -0.6336067914962769, "logps/chosen": -625.0661010742188, "logps/rejected": -936.4635620117188, "loss": 0.6823, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.012510508298873901, "rewards/margins": 0.018871963024139404, "rewards/rejected": -0.006361453328281641, "step": 30 }, { "epoch": 0.32, "grad_norm": 4.108185931224436, "learning_rate": 4.328120888946271e-07, "logits/chosen": 0.0761369839310646, "logits/rejected": -0.8145635724067688, "logps/chosen": -571.1219482421875, "logps/rejected": -935.4173583984375, "loss": 0.6735, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.02279135212302208, "rewards/margins": 0.04717525467276573, "rewards/rejected": -0.024383898824453354, "step": 40 }, { "epoch": 0.4, "grad_norm": 3.8656836211850885, "learning_rate": 3.789911309071252e-07, "logits/chosen": 0.0432172492146492, "logits/rejected": -0.8026809692382812, "logps/chosen": -599.9905395507812, "logps/rejected": -956.2135620117188, "loss": 0.6511, "rewards/accuracies": 0.875, "rewards/chosen": 0.03818320482969284, "rewards/margins": 0.0915202647447586, "rewards/rejected": -0.053337059915065765, "step": 50 }, { "epoch": 0.47, "grad_norm": 3.683371847061466, "learning_rate": 3.152640534699994e-07, "logits/chosen": 0.11929800361394882, "logits/rejected": -0.7896026372909546, "logps/chosen": -527.4035034179688, "logps/rejected": -991.9866943359375, "loss": 0.6306, "rewards/accuracies": 0.84375, "rewards/chosen": 0.054078031331300735, "rewards/margins": 0.11341840028762817, "rewards/rejected": -0.05934036895632744, "step": 60 }, { "epoch": 0.55, "grad_norm": 3.8302089344745407, "learning_rate": 2.4652489880792125e-07, "logits/chosen": 0.16239780187606812, "logits/rejected": -0.5915371775627136, "logps/chosen": -517.858154296875, "logps/rejected": -948.3834228515625, "loss": 0.6278, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.07395206391811371, "rewards/margins": 0.22345471382141113, "rewards/rejected": -0.14950266480445862, "step": 70 }, { "epoch": 0.63, "grad_norm": 4.211099057428358, "learning_rate": 1.780526211572016e-07, "logits/chosen": 0.13101445138454437, "logits/rejected": -0.729425847530365, "logps/chosen": -607.0712280273438, "logps/rejected": -908.4261474609375, "loss": 0.6168, "rewards/accuracies": 0.875, "rewards/chosen": 0.05716314911842346, "rewards/margins": 0.34125471115112305, "rewards/rejected": -0.2840915620326996, "step": 80 }, { "epoch": 0.71, "grad_norm": 3.8906406186709197, "learning_rate": 1.1510567942602889e-07, "logits/chosen": 0.047411851584911346, "logits/rejected": -0.6215206384658813, "logps/chosen": -574.5473022460938, "logps/rejected": -883.5465087890625, "loss": 0.6002, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.07054788619279861, "rewards/margins": 0.363060861825943, "rewards/rejected": -0.2925129532814026, "step": 90 }, { "epoch": 0.79, "grad_norm": 4.2027425160958565, "learning_rate": 6.251820383244468e-08, "logits/chosen": 8.583068620282575e-07, "logits/rejected": -0.6966060400009155, "logps/chosen": -600.7443237304688, "logps/rejected": -899.9113159179688, "loss": 0.5979, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.04147876054048538, "rewards/margins": 0.5237125754356384, "rewards/rejected": -0.48223385214805603, "step": 100 }, { "epoch": 0.79, "eval_logits/chosen": 0.4742414057254791, "eval_logits/rejected": -0.5929847359657288, "eval_logps/chosen": -408.6627197265625, "eval_logps/rejected": -1010.139892578125, "eval_loss": 0.6596941947937012, "eval_rewards/accuracies": 0.804411768913269, "eval_rewards/chosen": 0.04729553684592247, "eval_rewards/margins": 0.07419417053461075, "eval_rewards/rejected": -0.02689863182604313, "eval_runtime": 302.2106, "eval_samples_per_second": 8.974, "eval_steps_per_second": 0.281, "step": 100 }, { "epoch": 0.87, "grad_norm": 3.9848703965373873, "learning_rate": 2.4328749671846117e-08, "logits/chosen": 0.0030116259586066008, "logits/rejected": -0.7621723413467407, "logps/chosen": -610.0753784179688, "logps/rejected": -956.7000122070312, "loss": 0.6101, "rewards/accuracies": 0.875, "rewards/chosen": 0.05626688152551651, "rewards/margins": 0.29308319091796875, "rewards/rejected": -0.23681628704071045, "step": 110 }, { "epoch": 0.95, "grad_norm": 4.008522467528123, "learning_rate": 3.4701487751534475e-09, "logits/chosen": 0.1057804599404335, "logits/rejected": -0.7350226640701294, "logps/chosen": -561.1431274414062, "logps/rejected": -926.4228515625, "loss": 0.599, "rewards/accuracies": 0.90625, "rewards/chosen": 0.06413096934556961, "rewards/margins": 0.4213576316833496, "rewards/rejected": -0.3572266697883606, "step": 120 }, { "epoch": 1.0, "step": 126, "total_flos": 0.0, "train_loss": 0.6376642821327089, "train_runtime": 2213.5067, "train_samples_per_second": 3.653, "train_steps_per_second": 0.057 } ], "logging_steps": 10, "max_steps": 126, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }