{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.986666666666667, "eval_steps": 500, "global_step": 168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17777777777777778, "grad_norm": 1.5870693922042847, "learning_rate": 4.957230266673969e-06, "logits/chosen": -2.5848517417907715, "logits/rejected": -2.592822313308716, "logps/chosen": -1.2300995588302612, "logps/rejected": -1.2750391960144043, "loss": 1.3004, "odds_ratio_loss": 0.7031511068344116, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.12300996482372284, "rewards/margins": 0.004493957385420799, "rewards/rejected": -0.1275039166212082, "sft_loss": 1.2300995588302612, "step": 10 }, { "epoch": 0.35555555555555557, "grad_norm": 2.5478012561798096, "learning_rate": 4.84529614391025e-06, "logits/chosen": -2.568402051925659, "logits/rejected": -2.5743260383605957, "logps/chosen": -1.1148786544799805, "logps/rejected": -1.2647678852081299, "loss": 1.1815, "odds_ratio_loss": 0.6664617657661438, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.11148785054683685, "rewards/margins": 0.014988938346505165, "rewards/rejected": -0.12647680938243866, "sft_loss": 1.1148786544799805, "step": 20 }, { "epoch": 0.5333333333333333, "grad_norm": 1.738682746887207, "learning_rate": 4.667009949002349e-06, "logits/chosen": -2.5901498794555664, "logits/rejected": -2.6051435470581055, "logps/chosen": -1.2092516422271729, "logps/rejected": -1.3586838245391846, "loss": 1.2768, "odds_ratio_loss": 0.6759374737739563, "rewards/accuracies": 0.4375, "rewards/chosen": -0.120925173163414, "rewards/margins": 0.01494320947676897, "rewards/rejected": -0.1358683854341507, "sft_loss": 1.2092516422271729, "step": 30 }, { "epoch": 0.7111111111111111, "grad_norm": 10.248797416687012, "learning_rate": 4.397288409237892e-06, "logits/chosen": -2.4943628311157227, "logits/rejected": -2.5201773643493652, "logps/chosen": -1.1567463874816895, "logps/rejected": -1.3316763639450073, "loss": 1.2213, "odds_ratio_loss": 0.6450860500335693, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.11567463725805283, "rewards/margins": 0.017492998391389847, "rewards/rejected": -0.13316763937473297, "sft_loss": 1.1567463874816895, "step": 40 }, { "epoch": 0.8888888888888888, "grad_norm": 6.022939682006836, "learning_rate": 4.097593272948218e-06, "logits/chosen": -2.6041746139526367, "logits/rejected": -2.6204562187194824, "logps/chosen": -1.2715274095535278, "logps/rejected": -1.6485040187835693, "loss": 1.3385, "odds_ratio_loss": 0.6699766516685486, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.12715274095535278, "rewards/margins": 0.03769766539335251, "rewards/rejected": -0.1648503988981247, "sft_loss": 1.2715274095535278, "step": 50 }, { "epoch": 1.0666666666666667, "grad_norm": 1.5987056493759155, "learning_rate": 3.712001321256744e-06, "logits/chosen": -2.6027374267578125, "logits/rejected": -2.627061128616333, "logps/chosen": -1.0649330615997314, "logps/rejected": -1.3192559480667114, "loss": 1.1266, "odds_ratio_loss": 0.616315484046936, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.10649331659078598, "rewards/margins": 0.025432268157601357, "rewards/rejected": -0.13192559778690338, "sft_loss": 1.0649330615997314, "step": 60 }, { "epoch": 1.2444444444444445, "grad_norm": 9.42609977722168, "learning_rate": 3.284100248421622e-06, "logits/chosen": -2.592658758163452, "logits/rejected": -2.612031936645508, "logps/chosen": -1.0701837539672852, "logps/rejected": -1.2456352710723877, "loss": 1.1365, "odds_ratio_loss": 0.6634659767150879, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10701838880777359, "rewards/margins": 0.017545154318213463, "rewards/rejected": -0.1245635375380516, "sft_loss": 1.0701837539672852, "step": 70 }, { "epoch": 1.4222222222222223, "grad_norm": 1.1247073411941528, "learning_rate": 2.8288274295187397e-06, "logits/chosen": -2.6690917015075684, "logits/rejected": -2.6884384155273438, "logps/chosen": -1.0463426113128662, "logps/rejected": -1.2134628295898438, "loss": 1.1088, "odds_ratio_loss": 0.6244794130325317, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10463427007198334, "rewards/margins": 0.016712015494704247, "rewards/rejected": -0.12134629487991333, "sft_loss": 1.0463426113128662, "step": 80 }, { "epoch": 1.6, "grad_norm": 8.079381942749023, "learning_rate": 2.3620757456122797e-06, "logits/chosen": -2.638796329498291, "logits/rejected": -2.6489083766937256, "logps/chosen": -1.0774247646331787, "logps/rejected": -1.465980887413025, "loss": 1.1411, "odds_ratio_loss": 0.6369243860244751, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10774248838424683, "rewards/margins": 0.03885561600327492, "rewards/rejected": -0.14659810066223145, "sft_loss": 1.0774247646331787, "step": 90 }, { "epoch": 1.7777777777777777, "grad_norm": 3.1488540172576904, "learning_rate": 1.9001387874698865e-06, "logits/chosen": -2.659306049346924, "logits/rejected": -2.67085599899292, "logps/chosen": -1.0682704448699951, "logps/rejected": -1.2112770080566406, "loss": 1.1326, "odds_ratio_loss": 0.6432015299797058, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10682705789804459, "rewards/margins": 0.01430064719170332, "rewards/rejected": -0.12112770229578018, "sft_loss": 1.0682704448699951, "step": 100 }, { "epoch": 1.9555555555555557, "grad_norm": 2.191756010055542, "learning_rate": 1.4591420711120425e-06, "logits/chosen": -2.685157299041748, "logits/rejected": -2.695525646209717, "logps/chosen": -1.0902485847473145, "logps/rejected": -1.2659653425216675, "loss": 1.1523, "odds_ratio_loss": 0.6206409335136414, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10902485996484756, "rewards/margins": 0.017571674659848213, "rewards/rejected": -0.12659654021263123, "sft_loss": 1.0902485847473145, "step": 110 }, { "epoch": 2.1333333333333333, "grad_norm": 1.3168927431106567, "learning_rate": 1.0544801205950327e-06, "logits/chosen": -2.6532204151153564, "logits/rejected": -2.6720328330993652, "logps/chosen": -1.0528920888900757, "logps/rejected": -1.2378777265548706, "loss": 1.1173, "odds_ratio_loss": 0.6442069411277771, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10528920590877533, "rewards/margins": 0.018498573452234268, "rewards/rejected": -0.1237877756357193, "sft_loss": 1.0528920888900757, "step": 120 }, { "epoch": 2.311111111111111, "grad_norm": 0.9034324288368225, "learning_rate": 7.002790686106056e-07, "logits/chosen": -2.6588597297668457, "logits/rejected": -2.6591033935546875, "logps/chosen": -0.9490475654602051, "logps/rejected": -1.0968741178512573, "loss": 1.0107, "odds_ratio_loss": 0.616373598575592, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09490474313497543, "rewards/margins": 0.01478266529738903, "rewards/rejected": -0.1096874251961708, "sft_loss": 0.9490475654602051, "step": 130 }, { "epoch": 2.488888888888889, "grad_norm": 1.7593573331832886, "learning_rate": 4.0890353469735717e-07, "logits/chosen": -2.686373472213745, "logits/rejected": -2.7087228298187256, "logps/chosen": -1.0427912473678589, "logps/rejected": -1.486315131187439, "loss": 1.1034, "odds_ratio_loss": 0.6059055328369141, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10427911579608917, "rewards/margins": 0.04435240477323532, "rewards/rejected": -0.1486315280199051, "sft_loss": 1.0427912473678589, "step": 140 }, { "epoch": 2.6666666666666665, "grad_norm": 5.990176200866699, "learning_rate": 1.905249951948071e-07, "logits/chosen": -2.715557813644409, "logits/rejected": -2.731482982635498, "logps/chosen": -1.144046425819397, "logps/rejected": -1.2883504629135132, "loss": 1.2074, "odds_ratio_loss": 0.6331667304039001, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.11440464109182358, "rewards/margins": 0.014430420473217964, "rewards/rejected": -0.1288350522518158, "sft_loss": 1.144046425819397, "step": 150 }, { "epoch": 2.8444444444444446, "grad_norm": 2.793104887008667, "learning_rate": 5.2766712488021566e-08, "logits/chosen": -2.695949077606201, "logits/rejected": -2.7109360694885254, "logps/chosen": -0.9868586659431458, "logps/rejected": -1.2109596729278564, "loss": 1.0456, "odds_ratio_loss": 0.5878169536590576, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0986858680844307, "rewards/margins": 0.022410111501812935, "rewards/rejected": -0.12109597772359848, "sft_loss": 0.9868586659431458, "step": 160 }, { "epoch": 2.986666666666667, "step": 168, "total_flos": 2.303993975490478e+17, "train_loss": 1.1586682626179285, "train_runtime": 5352.1206, "train_samples_per_second": 0.504, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 168, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.303993975490478e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }