{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 12855, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 9.701138496398926, "learning_rate": 5.000000000000001e-07, "logits/chosen": -1.4137420654296875, "logits/rejected": -1.1609899997711182, "logps/chosen": -1.056043028831482, "logps/rejected": -0.9840900301933289, "loss": 1.1358, "odds_ratio_loss": 0.7978944778442383, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10560431331396103, "rewards/margins": -0.007195314858108759, "rewards/rejected": -0.09840899705886841, "sft_loss": 1.056043028831482, "step": 5 }, { "epoch": 0.0, "grad_norm": 14.184392929077148, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -1.4698854684829712, "logits/rejected": -0.8033113479614258, "logps/chosen": -1.0688271522521973, "logps/rejected": -0.8340684771537781, "loss": 1.1659, "odds_ratio_loss": 0.9709033966064453, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1068827286362648, "rewards/margins": -0.023475874215364456, "rewards/rejected": -0.08340685069561005, "sft_loss": 1.0688271522521973, "step": 10 }, { "epoch": 0.0, "grad_norm": 5.41991662979126, "learning_rate": 1.5e-06, "logits/chosen": -1.3370453119277954, "logits/rejected": -1.307146668434143, "logps/chosen": -1.2577979564666748, "logps/rejected": -0.7277308106422424, "loss": 1.3774, "odds_ratio_loss": 1.196258783340454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12577982246875763, "rewards/margins": -0.05300673842430115, "rewards/rejected": -0.07277307659387589, "sft_loss": 1.2577979564666748, "step": 15 }, { "epoch": 0.0, "grad_norm": 54.984066009521484, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -1.2405481338500977, "logits/rejected": -1.1191933155059814, "logps/chosen": -1.3065041303634644, "logps/rejected": -0.9139581918716431, "loss": 1.4111, "odds_ratio_loss": 1.045548915863037, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13065043091773987, "rewards/margins": -0.03925459831953049, "rewards/rejected": -0.09139582514762878, "sft_loss": 1.3065041303634644, "step": 20 }, { "epoch": 0.0, "grad_norm": 7.118112087249756, "learning_rate": 2.5e-06, "logits/chosen": -1.370082139968872, "logits/rejected": -1.216050386428833, "logps/chosen": -1.2248599529266357, "logps/rejected": -1.151605486869812, "loss": 1.3155, "odds_ratio_loss": 0.9067083597183228, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.12248601019382477, "rewards/margins": -0.007325439713895321, "rewards/rejected": -0.11516056209802628, "sft_loss": 1.2248599529266357, "step": 25 }, { "epoch": 0.0, "grad_norm": 25.97501564025879, "learning_rate": 3e-06, "logits/chosen": -1.396695613861084, "logits/rejected": -0.8267443776130676, "logps/chosen": -0.8272935748100281, "logps/rejected": -0.9551171064376831, "loss": 0.8852, "odds_ratio_loss": 0.5793362855911255, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08272935450077057, "rewards/margins": 0.012782363221049309, "rewards/rejected": -0.09551171958446503, "sft_loss": 0.8272935748100281, "step": 30 }, { "epoch": 0.0, "grad_norm": 6.940337657928467, "learning_rate": 3.5e-06, "logits/chosen": -1.3188809156417847, "logits/rejected": -0.8736783266067505, "logps/chosen": -1.076306939125061, "logps/rejected": -0.9470604658126831, "loss": 1.1587, "odds_ratio_loss": 0.8241630792617798, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10763069242238998, "rewards/margins": -0.012924641370773315, "rewards/rejected": -0.09470604360103607, "sft_loss": 1.076306939125061, "step": 35 }, { "epoch": 0.0, "grad_norm": 10.32648754119873, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.295627236366272, "logits/rejected": -0.9906957745552063, "logps/chosen": -1.1894115209579468, "logps/rejected": -0.960677444934845, "loss": 1.28, "odds_ratio_loss": 0.9056330919265747, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.11894116550683975, "rewards/margins": -0.02287341095507145, "rewards/rejected": -0.09606774151325226, "sft_loss": 1.1894115209579468, "step": 40 }, { "epoch": 0.0, "grad_norm": 5.0081634521484375, "learning_rate": 4.5e-06, "logits/chosen": -1.487616777420044, "logits/rejected": -1.2281392812728882, "logps/chosen": -0.6496211290359497, "logps/rejected": -1.5355957746505737, "loss": 0.6776, "odds_ratio_loss": 0.2798224091529846, "rewards/accuracies": 1.0, "rewards/chosen": -0.06496210396289825, "rewards/margins": 0.08859746903181076, "rewards/rejected": -0.1535595953464508, "sft_loss": 0.6496211290359497, "step": 45 }, { "epoch": 0.0, "grad_norm": 10.065803527832031, "learning_rate": 5e-06, "logits/chosen": -1.2928639650344849, "logits/rejected": -0.9908641576766968, "logps/chosen": -0.9380599856376648, "logps/rejected": -2.4311070442199707, "loss": 0.9678, "odds_ratio_loss": 0.2970461845397949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09380599856376648, "rewards/margins": 0.14930468797683716, "rewards/rejected": -0.24311068654060364, "sft_loss": 0.9380599856376648, "step": 50 }, { "epoch": 0.0, "grad_norm": 5.9541497230529785, "learning_rate": 5.500000000000001e-06, "logits/chosen": -1.368978500366211, "logits/rejected": -0.7404534220695496, "logps/chosen": -0.8750090599060059, "logps/rejected": -0.8971788287162781, "loss": 0.9421, "odds_ratio_loss": 0.6712835431098938, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0875009149312973, "rewards/margins": 0.002216977532953024, "rewards/rejected": -0.08971788734197617, "sft_loss": 0.8750090599060059, "step": 55 }, { "epoch": 0.0, "grad_norm": 5.816754341125488, "learning_rate": 6e-06, "logits/chosen": -1.284854531288147, "logits/rejected": -1.3418177366256714, "logps/chosen": -0.9037086367607117, "logps/rejected": -0.9571773409843445, "loss": 0.9751, "odds_ratio_loss": 0.713750422000885, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09037085622549057, "rewards/margins": 0.005346869118511677, "rewards/rejected": -0.09571772813796997, "sft_loss": 0.9037086367607117, "step": 60 }, { "epoch": 0.01, "grad_norm": 10.776530265808105, "learning_rate": 6.5000000000000004e-06, "logits/chosen": -1.157330870628357, "logits/rejected": -1.152259349822998, "logps/chosen": -1.0829627513885498, "logps/rejected": -1.1035131216049194, "loss": 1.1594, "odds_ratio_loss": 0.764147937297821, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10829626023769379, "rewards/margins": 0.002055042190477252, "rewards/rejected": -0.1103513091802597, "sft_loss": 1.0829627513885498, "step": 65 }, { "epoch": 0.01, "grad_norm": 12.743794441223145, "learning_rate": 7e-06, "logits/chosen": -1.3514772653579712, "logits/rejected": -1.089449405670166, "logps/chosen": -0.9904440641403198, "logps/rejected": -1.0389869213104248, "loss": 1.0554, "odds_ratio_loss": 0.6493497490882874, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09904440492391586, "rewards/margins": 0.004854282829910517, "rewards/rejected": -0.10389868915081024, "sft_loss": 0.9904440641403198, "step": 70 }, { "epoch": 0.01, "grad_norm": 7.272915840148926, "learning_rate": 7.500000000000001e-06, "logits/chosen": -1.3866747617721558, "logits/rejected": -0.6379045248031616, "logps/chosen": -1.008975625038147, "logps/rejected": -1.168935775756836, "loss": 1.0743, "odds_ratio_loss": 0.653215765953064, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10089756548404694, "rewards/margins": 0.015996018424630165, "rewards/rejected": -0.11689357459545135, "sft_loss": 1.008975625038147, "step": 75 }, { "epoch": 0.01, "grad_norm": 198.51878356933594, "learning_rate": 8.000000000000001e-06, "logits/chosen": -1.4508455991744995, "logits/rejected": -1.2784332036972046, "logps/chosen": -1.6414655447006226, "logps/rejected": -2.3989181518554688, "loss": 1.7165, "odds_ratio_loss": 0.7502911686897278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1641465425491333, "rewards/margins": 0.07574529200792313, "rewards/rejected": -0.23989181220531464, "sft_loss": 1.6414655447006226, "step": 80 }, { "epoch": 0.01, "grad_norm": 5.810291290283203, "learning_rate": 8.5e-06, "logits/chosen": -1.3303495645523071, "logits/rejected": -0.7018251419067383, "logps/chosen": -0.8492165803909302, "logps/rejected": -0.7184539437294006, "loss": 0.9336, "odds_ratio_loss": 0.8441473841667175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08492164313793182, "rewards/margins": -0.013076257891952991, "rewards/rejected": -0.0718453973531723, "sft_loss": 0.8492165803909302, "step": 85 }, { "epoch": 0.01, "grad_norm": 5.648757457733154, "learning_rate": 9e-06, "logits/chosen": -1.4343656301498413, "logits/rejected": -1.3420077562332153, "logps/chosen": -1.1983975172042847, "logps/rejected": -2.7446682453155518, "loss": 1.2826, "odds_ratio_loss": 0.8421158790588379, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.11983975023031235, "rewards/margins": 0.15462705492973328, "rewards/rejected": -0.2744668126106262, "sft_loss": 1.1983975172042847, "step": 90 }, { "epoch": 0.01, "grad_norm": 9.33594036102295, "learning_rate": 9.5e-06, "logits/chosen": -1.4645180702209473, "logits/rejected": -1.1062164306640625, "logps/chosen": -0.8589698672294617, "logps/rejected": -0.9380282163619995, "loss": 0.9372, "odds_ratio_loss": 0.7820344567298889, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08589698374271393, "rewards/margins": 0.007905842736363411, "rewards/rejected": -0.09380282461643219, "sft_loss": 0.8589698672294617, "step": 95 }, { "epoch": 0.01, "grad_norm": 6.393992900848389, "learning_rate": 1e-05, "logits/chosen": -1.0969440937042236, "logits/rejected": -1.246144413948059, "logps/chosen": -1.0969411134719849, "logps/rejected": -0.7135934829711914, "loss": 1.2099, "odds_ratio_loss": 1.1294300556182861, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10969410836696625, "rewards/margins": -0.038334764540195465, "rewards/rejected": -0.07135935127735138, "sft_loss": 1.0969411134719849, "step": 100 }, { "epoch": 0.01, "grad_norm": 13.763640403747559, "learning_rate": 9.999996208432589e-06, "logits/chosen": -1.2595760822296143, "logits/rejected": -1.132385492324829, "logps/chosen": -1.1437269449234009, "logps/rejected": -1.4012759923934937, "loss": 1.2214, "odds_ratio_loss": 0.7770463824272156, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11437269300222397, "rewards/margins": 0.025754917412996292, "rewards/rejected": -0.14012759923934937, "sft_loss": 1.1437269449234009, "step": 105 }, { "epoch": 0.01, "grad_norm": 4.661446571350098, "learning_rate": 9.999984833736102e-06, "logits/chosen": -1.322127342224121, "logits/rejected": -0.9959952235221863, "logps/chosen": -1.2580044269561768, "logps/rejected": -0.9094411730766296, "loss": 1.362, "odds_ratio_loss": 1.0395630598068237, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.12580044567584991, "rewards/margins": -0.03485632687807083, "rewards/rejected": -0.09094411134719849, "sft_loss": 1.2580044269561768, "step": 110 }, { "epoch": 0.01, "grad_norm": 43.42310333251953, "learning_rate": 9.999965875927792e-06, "logits/chosen": -1.4291982650756836, "logits/rejected": -1.1815580129623413, "logps/chosen": -0.7048165798187256, "logps/rejected": -1.222245216369629, "loss": 0.7449, "odds_ratio_loss": 0.4009336829185486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07048166543245316, "rewards/margins": 0.05174286291003227, "rewards/rejected": -0.12222452461719513, "sft_loss": 0.7048165798187256, "step": 115 }, { "epoch": 0.01, "grad_norm": 8.35852336883545, "learning_rate": 9.99993933503641e-06, "logits/chosen": -1.299971342086792, "logits/rejected": -0.8148666620254517, "logps/chosen": -0.9109383821487427, "logps/rejected": -1.1609586477279663, "loss": 0.9817, "odds_ratio_loss": 0.7075847387313843, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09109384566545486, "rewards/margins": 0.025002023205161095, "rewards/rejected": -0.11609586328268051, "sft_loss": 0.9109383821487427, "step": 120 }, { "epoch": 0.01, "grad_norm": 8.823124885559082, "learning_rate": 9.99990521110221e-06, "logits/chosen": -1.3256946802139282, "logits/rejected": -0.8278160095214844, "logps/chosen": -0.9099394679069519, "logps/rejected": -1.1701653003692627, "loss": 0.9708, "odds_ratio_loss": 0.6086278557777405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0909939557313919, "rewards/margins": 0.02602258324623108, "rewards/rejected": -0.11701653897762299, "sft_loss": 0.9099394679069519, "step": 125 }, { "epoch": 0.01, "grad_norm": 6.742435455322266, "learning_rate": 9.999863504176946e-06, "logits/chosen": -1.3721487522125244, "logits/rejected": -0.9425897598266602, "logps/chosen": -1.0938831567764282, "logps/rejected": -1.0658223628997803, "loss": 1.1687, "odds_ratio_loss": 0.7483164072036743, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1093883141875267, "rewards/margins": -0.0028060779441148043, "rewards/rejected": -0.10658223927021027, "sft_loss": 1.0938831567764282, "step": 130 }, { "epoch": 0.01, "grad_norm": 22.933637619018555, "learning_rate": 9.999814214323868e-06, "logits/chosen": -1.4473979473114014, "logits/rejected": -1.0455162525177002, "logps/chosen": -0.8949346542358398, "logps/rejected": -1.8385162353515625, "loss": 0.9424, "odds_ratio_loss": 0.47495514154434204, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08949346840381622, "rewards/margins": 0.0943581610918045, "rewards/rejected": -0.18385162949562073, "sft_loss": 0.8949346542358398, "step": 135 }, { "epoch": 0.01, "grad_norm": 24.416580200195312, "learning_rate": 9.999757341617735e-06, "logits/chosen": -1.2248870134353638, "logits/rejected": -1.105491280555725, "logps/chosen": -1.0170341730117798, "logps/rejected": -1.9063619375228882, "loss": 1.0536, "odds_ratio_loss": 0.36606836318969727, "rewards/accuracies": 1.0, "rewards/chosen": -0.10170342028141022, "rewards/margins": 0.08893279731273651, "rewards/rejected": -0.19063621759414673, "sft_loss": 1.0170341730117798, "step": 140 }, { "epoch": 0.01, "grad_norm": 14.371286392211914, "learning_rate": 9.9996928861448e-06, "logits/chosen": -1.3428990840911865, "logits/rejected": -1.150731086730957, "logps/chosen": -0.9745044708251953, "logps/rejected": -1.088010549545288, "loss": 1.0341, "odds_ratio_loss": 0.5962523818016052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09745045006275177, "rewards/margins": 0.011350591666996479, "rewards/rejected": -0.10880105197429657, "sft_loss": 0.9745044708251953, "step": 145 }, { "epoch": 0.01, "grad_norm": 10.886384963989258, "learning_rate": 9.999620848002815e-06, "logits/chosen": -1.401794672012329, "logits/rejected": -0.9987030029296875, "logps/chosen": -1.0617334842681885, "logps/rejected": -1.506756067276001, "loss": 1.1242, "odds_ratio_loss": 0.624306857585907, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10617335140705109, "rewards/margins": 0.04450225830078125, "rewards/rejected": -0.15067560970783234, "sft_loss": 1.0617334842681885, "step": 150 }, { "epoch": 0.01, "grad_norm": 7.1690449714660645, "learning_rate": 9.99954122730104e-06, "logits/chosen": -1.4592490196228027, "logits/rejected": -1.0813499689102173, "logps/chosen": -1.055484414100647, "logps/rejected": -0.9076594114303589, "loss": 1.143, "odds_ratio_loss": 0.875472366809845, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1055484414100647, "rewards/margins": -0.014782501384615898, "rewards/rejected": -0.09076593816280365, "sft_loss": 1.055484414100647, "step": 155 }, { "epoch": 0.01, "grad_norm": 6.417614936828613, "learning_rate": 9.999454024160225e-06, "logits/chosen": -1.1997926235198975, "logits/rejected": -0.984194278717041, "logps/chosen": -1.0787403583526611, "logps/rejected": -0.9788461923599243, "loss": 1.1611, "odds_ratio_loss": 0.8234724998474121, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10787403583526611, "rewards/margins": -0.009989425539970398, "rewards/rejected": -0.09788461774587631, "sft_loss": 1.0787403583526611, "step": 160 }, { "epoch": 0.01, "grad_norm": 5.122522830963135, "learning_rate": 9.999359238712628e-06, "logits/chosen": -1.332296371459961, "logits/rejected": -0.7916947603225708, "logps/chosen": -0.8795109987258911, "logps/rejected": -0.7710585594177246, "loss": 0.9758, "odds_ratio_loss": 0.9632610082626343, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08795110136270523, "rewards/margins": -0.01084524393081665, "rewards/rejected": -0.07710584998130798, "sft_loss": 0.8795109987258911, "step": 165 }, { "epoch": 0.01, "grad_norm": 19.687545776367188, "learning_rate": 9.999256871102002e-06, "logits/chosen": -1.3000966310501099, "logits/rejected": -1.3263441324234009, "logps/chosen": -1.4506076574325562, "logps/rejected": -1.1992508172988892, "loss": 1.5467, "odds_ratio_loss": 0.960999608039856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14506077766418457, "rewards/margins": -0.025135690346360207, "rewards/rejected": -0.11992508172988892, "sft_loss": 1.4506076574325562, "step": 170 }, { "epoch": 0.01, "grad_norm": 7.098282337188721, "learning_rate": 9.9991469214836e-06, "logits/chosen": -1.183526873588562, "logits/rejected": -1.3057854175567627, "logps/chosen": -0.8797661066055298, "logps/rejected": -0.6698340177536011, "loss": 0.9943, "odds_ratio_loss": 1.1449306011199951, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.0879766121506691, "rewards/margins": -0.02099320851266384, "rewards/rejected": -0.0669834092259407, "sft_loss": 0.8797661066055298, "step": 175 }, { "epoch": 0.01, "grad_norm": 11.24793529510498, "learning_rate": 9.999029390024176e-06, "logits/chosen": -1.4583027362823486, "logits/rejected": -1.0522215366363525, "logps/chosen": -0.7333666086196899, "logps/rejected": -1.126899003982544, "loss": 0.7751, "odds_ratio_loss": 0.41780009865760803, "rewards/accuracies": 1.0, "rewards/chosen": -0.073336660861969, "rewards/margins": 0.03935323283076286, "rewards/rejected": -0.11268989741802216, "sft_loss": 0.7333666086196899, "step": 180 }, { "epoch": 0.01, "grad_norm": 61.70880889892578, "learning_rate": 9.99890427690198e-06, "logits/chosen": -1.3517203330993652, "logits/rejected": -1.0869548320770264, "logps/chosen": -0.9237698316574097, "logps/rejected": -1.4883028268814087, "loss": 0.9746, "odds_ratio_loss": 0.5079615116119385, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09237699955701828, "rewards/margins": 0.05645329877734184, "rewards/rejected": -0.14883029460906982, "sft_loss": 0.9237698316574097, "step": 185 }, { "epoch": 0.01, "grad_norm": 15.535794258117676, "learning_rate": 9.998771582306763e-06, "logits/chosen": -1.2531821727752686, "logits/rejected": -1.177253007888794, "logps/chosen": -1.2667360305786133, "logps/rejected": -1.0743447542190552, "loss": 1.3596, "odds_ratio_loss": 0.9285286068916321, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1266736090183258, "rewards/margins": -0.019239135086536407, "rewards/rejected": -0.1074344664812088, "sft_loss": 1.2667360305786133, "step": 190 }, { "epoch": 0.02, "grad_norm": 16.82183265686035, "learning_rate": 9.998631306439772e-06, "logits/chosen": -1.4082105159759521, "logits/rejected": -1.0227917432785034, "logps/chosen": -0.9651159048080444, "logps/rejected": -0.8935340046882629, "loss": 1.0417, "odds_ratio_loss": 0.76555335521698, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.0965115949511528, "rewards/margins": -0.007158198393881321, "rewards/rejected": -0.08935339748859406, "sft_loss": 0.9651159048080444, "step": 195 }, { "epoch": 0.02, "grad_norm": 6.718238353729248, "learning_rate": 9.998483449513756e-06, "logits/chosen": -1.2385953664779663, "logits/rejected": -0.7646237015724182, "logps/chosen": -1.0081157684326172, "logps/rejected": -1.0011669397354126, "loss": 1.0797, "odds_ratio_loss": 0.7158805131912231, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10081157833337784, "rewards/margins": -0.00069489108864218, "rewards/rejected": -0.10011669248342514, "sft_loss": 1.0081157684326172, "step": 200 }, { "epoch": 0.02, "grad_norm": 9.732516288757324, "learning_rate": 9.998328011752954e-06, "logits/chosen": -1.3042540550231934, "logits/rejected": -0.8548318147659302, "logps/chosen": -1.2158184051513672, "logps/rejected": -1.2620457410812378, "loss": 1.2815, "odds_ratio_loss": 0.6565437316894531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12158183753490448, "rewards/margins": 0.004622741136699915, "rewards/rejected": -0.12620458006858826, "sft_loss": 1.2158184051513672, "step": 205 }, { "epoch": 0.02, "grad_norm": 15.401986122131348, "learning_rate": 9.99816499339311e-06, "logits/chosen": -1.1956886053085327, "logits/rejected": -0.8465889692306519, "logps/chosen": -1.0776029825210571, "logps/rejected": -1.3274129629135132, "loss": 1.1322, "odds_ratio_loss": 0.545491099357605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10776031017303467, "rewards/margins": 0.024980993941426277, "rewards/rejected": -0.1327413022518158, "sft_loss": 1.0776029825210571, "step": 210 }, { "epoch": 0.02, "grad_norm": 10.379392623901367, "learning_rate": 9.997994394681463e-06, "logits/chosen": -1.2499234676361084, "logits/rejected": -1.2567858695983887, "logps/chosen": -0.9094167947769165, "logps/rejected": -1.306290864944458, "loss": 0.9605, "odds_ratio_loss": 0.5112447738647461, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09094168245792389, "rewards/margins": 0.039687395095825195, "rewards/rejected": -0.13062907755374908, "sft_loss": 0.9094167947769165, "step": 215 }, { "epoch": 0.02, "grad_norm": 11.4965181350708, "learning_rate": 9.997816215876746e-06, "logits/chosen": -1.376062035560608, "logits/rejected": -1.2260369062423706, "logps/chosen": -0.8196055293083191, "logps/rejected": -0.5650048851966858, "loss": 0.9326, "odds_ratio_loss": 1.1297613382339478, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.08196055144071579, "rewards/margins": -0.02546006441116333, "rewards/rejected": -0.05650048330426216, "sft_loss": 0.8196055293083191, "step": 220 }, { "epoch": 0.02, "grad_norm": 14.551139831542969, "learning_rate": 9.99763045724919e-06, "logits/chosen": -1.43378746509552, "logits/rejected": -0.9187615513801575, "logps/chosen": -1.6133413314819336, "logps/rejected": -2.874190330505371, "loss": 1.7014, "odds_ratio_loss": 0.8803858757019043, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.16133412718772888, "rewards/margins": 0.12608489394187927, "rewards/rejected": -0.28741902112960815, "sft_loss": 1.6133413314819336, "step": 225 }, { "epoch": 0.02, "grad_norm": 6.731103420257568, "learning_rate": 9.997437119080521e-06, "logits/chosen": -1.3006021976470947, "logits/rejected": -1.0998780727386475, "logps/chosen": -1.077310562133789, "logps/rejected": -1.445723295211792, "loss": 1.1458, "odds_ratio_loss": 0.6849013566970825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10773106664419174, "rewards/margins": 0.03684128075838089, "rewards/rejected": -0.14457234740257263, "sft_loss": 1.077310562133789, "step": 230 }, { "epoch": 0.02, "grad_norm": 34.18952941894531, "learning_rate": 9.997236201663962e-06, "logits/chosen": -1.480669617652893, "logits/rejected": -0.9708350300788879, "logps/chosen": -0.9847885370254517, "logps/rejected": -1.239406943321228, "loss": 1.049, "odds_ratio_loss": 0.6420648694038391, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09847886860370636, "rewards/margins": 0.025461841374635696, "rewards/rejected": -0.12394070625305176, "sft_loss": 0.9847885370254517, "step": 235 }, { "epoch": 0.02, "grad_norm": 7.8535075187683105, "learning_rate": 9.99702770530423e-06, "logits/chosen": -1.290428876876831, "logits/rejected": -1.2067062854766846, "logps/chosen": -1.3287948369979858, "logps/rejected": -0.8534753918647766, "loss": 1.4482, "odds_ratio_loss": 1.1943113803863525, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13287948071956635, "rewards/margins": -0.04753195121884346, "rewards/rejected": -0.08534753322601318, "sft_loss": 1.3287948369979858, "step": 240 }, { "epoch": 0.02, "grad_norm": 16.53376007080078, "learning_rate": 9.996811630317534e-06, "logits/chosen": -1.4881622791290283, "logits/rejected": -1.0307799577713013, "logps/chosen": -1.1290310621261597, "logps/rejected": -0.9766250848770142, "loss": 1.2165, "odds_ratio_loss": 0.8741899728775024, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11290310323238373, "rewards/margins": -0.015240591950714588, "rewards/rejected": -0.09766252338886261, "sft_loss": 1.1290310621261597, "step": 245 }, { "epoch": 0.02, "grad_norm": 9.521791458129883, "learning_rate": 9.996587977031583e-06, "logits/chosen": -1.3156846761703491, "logits/rejected": -0.6501539945602417, "logps/chosen": -1.225023627281189, "logps/rejected": -1.778752088546753, "loss": 1.285, "odds_ratio_loss": 0.5998628735542297, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12250236421823502, "rewards/margins": 0.05537285655736923, "rewards/rejected": -0.17787523567676544, "sft_loss": 1.225023627281189, "step": 250 }, { "epoch": 0.02, "grad_norm": 14.721240997314453, "learning_rate": 9.996356745785572e-06, "logits/chosen": -1.3730287551879883, "logits/rejected": -1.036368489265442, "logps/chosen": -1.005418062210083, "logps/rejected": -2.0516037940979004, "loss": 1.0814, "odds_ratio_loss": 0.7593610286712646, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10054180771112442, "rewards/margins": 0.10461856424808502, "rewards/rejected": -0.20516034960746765, "sft_loss": 1.005418062210083, "step": 255 }, { "epoch": 0.02, "grad_norm": 94.28663635253906, "learning_rate": 9.996117936930194e-06, "logits/chosen": -1.1371415853500366, "logits/rejected": -1.0226691961288452, "logps/chosen": -1.1712075471878052, "logps/rejected": -1.5980589389801025, "loss": 1.2284, "odds_ratio_loss": 0.5723423957824707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11712075769901276, "rewards/margins": 0.0426851324737072, "rewards/rejected": -0.15980589389801025, "sft_loss": 1.1712075471878052, "step": 260 }, { "epoch": 0.02, "grad_norm": 15.995462417602539, "learning_rate": 9.995871550827632e-06, "logits/chosen": -1.2461187839508057, "logits/rejected": -0.6994976997375488, "logps/chosen": -1.089324951171875, "logps/rejected": -0.9996173977851868, "loss": 1.1704, "odds_ratio_loss": 0.8106604814529419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1089324951171875, "rewards/margins": -0.008970752358436584, "rewards/rejected": -0.09996173530817032, "sft_loss": 1.089324951171875, "step": 265 }, { "epoch": 0.02, "grad_norm": 11.086663246154785, "learning_rate": 9.995617587851563e-06, "logits/chosen": -1.3579238653182983, "logits/rejected": -0.8066130876541138, "logps/chosen": -1.0367783308029175, "logps/rejected": -3.150995969772339, "loss": 1.1058, "odds_ratio_loss": 0.6898540258407593, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10367783159017563, "rewards/margins": 0.21142175793647766, "rewards/rejected": -0.3150995671749115, "sft_loss": 1.0367783308029175, "step": 270 }, { "epoch": 0.02, "grad_norm": 29.14234733581543, "learning_rate": 9.995356048387154e-06, "logits/chosen": -1.1860498189926147, "logits/rejected": -1.4626860618591309, "logps/chosen": -1.1332612037658691, "logps/rejected": -1.233254313468933, "loss": 1.2158, "odds_ratio_loss": 0.8249042630195618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11332611739635468, "rewards/margins": 0.009999324567615986, "rewards/rejected": -0.12332544475793839, "sft_loss": 1.1332612037658691, "step": 275 }, { "epoch": 0.02, "grad_norm": 17.886398315429688, "learning_rate": 9.995086932831063e-06, "logits/chosen": -1.3031141757965088, "logits/rejected": -1.1265239715576172, "logps/chosen": -0.8843557238578796, "logps/rejected": -0.5077162981033325, "loss": 1.0095, "odds_ratio_loss": 1.2514125108718872, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.0884355679154396, "rewards/margins": -0.037663936614990234, "rewards/rejected": -0.05077163502573967, "sft_loss": 0.8843557238578796, "step": 280 }, { "epoch": 0.02, "grad_norm": 6.501739025115967, "learning_rate": 9.994810241591437e-06, "logits/chosen": -1.4174010753631592, "logits/rejected": -0.9393793344497681, "logps/chosen": -0.9986799359321594, "logps/rejected": -1.7344329357147217, "loss": 1.0552, "odds_ratio_loss": 0.5652368068695068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09986799955368042, "rewards/margins": 0.07357531040906906, "rewards/rejected": -0.17344330251216888, "sft_loss": 0.9986799359321594, "step": 285 }, { "epoch": 0.02, "grad_norm": 3.8505361080169678, "learning_rate": 9.994525975087914e-06, "logits/chosen": -1.5343291759490967, "logits/rejected": -0.9835977554321289, "logps/chosen": -0.9028434753417969, "logps/rejected": -0.9722514152526855, "loss": 0.9772, "odds_ratio_loss": 0.7433664798736572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09028434753417969, "rewards/margins": 0.006940790917724371, "rewards/rejected": -0.0972251445055008, "sft_loss": 0.9028434753417969, "step": 290 }, { "epoch": 0.02, "grad_norm": 7.451793670654297, "learning_rate": 9.99423413375162e-06, "logits/chosen": -1.3480112552642822, "logits/rejected": -0.4587880074977875, "logps/chosen": -0.9931986927986145, "logps/rejected": -1.0893638134002686, "loss": 1.0575, "odds_ratio_loss": 0.6427035331726074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09931986033916473, "rewards/margins": 0.009616507217288017, "rewards/rejected": -0.1089363843202591, "sft_loss": 0.9931986927986145, "step": 295 }, { "epoch": 0.02, "grad_norm": 6.356688499450684, "learning_rate": 9.99393471802517e-06, "logits/chosen": -1.119459867477417, "logits/rejected": -0.8443056344985962, "logps/chosen": -0.929328441619873, "logps/rejected": -0.7678232192993164, "loss": 1.0215, "odds_ratio_loss": 0.9219361543655396, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09293285012245178, "rewards/margins": -0.01615052856504917, "rewards/rejected": -0.07678232342004776, "sft_loss": 0.929328441619873, "step": 300 }, { "epoch": 0.02, "grad_norm": 7.339206695556641, "learning_rate": 9.993627728362663e-06, "logits/chosen": -1.1373573541641235, "logits/rejected": -0.6757062673568726, "logps/chosen": -1.032034158706665, "logps/rejected": -1.0653880834579468, "loss": 1.1037, "odds_ratio_loss": 0.7163321375846863, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1032034158706665, "rewards/margins": 0.003335393965244293, "rewards/rejected": -0.1065388172864914, "sft_loss": 1.032034158706665, "step": 305 }, { "epoch": 0.02, "grad_norm": 10.089631080627441, "learning_rate": 9.993313165229692e-06, "logits/chosen": -1.3139688968658447, "logits/rejected": -0.9246614575386047, "logps/chosen": -0.921240508556366, "logps/rejected": -0.9949884414672852, "loss": 0.9923, "odds_ratio_loss": 0.710671603679657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09212405979633331, "rewards/margins": 0.007374781183898449, "rewards/rejected": -0.09949883818626404, "sft_loss": 0.921240508556366, "step": 310 }, { "epoch": 0.02, "grad_norm": 15.631460189819336, "learning_rate": 9.99299102910333e-06, "logits/chosen": -1.333685278892517, "logits/rejected": -0.8180558085441589, "logps/chosen": -1.2940690517425537, "logps/rejected": -1.4365034103393555, "loss": 1.3564, "odds_ratio_loss": 0.623278796672821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1294068992137909, "rewards/margins": 0.01424344815313816, "rewards/rejected": -0.1436503529548645, "sft_loss": 1.2940690517425537, "step": 315 }, { "epoch": 0.02, "grad_norm": 5.078014373779297, "learning_rate": 9.992661320472139e-06, "logits/chosen": -1.337958812713623, "logits/rejected": -0.6402798295021057, "logps/chosen": -0.9434254765510559, "logps/rejected": -0.7878870964050293, "loss": 1.0331, "odds_ratio_loss": 0.8966773152351379, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.09434254467487335, "rewards/margins": -0.015553837642073631, "rewards/rejected": -0.07878871262073517, "sft_loss": 0.9434254765510559, "step": 320 }, { "epoch": 0.03, "grad_norm": 13.075796127319336, "learning_rate": 9.992324039836161e-06, "logits/chosen": -1.4848788976669312, "logits/rejected": -1.2323976755142212, "logps/chosen": -0.967354953289032, "logps/rejected": -0.8414648771286011, "loss": 1.0499, "odds_ratio_loss": 0.8251625299453735, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.09673549234867096, "rewards/margins": -0.012589002028107643, "rewards/rejected": -0.08414648473262787, "sft_loss": 0.967354953289032, "step": 325 }, { "epoch": 0.03, "grad_norm": 12.756791114807129, "learning_rate": 9.991979187706925e-06, "logits/chosen": -1.4242439270019531, "logits/rejected": -0.8965123295783997, "logps/chosen": -1.032097339630127, "logps/rejected": -1.1921613216400146, "loss": 1.0902, "odds_ratio_loss": 0.5806518793106079, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1032097339630127, "rewards/margins": 0.01600639522075653, "rewards/rejected": -0.11921612918376923, "sft_loss": 1.032097339630127, "step": 330 }, { "epoch": 0.03, "grad_norm": 34.52486038208008, "learning_rate": 9.991626764607447e-06, "logits/chosen": -1.3143986463546753, "logits/rejected": -0.7883418798446655, "logps/chosen": -1.0698704719543457, "logps/rejected": -1.3327813148498535, "loss": 1.1451, "odds_ratio_loss": 0.7526839971542358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10698704421520233, "rewards/margins": 0.02629108354449272, "rewards/rejected": -0.13327813148498535, "sft_loss": 1.0698704719543457, "step": 335 }, { "epoch": 0.03, "grad_norm": 6.101346015930176, "learning_rate": 9.991266771072219e-06, "logits/chosen": -1.2860424518585205, "logits/rejected": -0.9905912280082703, "logps/chosen": -1.153584361076355, "logps/rejected": -1.241302490234375, "loss": 1.2418, "odds_ratio_loss": 0.8823820948600769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11535843461751938, "rewards/margins": 0.00877181626856327, "rewards/rejected": -0.1241302490234375, "sft_loss": 1.153584361076355, "step": 340 }, { "epoch": 0.03, "grad_norm": 14.8096284866333, "learning_rate": 9.990899207647215e-06, "logits/chosen": -1.2073997259140015, "logits/rejected": -0.8682562708854675, "logps/chosen": -1.5429751873016357, "logps/rejected": -1.2009754180908203, "loss": 1.6562, "odds_ratio_loss": 1.1326292753219604, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.15429750084877014, "rewards/margins": -0.034199971705675125, "rewards/rejected": -0.12009753286838531, "sft_loss": 1.5429751873016357, "step": 345 }, { "epoch": 0.03, "grad_norm": 16.363656997680664, "learning_rate": 9.990524074889894e-06, "logits/chosen": -1.4137274026870728, "logits/rejected": -1.1143226623535156, "logps/chosen": -1.3855955600738525, "logps/rejected": -3.2896132469177246, "loss": 1.4376, "odds_ratio_loss": 0.5200487375259399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13855956494808197, "rewards/margins": 0.1904018074274063, "rewards/rejected": -0.3289613723754883, "sft_loss": 1.3855955600738525, "step": 350 }, { "epoch": 0.03, "grad_norm": 8.82497501373291, "learning_rate": 9.990141373369192e-06, "logits/chosen": -1.2464665174484253, "logits/rejected": -1.108237385749817, "logps/chosen": -0.788859486579895, "logps/rejected": -0.8404549360275269, "loss": 0.8587, "odds_ratio_loss": 0.6983199119567871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07888595759868622, "rewards/margins": 0.005159544292837381, "rewards/rejected": -0.08404550701379776, "sft_loss": 0.788859486579895, "step": 355 }, { "epoch": 0.03, "grad_norm": 5.63923454284668, "learning_rate": 9.989751103665523e-06, "logits/chosen": -1.2375560998916626, "logits/rejected": -0.7072022557258606, "logps/chosen": -0.8630796670913696, "logps/rejected": -1.0150066614151, "loss": 0.9281, "odds_ratio_loss": 0.6499568819999695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08630797266960144, "rewards/margins": 0.015192699618637562, "rewards/rejected": -0.10150066763162613, "sft_loss": 0.8630796670913696, "step": 360 }, { "epoch": 0.03, "grad_norm": 6.3448486328125, "learning_rate": 9.989353266370785e-06, "logits/chosen": -1.2494094371795654, "logits/rejected": -0.7897502779960632, "logps/chosen": -0.7433738112449646, "logps/rejected": -2.6994833946228027, "loss": 0.7757, "odds_ratio_loss": 0.32370153069496155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07433737814426422, "rewards/margins": 0.19561094045639038, "rewards/rejected": -0.2699483335018158, "sft_loss": 0.7433738112449646, "step": 365 }, { "epoch": 0.03, "grad_norm": 29.389131546020508, "learning_rate": 9.988947862088343e-06, "logits/chosen": -1.3360885381698608, "logits/rejected": -1.07649564743042, "logps/chosen": -1.17660391330719, "logps/rejected": -1.7249990701675415, "loss": 1.234, "odds_ratio_loss": 0.5744453072547913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11766038835048676, "rewards/margins": 0.05483951419591904, "rewards/rejected": -0.17249992489814758, "sft_loss": 1.17660391330719, "step": 370 }, { "epoch": 0.03, "grad_norm": 5.227707386016846, "learning_rate": 9.988534891433048e-06, "logits/chosen": -1.237786054611206, "logits/rejected": -1.303662896156311, "logps/chosen": -0.9617307782173157, "logps/rejected": -1.3591482639312744, "loss": 1.0078, "odds_ratio_loss": 0.46071720123291016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09617307037115097, "rewards/margins": 0.03974176198244095, "rewards/rejected": -0.13591483235359192, "sft_loss": 0.9617307782173157, "step": 375 }, { "epoch": 0.03, "grad_norm": 6.049983501434326, "learning_rate": 9.98811435503122e-06, "logits/chosen": -1.420792818069458, "logits/rejected": -1.1584433317184448, "logps/chosen": -1.0772771835327148, "logps/rejected": -0.9023246765136719, "loss": 1.1665, "odds_ratio_loss": 0.8922192454338074, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10772772133350372, "rewards/margins": -0.01749524101614952, "rewards/rejected": -0.0902324691414833, "sft_loss": 1.0772771835327148, "step": 380 }, { "epoch": 0.03, "grad_norm": 95.02626037597656, "learning_rate": 9.987686253520657e-06, "logits/chosen": -1.4909613132476807, "logits/rejected": -1.1455066204071045, "logps/chosen": -0.9977737665176392, "logps/rejected": -1.1992311477661133, "loss": 1.061, "odds_ratio_loss": 0.6324112415313721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09977736324071884, "rewards/margins": 0.020145747810602188, "rewards/rejected": -0.11992311477661133, "sft_loss": 0.9977737665176392, "step": 385 }, { "epoch": 0.03, "grad_norm": 10.860966682434082, "learning_rate": 9.98725058755063e-06, "logits/chosen": -1.4899775981903076, "logits/rejected": -1.1015746593475342, "logps/chosen": -1.325384497642517, "logps/rejected": -1.309385895729065, "loss": 1.3994, "odds_ratio_loss": 0.7398003339767456, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13253845274448395, "rewards/margins": -0.0015998601447790861, "rewards/rejected": -0.1309386044740677, "sft_loss": 1.325384497642517, "step": 390 }, { "epoch": 0.03, "grad_norm": 5.0103983879089355, "learning_rate": 9.986807357781878e-06, "logits/chosen": -1.4857903718948364, "logits/rejected": -0.9983230829238892, "logps/chosen": -0.9025314450263977, "logps/rejected": -0.843797504901886, "loss": 0.9809, "odds_ratio_loss": 0.7840155363082886, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09025314450263977, "rewards/margins": -0.005873396061360836, "rewards/rejected": -0.08437974750995636, "sft_loss": 0.9025314450263977, "step": 395 }, { "epoch": 0.03, "grad_norm": 12.473005294799805, "learning_rate": 9.986356564886621e-06, "logits/chosen": -1.3183162212371826, "logits/rejected": -0.7969815135002136, "logps/chosen": -1.0109480619430542, "logps/rejected": -1.190051794052124, "loss": 1.0721, "odds_ratio_loss": 0.6110685467720032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1010948047041893, "rewards/margins": 0.017910365015268326, "rewards/rejected": -0.11900516599416733, "sft_loss": 1.0109480619430542, "step": 400 }, { "epoch": 0.03, "grad_norm": 5.28497838973999, "learning_rate": 9.985898209548541e-06, "logits/chosen": -1.3139629364013672, "logits/rejected": -0.9575953483581543, "logps/chosen": -1.0471327304840088, "logps/rejected": -1.388662338256836, "loss": 1.107, "odds_ratio_loss": 0.5983954071998596, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10471327602863312, "rewards/margins": 0.03415297716856003, "rewards/rejected": -0.13886624574661255, "sft_loss": 1.0471327304840088, "step": 405 }, { "epoch": 0.03, "grad_norm": 7.404033184051514, "learning_rate": 9.98543229246279e-06, "logits/chosen": -1.2961384057998657, "logits/rejected": -0.8433519601821899, "logps/chosen": -1.1629507541656494, "logps/rejected": -1.3445327281951904, "loss": 1.2312, "odds_ratio_loss": 0.6821416616439819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11629508435726166, "rewards/margins": 0.0181582011282444, "rewards/rejected": -0.13445329666137695, "sft_loss": 1.1629507541656494, "step": 410 }, { "epoch": 0.03, "grad_norm": 67.28243255615234, "learning_rate": 9.984958814335995e-06, "logits/chosen": -1.206665277481079, "logits/rejected": -0.8332012295722961, "logps/chosen": -1.3545682430267334, "logps/rejected": -0.9981037378311157, "loss": 1.4571, "odds_ratio_loss": 1.0249886512756348, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13545681536197662, "rewards/margins": -0.035646453499794006, "rewards/rejected": -0.09981036931276321, "sft_loss": 1.3545682430267334, "step": 415 }, { "epoch": 0.03, "grad_norm": 10.88105583190918, "learning_rate": 9.984477775886241e-06, "logits/chosen": -1.2840704917907715, "logits/rejected": -0.9073774218559265, "logps/chosen": -1.0101227760314941, "logps/rejected": -1.3432159423828125, "loss": 1.0694, "odds_ratio_loss": 0.5929387211799622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10101227462291718, "rewards/margins": 0.033309321850538254, "rewards/rejected": -0.13432160019874573, "sft_loss": 1.0101227760314941, "step": 420 }, { "epoch": 0.03, "grad_norm": 7.556691646575928, "learning_rate": 9.983989177843088e-06, "logits/chosen": -1.2547153234481812, "logits/rejected": -0.6640560626983643, "logps/chosen": -0.8858125805854797, "logps/rejected": -0.8567777872085571, "loss": 0.9631, "odds_ratio_loss": 0.7732707262039185, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08858126401901245, "rewards/margins": -0.002903483808040619, "rewards/rejected": -0.08567778021097183, "sft_loss": 0.8858125805854797, "step": 425 }, { "epoch": 0.03, "grad_norm": 18.232669830322266, "learning_rate": 9.983493020947553e-06, "logits/chosen": -1.4706394672393799, "logits/rejected": -1.0486671924591064, "logps/chosen": -1.2855408191680908, "logps/rejected": -1.816890001296997, "loss": 1.3368, "odds_ratio_loss": 0.5123870968818665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1285540759563446, "rewards/margins": 0.053134918212890625, "rewards/rejected": -0.18168899416923523, "sft_loss": 1.2855408191680908, "step": 430 }, { "epoch": 0.03, "grad_norm": 13.52712631225586, "learning_rate": 9.982989305952125e-06, "logits/chosen": -1.439195990562439, "logits/rejected": -1.036635398864746, "logps/chosen": -1.218326449394226, "logps/rejected": -2.1572933197021484, "loss": 1.2839, "odds_ratio_loss": 0.6560950875282288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12183265388011932, "rewards/margins": 0.09389667212963104, "rewards/rejected": -0.21572932600975037, "sft_loss": 1.218326449394226, "step": 435 }, { "epoch": 0.03, "grad_norm": 14.365988731384277, "learning_rate": 9.982478033620746e-06, "logits/chosen": -1.367100477218628, "logits/rejected": -0.7920514345169067, "logps/chosen": -0.8387094736099243, "logps/rejected": -4.545905590057373, "loss": 0.8913, "odds_ratio_loss": 0.5259830951690674, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08387093245983124, "rewards/margins": 0.3707196116447449, "rewards/rejected": -0.4545905590057373, "sft_loss": 0.8387094736099243, "step": 440 }, { "epoch": 0.03, "grad_norm": 9.169198989868164, "learning_rate": 9.98195920472883e-06, "logits/chosen": -1.3178900480270386, "logits/rejected": -1.0445078611373901, "logps/chosen": -0.848944365978241, "logps/rejected": -0.767935574054718, "loss": 0.9279, "odds_ratio_loss": 0.7899635434150696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08489444106817245, "rewards/margins": -0.008100892417132854, "rewards/rejected": -0.07679355144500732, "sft_loss": 0.848944365978241, "step": 445 }, { "epoch": 0.04, "grad_norm": 7.628312587738037, "learning_rate": 9.981432820063249e-06, "logits/chosen": -1.4050272703170776, "logits/rejected": -1.0616410970687866, "logps/chosen": -0.9740702509880066, "logps/rejected": -1.0533596277236938, "loss": 1.0414, "odds_ratio_loss": 0.6728119254112244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0974070280790329, "rewards/margins": 0.007928932085633278, "rewards/rejected": -0.10533596575260162, "sft_loss": 0.9740702509880066, "step": 450 }, { "epoch": 0.04, "grad_norm": 22.665225982666016, "learning_rate": 9.980898880422324e-06, "logits/chosen": -1.3875292539596558, "logits/rejected": -0.9992658495903015, "logps/chosen": -1.2542369365692139, "logps/rejected": -1.6469194889068604, "loss": 1.31, "odds_ratio_loss": 0.5573633909225464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12542369961738586, "rewards/margins": 0.03926824778318405, "rewards/rejected": -0.1646919548511505, "sft_loss": 1.2542369365692139, "step": 455 }, { "epoch": 0.04, "grad_norm": 36.47029113769531, "learning_rate": 9.980357386615852e-06, "logits/chosen": -1.1831592321395874, "logits/rejected": -0.7186304330825806, "logps/chosen": -1.766829252243042, "logps/rejected": -0.7457488775253296, "loss": 1.947, "odds_ratio_loss": 1.8014347553253174, "rewards/accuracies": 0.0, "rewards/chosen": -0.17668291926383972, "rewards/margins": -0.10210806131362915, "rewards/rejected": -0.07457488030195236, "sft_loss": 1.766829252243042, "step": 460 }, { "epoch": 0.04, "grad_norm": 17.059722900390625, "learning_rate": 9.97980833946507e-06, "logits/chosen": -1.448505163192749, "logits/rejected": -0.8654235005378723, "logps/chosen": -1.2362231016159058, "logps/rejected": -3.4485504627227783, "loss": 1.273, "odds_ratio_loss": 0.36770644783973694, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12362232059240341, "rewards/margins": 0.22123269736766815, "rewards/rejected": -0.34485501050949097, "sft_loss": 1.2362231016159058, "step": 465 }, { "epoch": 0.04, "grad_norm": 9.267600059509277, "learning_rate": 9.97925173980268e-06, "logits/chosen": -1.36885666847229, "logits/rejected": -0.8539026379585266, "logps/chosen": -1.0619010925292969, "logps/rejected": -1.1658036708831787, "loss": 1.1364, "odds_ratio_loss": 0.7445744276046753, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10619010776281357, "rewards/margins": 0.010390259325504303, "rewards/rejected": -0.11658036708831787, "sft_loss": 1.0619010925292969, "step": 470 }, { "epoch": 0.04, "grad_norm": 10.483030319213867, "learning_rate": 9.978687588472838e-06, "logits/chosen": -1.2792476415634155, "logits/rejected": -0.5440900921821594, "logps/chosen": -1.1186585426330566, "logps/rejected": -1.2156472206115723, "loss": 1.1859, "odds_ratio_loss": 0.6719276905059814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11186586320400238, "rewards/margins": 0.009698860347270966, "rewards/rejected": -0.12156472355127335, "sft_loss": 1.1186585426330566, "step": 475 }, { "epoch": 0.04, "grad_norm": 5.348385810852051, "learning_rate": 9.978115886331147e-06, "logits/chosen": -1.2613639831542969, "logits/rejected": -0.782037079334259, "logps/chosen": -1.2296042442321777, "logps/rejected": -1.3125030994415283, "loss": 1.3113, "odds_ratio_loss": 0.8173338770866394, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.12296042591333389, "rewards/margins": 0.008289876393973827, "rewards/rejected": -0.1312503069639206, "sft_loss": 1.2296042442321777, "step": 480 }, { "epoch": 0.04, "grad_norm": 18.186199188232422, "learning_rate": 9.977536634244668e-06, "logits/chosen": -1.3923256397247314, "logits/rejected": -0.8637619018554688, "logps/chosen": -1.040005087852478, "logps/rejected": -1.5120867490768433, "loss": 1.0898, "odds_ratio_loss": 0.4984281659126282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.104000523686409, "rewards/margins": 0.04720815271139145, "rewards/rejected": -0.15120866894721985, "sft_loss": 1.040005087852478, "step": 485 }, { "epoch": 0.04, "grad_norm": 12.430156707763672, "learning_rate": 9.976949833091912e-06, "logits/chosen": -1.278136134147644, "logits/rejected": -0.9997159838676453, "logps/chosen": -1.2000458240509033, "logps/rejected": -1.8299169540405273, "loss": 1.297, "odds_ratio_loss": 0.9691734313964844, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1200045794248581, "rewards/margins": 0.06298711150884628, "rewards/rejected": -0.18299169838428497, "sft_loss": 1.2000458240509033, "step": 490 }, { "epoch": 0.04, "grad_norm": 5.525488376617432, "learning_rate": 9.976355483762836e-06, "logits/chosen": -1.1536939144134521, "logits/rejected": -1.0693880319595337, "logps/chosen": -0.8165262341499329, "logps/rejected": -0.9334398508071899, "loss": 0.8891, "odds_ratio_loss": 0.7261025309562683, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08165262639522552, "rewards/margins": 0.011691351421177387, "rewards/rejected": -0.09334397315979004, "sft_loss": 0.8165262341499329, "step": 495 }, { "epoch": 0.04, "grad_norm": 16.369741439819336, "learning_rate": 9.975753587158845e-06, "logits/chosen": -1.271452784538269, "logits/rejected": -1.020087718963623, "logps/chosen": -1.0287582874298096, "logps/rejected": -2.0818448066711426, "loss": 1.0777, "odds_ratio_loss": 0.4890977442264557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10287582874298096, "rewards/margins": 0.10530862957239151, "rewards/rejected": -0.20818445086479187, "sft_loss": 1.0287582874298096, "step": 500 }, { "epoch": 0.04, "grad_norm": 7.120456695556641, "learning_rate": 9.975144144192794e-06, "logits/chosen": -1.433354139328003, "logits/rejected": -0.9860296249389648, "logps/chosen": -0.8577227592468262, "logps/rejected": -1.743486762046814, "loss": 0.9003, "odds_ratio_loss": 0.4252917766571045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08577227592468262, "rewards/margins": 0.08857639133930206, "rewards/rejected": -0.17434866726398468, "sft_loss": 0.8577227592468262, "step": 505 }, { "epoch": 0.04, "grad_norm": 19.190956115722656, "learning_rate": 9.97452715578898e-06, "logits/chosen": -1.2277740240097046, "logits/rejected": -1.2004899978637695, "logps/chosen": -0.9218929409980774, "logps/rejected": -1.2253445386886597, "loss": 0.9733, "odds_ratio_loss": 0.5138994455337524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09218929708003998, "rewards/margins": 0.030345162376761436, "rewards/rejected": -0.12253445386886597, "sft_loss": 0.9218929409980774, "step": 510 }, { "epoch": 0.04, "grad_norm": 29.12042236328125, "learning_rate": 9.973902622883142e-06, "logits/chosen": -1.1885347366333008, "logits/rejected": -1.1869349479675293, "logps/chosen": -1.316017508506775, "logps/rejected": -0.8871244192123413, "loss": 1.4462, "odds_ratio_loss": 1.3014862537384033, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1316017359495163, "rewards/margins": -0.0428893081843853, "rewards/rejected": -0.08871243894100189, "sft_loss": 1.316017508506775, "step": 515 }, { "epoch": 0.04, "grad_norm": 12.59604549407959, "learning_rate": 9.973270546422465e-06, "logits/chosen": -1.1499083042144775, "logits/rejected": -1.0709830522537231, "logps/chosen": -1.061827540397644, "logps/rejected": -1.1412575244903564, "loss": 1.1507, "odds_ratio_loss": 0.888770580291748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1061827540397644, "rewards/margins": 0.007943002507090569, "rewards/rejected": -0.11412575095891953, "sft_loss": 1.061827540397644, "step": 520 }, { "epoch": 0.04, "grad_norm": 29.967601776123047, "learning_rate": 9.972630927365574e-06, "logits/chosen": -1.269574522972107, "logits/rejected": -0.6280655860900879, "logps/chosen": -1.1269054412841797, "logps/rejected": -1.688178300857544, "loss": 1.1772, "odds_ratio_loss": 0.5029090046882629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11269054561853409, "rewards/margins": 0.056127287447452545, "rewards/rejected": -0.16881783306598663, "sft_loss": 1.1269054412841797, "step": 525 }, { "epoch": 0.04, "grad_norm": 32.36668014526367, "learning_rate": 9.971983766682532e-06, "logits/chosen": -1.2929702997207642, "logits/rejected": -0.6805129647254944, "logps/chosen": -0.9838630557060242, "logps/rejected": -1.9721927642822266, "loss": 1.0517, "odds_ratio_loss": 0.6784967184066772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09838631004095078, "rewards/margins": 0.09883297979831696, "rewards/rejected": -0.19721928238868713, "sft_loss": 0.9838630557060242, "step": 530 }, { "epoch": 0.04, "grad_norm": 7.508626461029053, "learning_rate": 9.97132906535484e-06, "logits/chosen": -1.4744285345077515, "logits/rejected": -1.0449306964874268, "logps/chosen": -1.0357000827789307, "logps/rejected": -2.0317296981811523, "loss": 1.074, "odds_ratio_loss": 0.3831598162651062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10357002168893814, "rewards/margins": 0.09960293769836426, "rewards/rejected": -0.203172966837883, "sft_loss": 1.0357000827789307, "step": 535 }, { "epoch": 0.04, "grad_norm": 5.905401706695557, "learning_rate": 9.970666824375436e-06, "logits/chosen": -1.3433470726013184, "logits/rejected": -0.807905375957489, "logps/chosen": -1.0914314985275269, "logps/rejected": -1.157183051109314, "loss": 1.1575, "odds_ratio_loss": 0.660188615322113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10914316028356552, "rewards/margins": 0.006575146224349737, "rewards/rejected": -0.1157183051109314, "sft_loss": 1.0914314985275269, "step": 540 }, { "epoch": 0.04, "grad_norm": 11.22401237487793, "learning_rate": 9.969997044748691e-06, "logits/chosen": -1.3542927503585815, "logits/rejected": -0.971311092376709, "logps/chosen": -1.3239521980285645, "logps/rejected": -1.7703745365142822, "loss": 1.4147, "odds_ratio_loss": 0.9070507884025574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1323952078819275, "rewards/margins": 0.04464225098490715, "rewards/rejected": -0.17703744769096375, "sft_loss": 1.3239521980285645, "step": 545 }, { "epoch": 0.04, "grad_norm": 7.181665897369385, "learning_rate": 9.969319727490415e-06, "logits/chosen": -1.4340964555740356, "logits/rejected": -0.8164238929748535, "logps/chosen": -0.9762603640556335, "logps/rejected": -1.5105446577072144, "loss": 1.0495, "odds_ratio_loss": 0.7322009205818176, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09762604534626007, "rewards/margins": 0.053428418934345245, "rewards/rejected": -0.1510544717311859, "sft_loss": 0.9762603640556335, "step": 550 }, { "epoch": 0.04, "grad_norm": 6.170845031738281, "learning_rate": 9.96863487362784e-06, "logits/chosen": -1.3454340696334839, "logits/rejected": -0.6722744703292847, "logps/chosen": -0.8916047811508179, "logps/rejected": -1.5881434679031372, "loss": 0.945, "odds_ratio_loss": 0.5340217351913452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0891604870557785, "rewards/margins": 0.06965386122465134, "rewards/rejected": -0.15881435573101044, "sft_loss": 0.8916047811508179, "step": 555 }, { "epoch": 0.04, "grad_norm": 6.939874172210693, "learning_rate": 9.967942484199638e-06, "logits/chosen": -1.319394588470459, "logits/rejected": -0.7050190567970276, "logps/chosen": -0.8833843469619751, "logps/rejected": -1.2793550491333008, "loss": 0.9408, "odds_ratio_loss": 0.5743966698646545, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0883384421467781, "rewards/margins": 0.03959707170724869, "rewards/rejected": -0.1279354989528656, "sft_loss": 0.8833843469619751, "step": 560 }, { "epoch": 0.04, "grad_norm": 215.84613037109375, "learning_rate": 9.967242560255906e-06, "logits/chosen": -1.2708102464675903, "logits/rejected": -1.2701189517974854, "logps/chosen": -0.9652470350265503, "logps/rejected": -2.828083038330078, "loss": 0.9969, "odds_ratio_loss": 0.3170081079006195, "rewards/accuracies": 1.0, "rewards/chosen": -0.09652470052242279, "rewards/margins": 0.18628360331058502, "rewards/rejected": -0.2828083038330078, "sft_loss": 0.9652470350265503, "step": 565 }, { "epoch": 0.04, "grad_norm": 20.366199493408203, "learning_rate": 9.966535102858163e-06, "logits/chosen": -1.1454098224639893, "logits/rejected": -1.2445162534713745, "logps/chosen": -1.3247931003570557, "logps/rejected": -0.9458759427070618, "loss": 1.4305, "odds_ratio_loss": 1.0566179752349854, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.13247931003570557, "rewards/margins": -0.03789170831441879, "rewards/rejected": -0.09458760172128677, "sft_loss": 1.3247931003570557, "step": 570 }, { "epoch": 0.04, "grad_norm": 8.253193855285645, "learning_rate": 9.965820113079361e-06, "logits/chosen": -1.3363704681396484, "logits/rejected": -0.9352320432662964, "logps/chosen": -0.9640430212020874, "logps/rejected": -1.2558622360229492, "loss": 1.0235, "odds_ratio_loss": 0.5942882299423218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0964042991399765, "rewards/margins": 0.029181916266679764, "rewards/rejected": -0.12558622658252716, "sft_loss": 0.9640430212020874, "step": 575 }, { "epoch": 0.05, "grad_norm": 9.946394920349121, "learning_rate": 9.965097592003874e-06, "logits/chosen": -1.3996388912200928, "logits/rejected": -1.0949664115905762, "logps/chosen": -0.8815194368362427, "logps/rejected": -2.7259840965270996, "loss": 0.9355, "odds_ratio_loss": 0.5393964052200317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0881519466638565, "rewards/margins": 0.18444648385047913, "rewards/rejected": -0.2725984454154968, "sft_loss": 0.8815194368362427, "step": 580 }, { "epoch": 0.05, "grad_norm": 7.081384658813477, "learning_rate": 9.964367540727492e-06, "logits/chosen": -1.2155399322509766, "logits/rejected": -0.9352075457572937, "logps/chosen": -1.4964728355407715, "logps/rejected": -2.1899776458740234, "loss": 1.5801, "odds_ratio_loss": 0.8366632461547852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14964726567268372, "rewards/margins": 0.0693504810333252, "rewards/rejected": -0.2189977616071701, "sft_loss": 1.4964728355407715, "step": 585 }, { "epoch": 0.05, "grad_norm": 9.30331039428711, "learning_rate": 9.963629960357438e-06, "logits/chosen": -1.3705943822860718, "logits/rejected": -0.8403999209403992, "logps/chosen": -1.2652610540390015, "logps/rejected": -1.490480661392212, "loss": 1.3256, "odds_ratio_loss": 0.6036292910575867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1265261024236679, "rewards/margins": 0.0225219689309597, "rewards/rejected": -0.1490480750799179, "sft_loss": 1.2652610540390015, "step": 590 }, { "epoch": 0.05, "grad_norm": 11.273741722106934, "learning_rate": 9.96288485201234e-06, "logits/chosen": -1.4199020862579346, "logits/rejected": -1.1455968618392944, "logps/chosen": -0.9536746740341187, "logps/rejected": -1.5848302841186523, "loss": 0.9977, "odds_ratio_loss": 0.44051140546798706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09536747634410858, "rewards/margins": 0.06311556696891785, "rewards/rejected": -0.15848304331302643, "sft_loss": 0.9536746740341187, "step": 595 }, { "epoch": 0.05, "grad_norm": 12.181118965148926, "learning_rate": 9.962132216822252e-06, "logits/chosen": -1.4740102291107178, "logits/rejected": -0.8962316513061523, "logps/chosen": -1.0127413272857666, "logps/rejected": -1.6857773065567017, "loss": 1.066, "odds_ratio_loss": 0.5327258706092834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10127413272857666, "rewards/margins": 0.0673035979270935, "rewards/rejected": -0.16857774555683136, "sft_loss": 1.0127413272857666, "step": 600 }, { "epoch": 0.05, "grad_norm": 15.026229858398438, "learning_rate": 9.96137205592864e-06, "logits/chosen": -1.5482691526412964, "logits/rejected": -1.1137874126434326, "logps/chosen": -0.8740970492362976, "logps/rejected": -1.5898300409317017, "loss": 0.9234, "odds_ratio_loss": 0.4932100176811218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08740969747304916, "rewards/margins": 0.07157330214977264, "rewards/rejected": -0.1589830219745636, "sft_loss": 0.8740970492362976, "step": 605 }, { "epoch": 0.05, "grad_norm": 31.824352264404297, "learning_rate": 9.960604370484385e-06, "logits/chosen": -1.4865148067474365, "logits/rejected": -1.0594385862350464, "logps/chosen": -1.0654785633087158, "logps/rejected": -1.3001195192337036, "loss": 1.1611, "odds_ratio_loss": 0.9559617042541504, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10654785484075546, "rewards/margins": 0.023464106023311615, "rewards/rejected": -0.13001194596290588, "sft_loss": 1.0654785633087158, "step": 610 }, { "epoch": 0.05, "grad_norm": 9.08944034576416, "learning_rate": 9.959829161653778e-06, "logits/chosen": -1.456381916999817, "logits/rejected": -0.8847858309745789, "logps/chosen": -0.9969884157180786, "logps/rejected": -0.8793743252754211, "loss": 1.0802, "odds_ratio_loss": 0.8324553370475769, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09969884157180786, "rewards/margins": -0.011761406436562538, "rewards/rejected": -0.08793742954730988, "sft_loss": 0.9969884157180786, "step": 615 }, { "epoch": 0.05, "grad_norm": 8.12314224243164, "learning_rate": 9.959046430612524e-06, "logits/chosen": -1.420644760131836, "logits/rejected": -1.0690540075302124, "logps/chosen": -0.9774150848388672, "logps/rejected": -0.9998822212219238, "loss": 1.0705, "odds_ratio_loss": 0.9305570721626282, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0977415144443512, "rewards/margins": 0.002246710704639554, "rewards/rejected": -0.09998822957277298, "sft_loss": 0.9774150848388672, "step": 620 }, { "epoch": 0.05, "grad_norm": 5.643470287322998, "learning_rate": 9.958256178547734e-06, "logits/chosen": -1.3239028453826904, "logits/rejected": -1.0841190814971924, "logps/chosen": -1.0758788585662842, "logps/rejected": -2.1539533138275146, "loss": 1.1522, "odds_ratio_loss": 0.7633380889892578, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10758789628744125, "rewards/margins": 0.10780743509531021, "rewards/rejected": -0.21539533138275146, "sft_loss": 1.0758788585662842, "step": 625 }, { "epoch": 0.05, "grad_norm": 7.302849769592285, "learning_rate": 9.957458406657924e-06, "logits/chosen": -1.3189386129379272, "logits/rejected": -0.8606952428817749, "logps/chosen": -0.8432208895683289, "logps/rejected": -0.9131423234939575, "loss": 0.9137, "odds_ratio_loss": 0.7045748829841614, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08432208746671677, "rewards/margins": 0.006992141250520945, "rewards/rejected": -0.09131423383951187, "sft_loss": 0.8432208895683289, "step": 630 }, { "epoch": 0.05, "grad_norm": 8.54017448425293, "learning_rate": 9.956653116153015e-06, "logits/chosen": -1.2382190227508545, "logits/rejected": -1.2081432342529297, "logps/chosen": -0.7250022888183594, "logps/rejected": -0.5700157284736633, "loss": 0.8206, "odds_ratio_loss": 0.95604407787323, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07250023633241653, "rewards/margins": -0.015498657710850239, "rewards/rejected": -0.05700157210230827, "sft_loss": 0.7250022888183594, "step": 635 }, { "epoch": 0.05, "grad_norm": 40.91093444824219, "learning_rate": 9.955840308254336e-06, "logits/chosen": -1.2277017831802368, "logits/rejected": -0.9123063087463379, "logps/chosen": -1.060502529144287, "logps/rejected": -2.4313435554504395, "loss": 1.1204, "odds_ratio_loss": 0.5990681052207947, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1060502678155899, "rewards/margins": 0.13708409667015076, "rewards/rejected": -0.24313434958457947, "sft_loss": 1.060502529144287, "step": 640 }, { "epoch": 0.05, "grad_norm": 7.556923866271973, "learning_rate": 9.955019984194611e-06, "logits/chosen": -1.3158109188079834, "logits/rejected": -0.7409260272979736, "logps/chosen": -1.08364737033844, "logps/rejected": -1.4582345485687256, "loss": 1.1466, "odds_ratio_loss": 0.6292973756790161, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10836473852396011, "rewards/margins": 0.03745872527360916, "rewards/rejected": -0.14582346379756927, "sft_loss": 1.08364737033844, "step": 645 }, { "epoch": 0.05, "grad_norm": 15.207615852355957, "learning_rate": 9.954192145217966e-06, "logits/chosen": -1.3571964502334595, "logits/rejected": -0.6947706937789917, "logps/chosen": -1.1696298122406006, "logps/rejected": -0.8519676327705383, "loss": 1.2938, "odds_ratio_loss": 1.2413297891616821, "rewards/accuracies": 0.0, "rewards/chosen": -0.1169629842042923, "rewards/margins": -0.031766220927238464, "rewards/rejected": -0.08519675582647324, "sft_loss": 1.1696298122406006, "step": 650 }, { "epoch": 0.05, "grad_norm": 5.49888277053833, "learning_rate": 9.953356792579925e-06, "logits/chosen": -1.4864259958267212, "logits/rejected": -0.9483200311660767, "logps/chosen": -1.032362461090088, "logps/rejected": -1.8657007217407227, "loss": 1.082, "odds_ratio_loss": 0.4959256649017334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10323625802993774, "rewards/margins": 0.08333383500576019, "rewards/rejected": -0.18657009303569794, "sft_loss": 1.032362461090088, "step": 655 }, { "epoch": 0.05, "grad_norm": 9.250358581542969, "learning_rate": 9.952513927547405e-06, "logits/chosen": -1.4891111850738525, "logits/rejected": -1.4543288946151733, "logps/chosen": -1.168774127960205, "logps/rejected": -2.8399128913879395, "loss": 1.2364, "odds_ratio_loss": 0.6762497425079346, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11687741428613663, "rewards/margins": 0.16711387038230896, "rewards/rejected": -0.283991277217865, "sft_loss": 1.168774127960205, "step": 660 }, { "epoch": 0.05, "grad_norm": 9.42989730834961, "learning_rate": 9.951663551398717e-06, "logits/chosen": -1.4089422225952148, "logits/rejected": -1.302330732345581, "logps/chosen": -1.2792950868606567, "logps/rejected": -1.7675468921661377, "loss": 1.3501, "odds_ratio_loss": 0.7083412408828735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12792950868606567, "rewards/margins": 0.048825182020664215, "rewards/rejected": -0.17675469815731049, "sft_loss": 1.2792950868606567, "step": 665 }, { "epoch": 0.05, "grad_norm": 57.358638763427734, "learning_rate": 9.950805665423566e-06, "logits/chosen": -1.3845967054367065, "logits/rejected": -0.9723657369613647, "logps/chosen": -0.7079774141311646, "logps/rejected": -3.329834461212158, "loss": 0.7583, "odds_ratio_loss": 0.5035191178321838, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07079773396253586, "rewards/margins": 0.2621857225894928, "rewards/rejected": -0.33298343420028687, "sft_loss": 0.7079774141311646, "step": 670 }, { "epoch": 0.05, "grad_norm": 14.513296127319336, "learning_rate": 9.949940270923047e-06, "logits/chosen": -1.201442003250122, "logits/rejected": -0.9176227450370789, "logps/chosen": -0.8187268972396851, "logps/rejected": -1.3563072681427002, "loss": 0.8591, "odds_ratio_loss": 0.40391626954078674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08187268674373627, "rewards/margins": 0.05375804752111435, "rewards/rejected": -0.1356307417154312, "sft_loss": 0.8187268972396851, "step": 675 }, { "epoch": 0.05, "grad_norm": 6.561885833740234, "learning_rate": 9.949067369209635e-06, "logits/chosen": -1.3256163597106934, "logits/rejected": -1.0909916162490845, "logps/chosen": -1.040527105331421, "logps/rejected": -1.0341429710388184, "loss": 1.1175, "odds_ratio_loss": 0.7698140144348145, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10405270755290985, "rewards/margins": -0.0006384074804373085, "rewards/rejected": -0.10341429710388184, "sft_loss": 1.040527105331421, "step": 680 }, { "epoch": 0.05, "grad_norm": 4.70972204208374, "learning_rate": 9.9481869616072e-06, "logits/chosen": -1.390150785446167, "logits/rejected": -0.5502735376358032, "logps/chosen": -1.3307173252105713, "logps/rejected": -3.408107042312622, "loss": 1.3648, "odds_ratio_loss": 0.34050750732421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.13307173550128937, "rewards/margins": 0.207738995552063, "rewards/rejected": -0.34081071615219116, "sft_loss": 1.3307173252105713, "step": 685 }, { "epoch": 0.05, "grad_norm": 18.70503044128418, "learning_rate": 9.947299049450994e-06, "logits/chosen": -1.2619187831878662, "logits/rejected": -0.8109877705574036, "logps/chosen": -0.7874099612236023, "logps/rejected": -0.9367067217826843, "loss": 0.8516, "odds_ratio_loss": 0.6423176527023315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07874099910259247, "rewards/margins": 0.014929664321243763, "rewards/rejected": -0.09367066621780396, "sft_loss": 0.7874099612236023, "step": 690 }, { "epoch": 0.05, "grad_norm": 7.274419784545898, "learning_rate": 9.946403634087643e-06, "logits/chosen": -1.3324792385101318, "logits/rejected": -0.749333381652832, "logps/chosen": -0.8649250864982605, "logps/rejected": -1.0733157396316528, "loss": 0.9191, "odds_ratio_loss": 0.5421899557113647, "rewards/accuracies": 1.0, "rewards/chosen": -0.08649250864982605, "rewards/margins": 0.020839063450694084, "rewards/rejected": -0.10733157396316528, "sft_loss": 0.8649250864982605, "step": 695 }, { "epoch": 0.05, "grad_norm": 9.046555519104004, "learning_rate": 9.945500716875162e-06, "logits/chosen": -1.4016413688659668, "logits/rejected": -1.1516118049621582, "logps/chosen": -1.357739806175232, "logps/rejected": -3.03171968460083, "loss": 1.4345, "odds_ratio_loss": 0.7673938870429993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13577398657798767, "rewards/margins": 0.16739800572395325, "rewards/rejected": -0.30317196249961853, "sft_loss": 1.357739806175232, "step": 700 }, { "epoch": 0.05, "grad_norm": 46.778812408447266, "learning_rate": 9.944590299182939e-06, "logits/chosen": -1.2562851905822754, "logits/rejected": -1.2785260677337646, "logps/chosen": -1.5307793617248535, "logps/rejected": -2.245178461074829, "loss": 1.6408, "odds_ratio_loss": 1.1004269123077393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15307793021202087, "rewards/margins": 0.07143990695476532, "rewards/rejected": -0.2245178520679474, "sft_loss": 1.5307793617248535, "step": 705 }, { "epoch": 0.06, "grad_norm": 27.53387451171875, "learning_rate": 9.943672382391738e-06, "logits/chosen": -1.4019700288772583, "logits/rejected": -0.750403881072998, "logps/chosen": -1.2429605722427368, "logps/rejected": -1.03725266456604, "loss": 1.3308, "odds_ratio_loss": 0.8781029582023621, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.12429605424404144, "rewards/margins": -0.02057078294456005, "rewards/rejected": -0.10372526943683624, "sft_loss": 1.2429605722427368, "step": 710 }, { "epoch": 0.06, "grad_norm": 5.3482160568237305, "learning_rate": 9.942746967893695e-06, "logits/chosen": -1.3286592960357666, "logits/rejected": -1.1365609169006348, "logps/chosen": -1.1098437309265137, "logps/rejected": -1.0078884363174438, "loss": 1.1921, "odds_ratio_loss": 0.8228921890258789, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11098437011241913, "rewards/margins": -0.010195528157055378, "rewards/rejected": -0.10078884661197662, "sft_loss": 1.1098437309265137, "step": 715 }, { "epoch": 0.06, "grad_norm": 17.593463897705078, "learning_rate": 9.94181405709232e-06, "logits/chosen": -1.448482871055603, "logits/rejected": -0.9955105781555176, "logps/chosen": -0.7879932522773743, "logps/rejected": -1.9978437423706055, "loss": 0.8648, "odds_ratio_loss": 0.767823338508606, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.07879932224750519, "rewards/margins": 0.12098504602909088, "rewards/rejected": -0.19978436827659607, "sft_loss": 0.7879932522773743, "step": 720 }, { "epoch": 0.06, "grad_norm": 6.694170951843262, "learning_rate": 9.94087365140249e-06, "logits/chosen": -1.31051504611969, "logits/rejected": -1.086120367050171, "logps/chosen": -1.2709503173828125, "logps/rejected": -0.9000130891799927, "loss": 1.3821, "odds_ratio_loss": 1.1116914749145508, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.1270950436592102, "rewards/margins": -0.03709372878074646, "rewards/rejected": -0.09000130742788315, "sft_loss": 1.2709503173828125, "step": 725 }, { "epoch": 0.06, "grad_norm": 39.7230110168457, "learning_rate": 9.93992575225045e-06, "logits/chosen": -1.38053297996521, "logits/rejected": -1.010858178138733, "logps/chosen": -1.0346620082855225, "logps/rejected": -1.122179627418518, "loss": 1.1033, "odds_ratio_loss": 0.6862468123435974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10346619784832001, "rewards/margins": 0.00875175278633833, "rewards/rejected": -0.11221794784069061, "sft_loss": 1.0346620082855225, "step": 730 }, { "epoch": 0.06, "grad_norm": 25.078582763671875, "learning_rate": 9.93897036107381e-06, "logits/chosen": -1.3918646574020386, "logits/rejected": -1.1127347946166992, "logps/chosen": -0.9163225293159485, "logps/rejected": -3.5592429637908936, "loss": 0.9362, "odds_ratio_loss": 0.19927331805229187, "rewards/accuracies": 1.0, "rewards/chosen": -0.09163224697113037, "rewards/margins": 0.26429206132888794, "rewards/rejected": -0.3559243083000183, "sft_loss": 0.9163225293159485, "step": 735 }, { "epoch": 0.06, "grad_norm": 18.3801212310791, "learning_rate": 9.938007479321541e-06, "logits/chosen": -1.1464792490005493, "logits/rejected": -0.6045576930046082, "logps/chosen": -0.9576179385185242, "logps/rejected": -1.123015284538269, "loss": 1.0146, "odds_ratio_loss": 0.5698369145393372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09576179087162018, "rewards/margins": 0.016539743170142174, "rewards/rejected": -0.1123015433549881, "sft_loss": 0.9576179385185242, "step": 740 }, { "epoch": 0.06, "grad_norm": 6.056693077087402, "learning_rate": 9.937037108453974e-06, "logits/chosen": -1.2240662574768066, "logits/rejected": -0.912168025970459, "logps/chosen": -0.9915501475334167, "logps/rejected": -1.0677486658096313, "loss": 1.0657, "odds_ratio_loss": 0.741681694984436, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09915502369403839, "rewards/margins": 0.0076198456808924675, "rewards/rejected": -0.10677486658096313, "sft_loss": 0.9915501475334167, "step": 745 }, { "epoch": 0.06, "grad_norm": 5.033421039581299, "learning_rate": 9.936059249942805e-06, "logits/chosen": -1.3071390390396118, "logits/rejected": -0.8568423986434937, "logps/chosen": -0.8234881162643433, "logps/rejected": -1.3216912746429443, "loss": 0.8871, "odds_ratio_loss": 0.6363648772239685, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08234880864620209, "rewards/margins": 0.04982032626867294, "rewards/rejected": -0.13216914236545563, "sft_loss": 0.8234881162643433, "step": 750 }, { "epoch": 0.06, "grad_norm": 6.3838701248168945, "learning_rate": 9.935073905271074e-06, "logits/chosen": -1.204453468322754, "logits/rejected": -0.6676325798034668, "logps/chosen": -1.0909837484359741, "logps/rejected": -1.807064414024353, "loss": 1.149, "odds_ratio_loss": 0.5803178548812866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1090983897447586, "rewards/margins": 0.0716080591082573, "rewards/rejected": -0.1807064414024353, "sft_loss": 1.0909837484359741, "step": 755 }, { "epoch": 0.06, "grad_norm": 11.226985931396484, "learning_rate": 9.934081075933187e-06, "logits/chosen": -1.3518896102905273, "logits/rejected": -0.982338547706604, "logps/chosen": -1.2844316959381104, "logps/rejected": -1.6078770160675049, "loss": 1.365, "odds_ratio_loss": 0.8052700161933899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1284431666135788, "rewards/margins": 0.03234453126788139, "rewards/rejected": -0.1607877016067505, "sft_loss": 1.2844316959381104, "step": 760 }, { "epoch": 0.06, "grad_norm": 31.29439353942871, "learning_rate": 9.93308076343489e-06, "logits/chosen": -1.3497084379196167, "logits/rejected": -0.9264364242553711, "logps/chosen": -1.1290289163589478, "logps/rejected": -1.3911527395248413, "loss": 1.1867, "odds_ratio_loss": 0.576228141784668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11290287971496582, "rewards/margins": 0.026212383061647415, "rewards/rejected": -0.13911525905132294, "sft_loss": 1.1290289163589478, "step": 765 }, { "epoch": 0.06, "grad_norm": 10.17575740814209, "learning_rate": 9.932072969293288e-06, "logits/chosen": -1.3262922763824463, "logits/rejected": -1.022761344909668, "logps/chosen": -0.9707845449447632, "logps/rejected": -1.6581337451934814, "loss": 1.0134, "odds_ratio_loss": 0.4258590340614319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09707845747470856, "rewards/margins": 0.06873490661382675, "rewards/rejected": -0.1658133566379547, "sft_loss": 0.9707845449447632, "step": 770 }, { "epoch": 0.06, "grad_norm": 14.091787338256836, "learning_rate": 9.931057695036828e-06, "logits/chosen": -1.356696367263794, "logits/rejected": -1.0855176448822021, "logps/chosen": -1.1384265422821045, "logps/rejected": -0.9075587391853333, "loss": 1.2365, "odds_ratio_loss": 0.9805895686149597, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11384264379739761, "rewards/margins": -0.023086776956915855, "rewards/rejected": -0.09075586497783661, "sft_loss": 1.1384265422821045, "step": 775 }, { "epoch": 0.06, "grad_norm": 8.655046463012695, "learning_rate": 9.930034942205303e-06, "logits/chosen": -1.3109716176986694, "logits/rejected": -0.816183865070343, "logps/chosen": -0.8931499719619751, "logps/rejected": -1.218641996383667, "loss": 0.9495, "odds_ratio_loss": 0.563424825668335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08931499719619751, "rewards/margins": 0.03254919499158859, "rewards/rejected": -0.1218641996383667, "sft_loss": 0.8931499719619751, "step": 780 }, { "epoch": 0.06, "grad_norm": 5.672267436981201, "learning_rate": 9.929004712349844e-06, "logits/chosen": -1.364993929862976, "logits/rejected": -1.0111385583877563, "logps/chosen": -1.0240195989608765, "logps/rejected": -1.448655605316162, "loss": 1.0751, "odds_ratio_loss": 0.510697066783905, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1024019718170166, "rewards/margins": 0.04246360436081886, "rewards/rejected": -0.14486555755138397, "sft_loss": 1.0240195989608765, "step": 785 }, { "epoch": 0.06, "grad_norm": 8.213885307312012, "learning_rate": 9.92796700703293e-06, "logits/chosen": -1.2081053256988525, "logits/rejected": -1.3082879781723022, "logps/chosen": -0.8951700329780579, "logps/rejected": -1.3031384944915771, "loss": 0.9444, "odds_ratio_loss": 0.49256768822669983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08951699733734131, "rewards/margins": 0.04079686850309372, "rewards/rejected": -0.13031387329101562, "sft_loss": 0.8951700329780579, "step": 790 }, { "epoch": 0.06, "grad_norm": 9.180521965026855, "learning_rate": 9.926921827828368e-06, "logits/chosen": -1.3431509733200073, "logits/rejected": -1.1413524150848389, "logps/chosen": -1.030499815940857, "logps/rejected": -0.9735819697380066, "loss": 1.1098, "odds_ratio_loss": 0.7934376001358032, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10304999351501465, "rewards/margins": -0.00569178769364953, "rewards/rejected": -0.09735819697380066, "sft_loss": 1.030499815940857, "step": 795 }, { "epoch": 0.06, "grad_norm": 21.196258544921875, "learning_rate": 9.92586917632131e-06, "logits/chosen": -1.408015251159668, "logits/rejected": -1.2530816793441772, "logps/chosen": -0.9153121709823608, "logps/rejected": -1.3999773263931274, "loss": 0.9899, "odds_ratio_loss": 0.745948314666748, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09153122454881668, "rewards/margins": 0.0484665222465992, "rewards/rejected": -0.13999773561954498, "sft_loss": 0.9153121709823608, "step": 800 }, { "epoch": 0.06, "grad_norm": 16.20098114013672, "learning_rate": 9.924809054108232e-06, "logits/chosen": -1.193213939666748, "logits/rejected": -1.042197585105896, "logps/chosen": -0.8584068417549133, "logps/rejected": -0.8795690536499023, "loss": 0.9491, "odds_ratio_loss": 0.9071296453475952, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08584068715572357, "rewards/margins": 0.0021162242628633976, "rewards/rejected": -0.08795692026615143, "sft_loss": 0.8584068417549133, "step": 805 }, { "epoch": 0.06, "grad_norm": 26.68222427368164, "learning_rate": 9.923741462796947e-06, "logits/chosen": -1.0805448293685913, "logits/rejected": -0.7028868794441223, "logps/chosen": -1.1113938093185425, "logps/rejected": -1.3654364347457886, "loss": 1.179, "odds_ratio_loss": 0.676261305809021, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11113937944173813, "rewards/margins": 0.02540426328778267, "rewards/rejected": -0.1365436464548111, "sft_loss": 1.1113938093185425, "step": 810 }, { "epoch": 0.06, "grad_norm": 6.551802635192871, "learning_rate": 9.922666404006592e-06, "logits/chosen": -1.418948769569397, "logits/rejected": -0.7345365881919861, "logps/chosen": -0.9354351162910461, "logps/rejected": -1.849973440170288, "loss": 0.9887, "odds_ratio_loss": 0.5327891111373901, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09354351460933685, "rewards/margins": 0.09145383536815643, "rewards/rejected": -0.1849973499774933, "sft_loss": 0.9354351162910461, "step": 815 }, { "epoch": 0.06, "grad_norm": 6.564281940460205, "learning_rate": 9.921583879367627e-06, "logits/chosen": -1.2443413734436035, "logits/rejected": -1.0016214847564697, "logps/chosen": -1.3639390468597412, "logps/rejected": -1.252862572669983, "loss": 1.4527, "odds_ratio_loss": 0.887839674949646, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13639390468597412, "rewards/margins": -0.011107642203569412, "rewards/rejected": -0.1252862513065338, "sft_loss": 1.3639390468597412, "step": 820 }, { "epoch": 0.06, "grad_norm": 10.34186840057373, "learning_rate": 9.920493890521842e-06, "logits/chosen": -1.3167495727539062, "logits/rejected": -0.8088476061820984, "logps/chosen": -1.0269895792007446, "logps/rejected": -1.8802833557128906, "loss": 1.0725, "odds_ratio_loss": 0.45481786131858826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10269895941019058, "rewards/margins": 0.08532937616109848, "rewards/rejected": -0.18802833557128906, "sft_loss": 1.0269895792007446, "step": 825 }, { "epoch": 0.06, "grad_norm": 7.909381866455078, "learning_rate": 9.91939643912234e-06, "logits/chosen": -1.3650422096252441, "logits/rejected": -0.854724109172821, "logps/chosen": -0.98639976978302, "logps/rejected": -1.1779707670211792, "loss": 1.0439, "odds_ratio_loss": 0.5750349164009094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09863997995853424, "rewards/margins": 0.019157104194164276, "rewards/rejected": -0.11779709160327911, "sft_loss": 0.98639976978302, "step": 830 }, { "epoch": 0.06, "grad_norm": 63.97218704223633, "learning_rate": 9.918291526833548e-06, "logits/chosen": -1.4302396774291992, "logits/rejected": -0.8879743814468384, "logps/chosen": -1.32662034034729, "logps/rejected": -2.953354597091675, "loss": 1.3797, "odds_ratio_loss": 0.5310118198394775, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13266204297542572, "rewards/margins": 0.16267342865467072, "rewards/rejected": -0.29533547163009644, "sft_loss": 1.32662034034729, "step": 835 }, { "epoch": 0.07, "grad_norm": 7.25938081741333, "learning_rate": 9.917179155331206e-06, "logits/chosen": -1.1007791757583618, "logits/rejected": -0.7932504415512085, "logps/chosen": -1.3327864408493042, "logps/rejected": -1.0754101276397705, "loss": 1.4221, "odds_ratio_loss": 0.8930914998054504, "rewards/accuracies": 0.0, "rewards/chosen": -0.13327865302562714, "rewards/margins": -0.025737643241882324, "rewards/rejected": -0.10754100978374481, "sft_loss": 1.3327864408493042, "step": 840 }, { "epoch": 0.07, "grad_norm": 11.274470329284668, "learning_rate": 9.916059326302364e-06, "logits/chosen": -1.3314870595932007, "logits/rejected": -0.8154805302619934, "logps/chosen": -1.0468391180038452, "logps/rejected": -1.227414608001709, "loss": 1.1067, "odds_ratio_loss": 0.5981887578964233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10468391329050064, "rewards/margins": 0.01805754378437996, "rewards/rejected": -0.1227414458990097, "sft_loss": 1.0468391180038452, "step": 845 }, { "epoch": 0.07, "grad_norm": 12.382113456726074, "learning_rate": 9.914932041445386e-06, "logits/chosen": -1.3826451301574707, "logits/rejected": -1.1742017269134521, "logps/chosen": -1.1019704341888428, "logps/rejected": -3.996877670288086, "loss": 1.1651, "odds_ratio_loss": 0.6314796209335327, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1101970449090004, "rewards/margins": 0.2894907593727112, "rewards/rejected": -0.399687796831131, "sft_loss": 1.1019704341888428, "step": 850 }, { "epoch": 0.07, "grad_norm": 9.66358757019043, "learning_rate": 9.913797302469944e-06, "logits/chosen": -1.204119324684143, "logits/rejected": -1.0197612047195435, "logps/chosen": -0.855130672454834, "logps/rejected": -0.7614967823028564, "loss": 0.9439, "odds_ratio_loss": 0.888095498085022, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08551307767629623, "rewards/margins": -0.009363390505313873, "rewards/rejected": -0.07614968717098236, "sft_loss": 0.855130672454834, "step": 855 }, { "epoch": 0.07, "grad_norm": 5.658473968505859, "learning_rate": 9.912655111097014e-06, "logits/chosen": -1.2263609170913696, "logits/rejected": -0.691985011100769, "logps/chosen": -1.2378971576690674, "logps/rejected": -1.0723927021026611, "loss": 1.3234, "odds_ratio_loss": 0.8548486828804016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1237897127866745, "rewards/margins": -0.016550447791814804, "rewards/rejected": -0.1072392612695694, "sft_loss": 1.2378971576690674, "step": 860 }, { "epoch": 0.07, "grad_norm": 5.982890605926514, "learning_rate": 9.911505469058872e-06, "logits/chosen": -1.2504374980926514, "logits/rejected": -1.004959225654602, "logps/chosen": -1.328712821006775, "logps/rejected": -1.461451530456543, "loss": 1.4051, "odds_ratio_loss": 0.7639222145080566, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.13287129998207092, "rewards/margins": 0.013273855671286583, "rewards/rejected": -0.14614513516426086, "sft_loss": 1.328712821006775, "step": 865 }, { "epoch": 0.07, "grad_norm": 5.529323101043701, "learning_rate": 9.910348378099098e-06, "logits/chosen": -1.312565565109253, "logits/rejected": -0.8698539733886719, "logps/chosen": -1.003717064857483, "logps/rejected": -1.205470085144043, "loss": 1.0766, "odds_ratio_loss": 0.7283679246902466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10037169605493546, "rewards/margins": 0.020175311714410782, "rewards/rejected": -0.12054701149463654, "sft_loss": 1.003717064857483, "step": 870 }, { "epoch": 0.07, "grad_norm": 94.74681854248047, "learning_rate": 9.909183839972565e-06, "logits/chosen": -1.375828742980957, "logits/rejected": -1.1545583009719849, "logps/chosen": -1.1318230628967285, "logps/rejected": -3.6160807609558105, "loss": 1.1897, "odds_ratio_loss": 0.5786797404289246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11318230628967285, "rewards/margins": 0.24842575192451477, "rewards/rejected": -0.36160808801651, "sft_loss": 1.1318230628967285, "step": 875 }, { "epoch": 0.07, "grad_norm": 21.118417739868164, "learning_rate": 9.908011856445444e-06, "logits/chosen": -1.4701459407806396, "logits/rejected": -1.0760300159454346, "logps/chosen": -1.4192107915878296, "logps/rejected": -1.8034175634384155, "loss": 1.4791, "odds_ratio_loss": 0.598541796207428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14192108809947968, "rewards/margins": 0.03842068463563919, "rewards/rejected": -0.18034176528453827, "sft_loss": 1.4192107915878296, "step": 880 }, { "epoch": 0.07, "grad_norm": 10.34803295135498, "learning_rate": 9.906832429295199e-06, "logits/chosen": -1.2245054244995117, "logits/rejected": -0.9822576642036438, "logps/chosen": -0.933951199054718, "logps/rejected": -1.1507595777511597, "loss": 0.9959, "odds_ratio_loss": 0.619904100894928, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09339512884616852, "rewards/margins": 0.021680831909179688, "rewards/rejected": -0.1150759607553482, "sft_loss": 0.933951199054718, "step": 885 }, { "epoch": 0.07, "grad_norm": 17.897947311401367, "learning_rate": 9.905645560310577e-06, "logits/chosen": -1.383840799331665, "logits/rejected": -1.3219610452651978, "logps/chosen": -0.9768409729003906, "logps/rejected": -1.3957569599151611, "loss": 1.0512, "odds_ratio_loss": 0.7432494759559631, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0976840928196907, "rewards/margins": 0.04189159721136093, "rewards/rejected": -0.13957569003105164, "sft_loss": 0.9768409729003906, "step": 890 }, { "epoch": 0.07, "grad_norm": 9.26870059967041, "learning_rate": 9.90445125129162e-06, "logits/chosen": -1.4521198272705078, "logits/rejected": -1.1943159103393555, "logps/chosen": -1.0366110801696777, "logps/rejected": -1.3479697704315186, "loss": 1.0942, "odds_ratio_loss": 0.5757991075515747, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10366110503673553, "rewards/margins": 0.03113587573170662, "rewards/rejected": -0.13479699194431305, "sft_loss": 1.0366110801696777, "step": 895 }, { "epoch": 0.07, "grad_norm": 6.55729341506958, "learning_rate": 9.903249504049645e-06, "logits/chosen": -1.2487952709197998, "logits/rejected": -0.6370812654495239, "logps/chosen": -0.9395875930786133, "logps/rejected": -0.9449079632759094, "loss": 1.016, "odds_ratio_loss": 0.7639524340629578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09395875781774521, "rewards/margins": 0.0005320362979546189, "rewards/rejected": -0.09449080377817154, "sft_loss": 0.9395875930786133, "step": 900 }, { "epoch": 0.07, "grad_norm": 45.118629455566406, "learning_rate": 9.902040320407258e-06, "logits/chosen": -1.2511450052261353, "logits/rejected": -0.8519547581672668, "logps/chosen": -1.1160920858383179, "logps/rejected": -1.2400343418121338, "loss": 1.2332, "odds_ratio_loss": 1.171097755432129, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.11160922050476074, "rewards/margins": 0.012394214980304241, "rewards/rejected": -0.12400342524051666, "sft_loss": 1.1160920858383179, "step": 905 }, { "epoch": 0.07, "grad_norm": 5.698209285736084, "learning_rate": 9.900823702198338e-06, "logits/chosen": -1.47048020362854, "logits/rejected": -0.7404271364212036, "logps/chosen": -1.0461819171905518, "logps/rejected": -1.5807592868804932, "loss": 1.0939, "odds_ratio_loss": 0.477081835269928, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10461819171905518, "rewards/margins": 0.053457729518413544, "rewards/rejected": -0.15807592868804932, "sft_loss": 1.0461819171905518, "step": 910 }, { "epoch": 0.07, "grad_norm": 9.900472640991211, "learning_rate": 9.899599651268039e-06, "logits/chosen": -1.3266518115997314, "logits/rejected": -1.1770989894866943, "logps/chosen": -1.1734721660614014, "logps/rejected": -2.0207362174987793, "loss": 1.2137, "odds_ratio_loss": 0.40272313356399536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11734724044799805, "rewards/margins": 0.08472639322280884, "rewards/rejected": -0.2020736187696457, "sft_loss": 1.1734721660614014, "step": 915 }, { "epoch": 0.07, "grad_norm": 14.617350578308105, "learning_rate": 9.898368169472794e-06, "logits/chosen": -1.3530217409133911, "logits/rejected": -0.8183524012565613, "logps/chosen": -0.8375462293624878, "logps/rejected": -2.5914151668548584, "loss": 0.8782, "odds_ratio_loss": 0.4067561626434326, "rewards/accuracies": 1.0, "rewards/chosen": -0.08375462144613266, "rewards/margins": 0.17538690567016602, "rewards/rejected": -0.2591415345668793, "sft_loss": 0.8375462293624878, "step": 920 }, { "epoch": 0.07, "grad_norm": 38.70340347290039, "learning_rate": 9.897129258680298e-06, "logits/chosen": -1.382263422012329, "logits/rejected": -1.0352758169174194, "logps/chosen": -0.6595112681388855, "logps/rejected": -2.8338589668273926, "loss": 0.6854, "odds_ratio_loss": 0.2593601644039154, "rewards/accuracies": 1.0, "rewards/chosen": -0.06595112383365631, "rewards/margins": 0.21743479371070862, "rewards/rejected": -0.28338590264320374, "sft_loss": 0.6595112681388855, "step": 925 }, { "epoch": 0.07, "grad_norm": 30.268157958984375, "learning_rate": 9.895882920769515e-06, "logits/chosen": -1.3694220781326294, "logits/rejected": -1.1358081102371216, "logps/chosen": -1.0219380855560303, "logps/rejected": -0.8565353155136108, "loss": 1.1132, "odds_ratio_loss": 0.9122053980827332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10219381004571915, "rewards/margins": -0.016540277749300003, "rewards/rejected": -0.08565352857112885, "sft_loss": 1.0219380855560303, "step": 930 }, { "epoch": 0.07, "grad_norm": 8.992692947387695, "learning_rate": 9.89462915763068e-06, "logits/chosen": -1.2513294219970703, "logits/rejected": -0.9854547381401062, "logps/chosen": -0.9552785754203796, "logps/rejected": -1.058593511581421, "loss": 1.0204, "odds_ratio_loss": 0.6511629223823547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09552786499261856, "rewards/margins": 0.010331504046916962, "rewards/rejected": -0.10585936158895493, "sft_loss": 0.9552785754203796, "step": 935 }, { "epoch": 0.07, "grad_norm": 12.573531150817871, "learning_rate": 9.893367971165279e-06, "logits/chosen": -1.388832688331604, "logits/rejected": -0.8999320864677429, "logps/chosen": -1.093196988105774, "logps/rejected": -1.4769212007522583, "loss": 1.1543, "odds_ratio_loss": 0.6115171313285828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10931968688964844, "rewards/margins": 0.03837241604924202, "rewards/rejected": -0.14769211411476135, "sft_loss": 1.093196988105774, "step": 940 }, { "epoch": 0.07, "grad_norm": 23.18633460998535, "learning_rate": 9.892099363286065e-06, "logits/chosen": -1.479446530342102, "logits/rejected": -1.034317135810852, "logps/chosen": -1.3559527397155762, "logps/rejected": -1.4490649700164795, "loss": 1.453, "odds_ratio_loss": 0.9702537655830383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13559527695178986, "rewards/margins": 0.009311218746006489, "rewards/rejected": -0.14490649104118347, "sft_loss": 1.3559527397155762, "step": 945 }, { "epoch": 0.07, "grad_norm": 5.014403343200684, "learning_rate": 9.890823335917041e-06, "logits/chosen": -1.2923214435577393, "logits/rejected": -0.7891890406608582, "logps/chosen": -1.1381855010986328, "logps/rejected": -1.3333497047424316, "loss": 1.2146, "odds_ratio_loss": 0.7636581659317017, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11381856352090836, "rewards/margins": 0.019516412168741226, "rewards/rejected": -0.1333349645137787, "sft_loss": 1.1381855010986328, "step": 950 }, { "epoch": 0.07, "grad_norm": 5.407112121582031, "learning_rate": 9.889539890993467e-06, "logits/chosen": -1.3674460649490356, "logits/rejected": -0.7717936635017395, "logps/chosen": -1.107033371925354, "logps/rejected": -2.2579007148742676, "loss": 1.1523, "odds_ratio_loss": 0.45254993438720703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11070333421230316, "rewards/margins": 0.11508671194314957, "rewards/rejected": -0.22579005360603333, "sft_loss": 1.107033371925354, "step": 955 }, { "epoch": 0.07, "grad_norm": 22.41192054748535, "learning_rate": 9.888249030461845e-06, "logits/chosen": -1.3265749216079712, "logits/rejected": -0.8742873072624207, "logps/chosen": -0.7914237976074219, "logps/rejected": -2.211487293243408, "loss": 0.8814, "odds_ratio_loss": 0.9001585245132446, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07914238423109055, "rewards/margins": 0.14200636744499207, "rewards/rejected": -0.22114872932434082, "sft_loss": 0.7914237976074219, "step": 960 }, { "epoch": 0.08, "grad_norm": 51.93904495239258, "learning_rate": 9.886950756279933e-06, "logits/chosen": -1.2333533763885498, "logits/rejected": -0.6385300159454346, "logps/chosen": -1.252501368522644, "logps/rejected": -1.7653671503067017, "loss": 1.3013, "odds_ratio_loss": 0.48805102705955505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12525013089179993, "rewards/margins": 0.051286570727825165, "rewards/rejected": -0.1765367090702057, "sft_loss": 1.252501368522644, "step": 965 }, { "epoch": 0.08, "grad_norm": 16.045190811157227, "learning_rate": 9.885645070416728e-06, "logits/chosen": -1.3813869953155518, "logits/rejected": -1.1527819633483887, "logps/chosen": -0.8400813937187195, "logps/rejected": -1.084443211555481, "loss": 0.8877, "odds_ratio_loss": 0.47596150636672974, "rewards/accuracies": 1.0, "rewards/chosen": -0.08400814235210419, "rewards/margins": 0.024436186999082565, "rewards/rejected": -0.10844433307647705, "sft_loss": 0.8400813937187195, "step": 970 }, { "epoch": 0.08, "grad_norm": 14.392130851745605, "learning_rate": 9.884331974852468e-06, "logits/chosen": -1.3196592330932617, "logits/rejected": -1.043312430381775, "logps/chosen": -1.1383229494094849, "logps/rejected": -0.9930847883224487, "loss": 1.2273, "odds_ratio_loss": 0.8899247050285339, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11383228003978729, "rewards/margins": -0.014523811638355255, "rewards/rejected": -0.09930847585201263, "sft_loss": 1.1383229494094849, "step": 975 }, { "epoch": 0.08, "grad_norm": 15.830513954162598, "learning_rate": 9.88301147157863e-06, "logits/chosen": -1.24079167842865, "logits/rejected": -1.3131943941116333, "logps/chosen": -0.9807993769645691, "logps/rejected": -1.419396996498108, "loss": 1.0474, "odds_ratio_loss": 0.6656183004379272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09807994216680527, "rewards/margins": 0.04385975003242493, "rewards/rejected": -0.1419396847486496, "sft_loss": 0.9807993769645691, "step": 980 }, { "epoch": 0.08, "grad_norm": 11.744377136230469, "learning_rate": 9.881683562597924e-06, "logits/chosen": -1.2285785675048828, "logits/rejected": -1.3326553106307983, "logps/chosen": -0.7297677397727966, "logps/rejected": -1.0131244659423828, "loss": 0.7885, "odds_ratio_loss": 0.5870744585990906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07297678291797638, "rewards/margins": 0.02833567187190056, "rewards/rejected": -0.10131244361400604, "sft_loss": 0.7297677397727966, "step": 985 }, { "epoch": 0.08, "grad_norm": 4.617860794067383, "learning_rate": 9.88034824992429e-06, "logits/chosen": -1.4039509296417236, "logits/rejected": -0.8233305215835571, "logps/chosen": -1.162663221359253, "logps/rejected": -2.416837692260742, "loss": 1.1918, "odds_ratio_loss": 0.29158255457878113, "rewards/accuracies": 1.0, "rewards/chosen": -0.11626632511615753, "rewards/margins": 0.12541747093200684, "rewards/rejected": -0.24168379604816437, "sft_loss": 1.162663221359253, "step": 990 }, { "epoch": 0.08, "grad_norm": 9.872160911560059, "learning_rate": 9.879005535582904e-06, "logits/chosen": -1.3318212032318115, "logits/rejected": -1.442276954650879, "logps/chosen": -1.0921529531478882, "logps/rejected": -1.2366979122161865, "loss": 1.1696, "odds_ratio_loss": 0.7744948267936707, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10921530425548553, "rewards/margins": 0.014454501681029797, "rewards/rejected": -0.1236698180437088, "sft_loss": 1.0921529531478882, "step": 995 }, { "epoch": 0.08, "grad_norm": 6.297494411468506, "learning_rate": 9.87765542161016e-06, "logits/chosen": -1.530379056930542, "logits/rejected": -0.9569048881530762, "logps/chosen": -0.9592668414115906, "logps/rejected": -1.179861307144165, "loss": 1.0172, "odds_ratio_loss": 0.579362690448761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0959266871213913, "rewards/margins": 0.02205944061279297, "rewards/rejected": -0.11798612773418427, "sft_loss": 0.9592668414115906, "step": 1000 }, { "epoch": 0.08, "grad_norm": 8.824081420898438, "learning_rate": 9.876297910053678e-06, "logits/chosen": -1.4260704517364502, "logits/rejected": -0.9216135740280151, "logps/chosen": -0.927899956703186, "logps/rejected": -1.1985846757888794, "loss": 0.98, "odds_ratio_loss": 0.5209888219833374, "rewards/accuracies": 1.0, "rewards/chosen": -0.09278999269008636, "rewards/margins": 0.027068469673395157, "rewards/rejected": -0.11985846608877182, "sft_loss": 0.927899956703186, "step": 1005 }, { "epoch": 0.08, "grad_norm": 34.00803756713867, "learning_rate": 9.874933002972297e-06, "logits/chosen": -1.4401054382324219, "logits/rejected": -0.8292428851127625, "logps/chosen": -0.8494836688041687, "logps/rejected": -1.0477453470230103, "loss": 0.9141, "odds_ratio_loss": 0.6464778184890747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08494836837053299, "rewards/margins": 0.019826162606477737, "rewards/rejected": -0.10477451980113983, "sft_loss": 0.8494836688041687, "step": 1010 }, { "epoch": 0.08, "grad_norm": 5.185765743255615, "learning_rate": 9.873560702436072e-06, "logits/chosen": -1.36220383644104, "logits/rejected": -0.8890962600708008, "logps/chosen": -1.1309592723846436, "logps/rejected": -2.1118993759155273, "loss": 1.1752, "odds_ratio_loss": 0.44284600019454956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11309593915939331, "rewards/margins": 0.09809400886297226, "rewards/rejected": -0.21118994057178497, "sft_loss": 1.1309592723846436, "step": 1015 }, { "epoch": 0.08, "grad_norm": 31.76227378845215, "learning_rate": 9.87218101052627e-06, "logits/chosen": -1.484521508216858, "logits/rejected": -1.0151021480560303, "logps/chosen": -1.1594613790512085, "logps/rejected": -1.8886810541152954, "loss": 1.2184, "odds_ratio_loss": 0.5893402099609375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11594613641500473, "rewards/margins": 0.07292197644710541, "rewards/rejected": -0.18886812031269073, "sft_loss": 1.1594613790512085, "step": 1020 }, { "epoch": 0.08, "grad_norm": 22.267271041870117, "learning_rate": 9.870793929335367e-06, "logits/chosen": -1.544433832168579, "logits/rejected": -1.0945574045181274, "logps/chosen": -0.7931126356124878, "logps/rejected": -1.0429248809814453, "loss": 0.8478, "odds_ratio_loss": 0.5466041564941406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07931126654148102, "rewards/margins": 0.024981223046779633, "rewards/rejected": -0.10429248958826065, "sft_loss": 0.7931126356124878, "step": 1025 }, { "epoch": 0.08, "grad_norm": 6.590450286865234, "learning_rate": 9.869399460967052e-06, "logits/chosen": -1.3002541065216064, "logits/rejected": -1.009333610534668, "logps/chosen": -1.110409140586853, "logps/rejected": -1.0610884428024292, "loss": 1.1878, "odds_ratio_loss": 0.7737922668457031, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11104090511798859, "rewards/margins": -0.004932059440761805, "rewards/rejected": -0.10610884428024292, "sft_loss": 1.110409140586853, "step": 1030 }, { "epoch": 0.08, "grad_norm": 8.192142486572266, "learning_rate": 9.867997607536212e-06, "logits/chosen": -1.3670454025268555, "logits/rejected": -0.8192941546440125, "logps/chosen": -1.0318758487701416, "logps/rejected": -0.930589497089386, "loss": 1.1152, "odds_ratio_loss": 0.8337259292602539, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10318758338689804, "rewards/margins": -0.010128635913133621, "rewards/rejected": -0.09305894374847412, "sft_loss": 1.0318758487701416, "step": 1035 }, { "epoch": 0.08, "grad_norm": 5.612542152404785, "learning_rate": 9.866588371168935e-06, "logits/chosen": -1.2067945003509521, "logits/rejected": -0.731787383556366, "logps/chosen": -0.8820972442626953, "logps/rejected": -0.7037122845649719, "loss": 0.9745, "odds_ratio_loss": 0.9238243103027344, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08820972591638565, "rewards/margins": -0.017838502302765846, "rewards/rejected": -0.07037122547626495, "sft_loss": 0.8820972442626953, "step": 1040 }, { "epoch": 0.08, "grad_norm": 6.599497318267822, "learning_rate": 9.865171754002505e-06, "logits/chosen": -1.2855665683746338, "logits/rejected": -0.6700264811515808, "logps/chosen": -0.7599323987960815, "logps/rejected": -1.3745027780532837, "loss": 0.7921, "odds_ratio_loss": 0.3221582770347595, "rewards/accuracies": 1.0, "rewards/chosen": -0.07599325478076935, "rewards/margins": 0.06145703047513962, "rewards/rejected": -0.13745027780532837, "sft_loss": 0.7599323987960815, "step": 1045 }, { "epoch": 0.08, "grad_norm": 141.1793212890625, "learning_rate": 9.863747758185405e-06, "logits/chosen": -1.0344316959381104, "logits/rejected": -0.966625988483429, "logps/chosen": -1.0074121952056885, "logps/rejected": -1.2289427518844604, "loss": 1.0665, "odds_ratio_loss": 0.5912154912948608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10074122250080109, "rewards/margins": 0.022153064608573914, "rewards/rejected": -0.1228942722082138, "sft_loss": 1.0074121952056885, "step": 1050 }, { "epoch": 0.08, "grad_norm": 5.1217217445373535, "learning_rate": 9.862316385877305e-06, "logits/chosen": -1.3833470344543457, "logits/rejected": -0.8735666275024414, "logps/chosen": -0.799805760383606, "logps/rejected": -3.1053359508514404, "loss": 0.8412, "odds_ratio_loss": 0.41374388337135315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07998057454824448, "rewards/margins": 0.2305530607700348, "rewards/rejected": -0.31053364276885986, "sft_loss": 0.799805760383606, "step": 1055 }, { "epoch": 0.08, "grad_norm": 9.84019660949707, "learning_rate": 9.860877639249063e-06, "logits/chosen": -1.3688457012176514, "logits/rejected": -0.9568573832511902, "logps/chosen": -1.2854890823364258, "logps/rejected": -1.8219703435897827, "loss": 1.3305, "odds_ratio_loss": 0.4498967230319977, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12854890525341034, "rewards/margins": 0.05364813655614853, "rewards/rejected": -0.18219704926013947, "sft_loss": 1.2854890823364258, "step": 1060 }, { "epoch": 0.08, "grad_norm": 29.234161376953125, "learning_rate": 9.859431520482716e-06, "logits/chosen": -1.2526874542236328, "logits/rejected": -0.9805940389633179, "logps/chosen": -0.9354702234268188, "logps/rejected": -1.7557783126831055, "loss": 0.975, "odds_ratio_loss": 0.3954404294490814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09354700893163681, "rewards/margins": 0.08203083276748657, "rewards/rejected": -0.17557783424854279, "sft_loss": 0.9354702234268188, "step": 1065 }, { "epoch": 0.08, "grad_norm": 5.677125453948975, "learning_rate": 9.857978031771494e-06, "logits/chosen": -1.3821674585342407, "logits/rejected": -0.8005634546279907, "logps/chosen": -0.8023947477340698, "logps/rejected": -0.9100324511528015, "loss": 0.8635, "odds_ratio_loss": 0.6106234192848206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08023947477340698, "rewards/margins": 0.010763769038021564, "rewards/rejected": -0.09100324660539627, "sft_loss": 0.8023947477340698, "step": 1070 }, { "epoch": 0.08, "grad_norm": 7.841827869415283, "learning_rate": 9.856517175319794e-06, "logits/chosen": -1.4441566467285156, "logits/rejected": -1.024996042251587, "logps/chosen": -0.8957603573799133, "logps/rejected": -1.3328006267547607, "loss": 0.9526, "odds_ratio_loss": 0.5685744881629944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08957605063915253, "rewards/margins": 0.04370402172207832, "rewards/rejected": -0.13328006863594055, "sft_loss": 0.8957603573799133, "step": 1075 }, { "epoch": 0.08, "grad_norm": 134.03005981445312, "learning_rate": 9.85504895334319e-06, "logits/chosen": -1.0448005199432373, "logits/rejected": -0.9956331253051758, "logps/chosen": -1.2046611309051514, "logps/rejected": -1.4932407140731812, "loss": 1.2943, "odds_ratio_loss": 0.895898163318634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12046612799167633, "rewards/margins": 0.02885795198380947, "rewards/rejected": -0.14932407438755035, "sft_loss": 1.2046611309051514, "step": 1080 }, { "epoch": 0.08, "grad_norm": 10.544418334960938, "learning_rate": 9.853573368068426e-06, "logits/chosen": -1.3604201078414917, "logits/rejected": -0.9904428720474243, "logps/chosen": -1.3646055459976196, "logps/rejected": -1.1506952047348022, "loss": 1.457, "odds_ratio_loss": 0.9236465692520142, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1364605724811554, "rewards/margins": -0.021391037851572037, "rewards/rejected": -0.11506952345371246, "sft_loss": 1.3646055459976196, "step": 1085 }, { "epoch": 0.08, "grad_norm": 8.134223937988281, "learning_rate": 9.852090421733416e-06, "logits/chosen": -1.2988691329956055, "logits/rejected": -0.8891332745552063, "logps/chosen": -1.0880292654037476, "logps/rejected": -0.9603763818740845, "loss": 1.1707, "odds_ratio_loss": 0.826417088508606, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.108802929520607, "rewards/margins": -0.012765288352966309, "rewards/rejected": -0.09603764116764069, "sft_loss": 1.0880292654037476, "step": 1090 }, { "epoch": 0.09, "grad_norm": 14.613194465637207, "learning_rate": 9.850600116587236e-06, "logits/chosen": -1.2537554502487183, "logits/rejected": -1.1704847812652588, "logps/chosen": -0.8787897229194641, "logps/rejected": -2.0588157176971436, "loss": 0.9807, "odds_ratio_loss": 1.0188149213790894, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08787896484136581, "rewards/margins": 0.11800263077020645, "rewards/rejected": -0.20588159561157227, "sft_loss": 0.8787897229194641, "step": 1095 }, { "epoch": 0.09, "grad_norm": 10.195762634277344, "learning_rate": 9.849102454890122e-06, "logits/chosen": -1.3870155811309814, "logits/rejected": -0.9877394437789917, "logps/chosen": -1.125795602798462, "logps/rejected": -1.0407532453536987, "loss": 1.2027, "odds_ratio_loss": 0.7686118483543396, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.11257956176996231, "rewards/margins": -0.008504234254360199, "rewards/rejected": -0.10407533496618271, "sft_loss": 1.125795602798462, "step": 1100 }, { "epoch": 0.09, "grad_norm": 20.717632293701172, "learning_rate": 9.847597438913471e-06, "logits/chosen": -1.339658498764038, "logits/rejected": -1.0415122509002686, "logps/chosen": -1.1699798107147217, "logps/rejected": -1.8965017795562744, "loss": 1.2401, "odds_ratio_loss": 0.7011226415634155, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11699797958135605, "rewards/margins": 0.0726521909236908, "rewards/rejected": -0.18965016305446625, "sft_loss": 1.1699798107147217, "step": 1105 }, { "epoch": 0.09, "grad_norm": 7.274383068084717, "learning_rate": 9.846085070939829e-06, "logits/chosen": -1.492741346359253, "logits/rejected": -1.3278045654296875, "logps/chosen": -1.2755863666534424, "logps/rejected": -1.2807852029800415, "loss": 1.3526, "odds_ratio_loss": 0.7704776525497437, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1275586187839508, "rewards/margins": 0.0005198910948820412, "rewards/rejected": -0.12807850539684296, "sft_loss": 1.2755863666534424, "step": 1110 }, { "epoch": 0.09, "grad_norm": 10.192612648010254, "learning_rate": 9.844565353262892e-06, "logits/chosen": -1.1960515975952148, "logits/rejected": -0.8686011433601379, "logps/chosen": -0.7996153235435486, "logps/rejected": -1.4538285732269287, "loss": 0.8463, "odds_ratio_loss": 0.4663833677768707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07996153086423874, "rewards/margins": 0.06542132049798965, "rewards/rejected": -0.1453828513622284, "sft_loss": 0.7996153235435486, "step": 1115 }, { "epoch": 0.09, "grad_norm": 9.693170547485352, "learning_rate": 9.843038288187508e-06, "logits/chosen": -1.298099398612976, "logits/rejected": -1.1106765270233154, "logps/chosen": -0.8047016263008118, "logps/rejected": -1.351907730102539, "loss": 0.8499, "odds_ratio_loss": 0.4519086480140686, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08047015964984894, "rewards/margins": 0.05472060292959213, "rewards/rejected": -0.13519077003002167, "sft_loss": 0.8047016263008118, "step": 1120 }, { "epoch": 0.09, "grad_norm": 21.928857803344727, "learning_rate": 9.841503878029663e-06, "logits/chosen": -1.2702182531356812, "logits/rejected": -0.8940436244010925, "logps/chosen": -1.007550835609436, "logps/rejected": -2.1225733757019043, "loss": 1.0861, "odds_ratio_loss": 0.7852200269699097, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10075507313013077, "rewards/margins": 0.11150224506855011, "rewards/rejected": -0.21225731074810028, "sft_loss": 1.007550835609436, "step": 1125 }, { "epoch": 0.09, "grad_norm": 7.138321876525879, "learning_rate": 9.839962125116489e-06, "logits/chosen": -1.3176006078720093, "logits/rejected": -0.6804088354110718, "logps/chosen": -1.1514666080474854, "logps/rejected": -1.2209088802337646, "loss": 1.2284, "odds_ratio_loss": 0.7691280245780945, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11514665931463242, "rewards/margins": 0.006944227032363415, "rewards/rejected": -0.1220908910036087, "sft_loss": 1.1514666080474854, "step": 1130 }, { "epoch": 0.09, "grad_norm": 6.852372169494629, "learning_rate": 9.838413031786242e-06, "logits/chosen": -1.281246304512024, "logits/rejected": -0.9366620182991028, "logps/chosen": -0.9345917701721191, "logps/rejected": -0.9236852526664734, "loss": 1.0247, "odds_ratio_loss": 0.9015239477157593, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09345918148756027, "rewards/margins": -0.001090650213882327, "rewards/rejected": -0.09236852824687958, "sft_loss": 0.9345917701721191, "step": 1135 }, { "epoch": 0.09, "grad_norm": 5.963149547576904, "learning_rate": 9.836856600388327e-06, "logits/chosen": -1.3900423049926758, "logits/rejected": -0.7759448289871216, "logps/chosen": -1.133793592453003, "logps/rejected": -1.5137890577316284, "loss": 1.1933, "odds_ratio_loss": 0.5953725576400757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11337937414646149, "rewards/margins": 0.03799953684210777, "rewards/rejected": -0.15137891471385956, "sft_loss": 1.133793592453003, "step": 1140 }, { "epoch": 0.09, "grad_norm": 22.794923782348633, "learning_rate": 9.835292833283265e-06, "logits/chosen": -1.3036854267120361, "logits/rejected": -1.2684440612792969, "logps/chosen": -0.8737970590591431, "logps/rejected": -2.6831753253936768, "loss": 0.9183, "odds_ratio_loss": 0.4450407922267914, "rewards/accuracies": 1.0, "rewards/chosen": -0.08737970888614655, "rewards/margins": 0.18093781173229218, "rewards/rejected": -0.2683175206184387, "sft_loss": 0.8737970590591431, "step": 1145 }, { "epoch": 0.09, "grad_norm": 29.34295082092285, "learning_rate": 9.833721732842709e-06, "logits/chosen": -1.2447541952133179, "logits/rejected": -1.1019372940063477, "logps/chosen": -0.7875004410743713, "logps/rejected": -0.9674968719482422, "loss": 0.8673, "odds_ratio_loss": 0.7978585958480835, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07875005155801773, "rewards/margins": 0.017999637871980667, "rewards/rejected": -0.0967496782541275, "sft_loss": 0.7875004410743713, "step": 1150 }, { "epoch": 0.09, "grad_norm": 45.776451110839844, "learning_rate": 9.83214330144943e-06, "logits/chosen": -1.4828782081604004, "logits/rejected": -1.3936234712600708, "logps/chosen": -1.1248772144317627, "logps/rejected": -1.4694101810455322, "loss": 1.1788, "odds_ratio_loss": 0.5393758416175842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11248771846294403, "rewards/margins": 0.03445331007242203, "rewards/rejected": -0.14694103598594666, "sft_loss": 1.1248772144317627, "step": 1155 }, { "epoch": 0.09, "grad_norm": 13.452958106994629, "learning_rate": 9.830557541497324e-06, "logits/chosen": -1.2950688600540161, "logits/rejected": -1.086715817451477, "logps/chosen": -0.9902938008308411, "logps/rejected": -1.3207643032073975, "loss": 1.0503, "odds_ratio_loss": 0.6004306674003601, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09902938455343246, "rewards/margins": 0.03304705396294594, "rewards/rejected": -0.1320764124393463, "sft_loss": 0.9902938008308411, "step": 1160 }, { "epoch": 0.09, "grad_norm": 5.851574420928955, "learning_rate": 9.828964455391394e-06, "logits/chosen": -1.3571841716766357, "logits/rejected": -1.0242164134979248, "logps/chosen": -1.282517671585083, "logps/rejected": -1.1523053646087646, "loss": 1.3857, "odds_ratio_loss": 1.0314857959747314, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.12825177609920502, "rewards/margins": -0.013021242804825306, "rewards/rejected": -0.11523053795099258, "sft_loss": 1.282517671585083, "step": 1165 }, { "epoch": 0.09, "grad_norm": 9.247029304504395, "learning_rate": 9.827364045547758e-06, "logits/chosen": -1.2307461500167847, "logits/rejected": -0.9085014462471008, "logps/chosen": -1.0744895935058594, "logps/rejected": -0.807500958442688, "loss": 1.1704, "odds_ratio_loss": 0.9586833119392395, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10744895786046982, "rewards/margins": -0.026698868721723557, "rewards/rejected": -0.08075009286403656, "sft_loss": 1.0744895935058594, "step": 1170 }, { "epoch": 0.09, "grad_norm": 7.52495813369751, "learning_rate": 9.825756314393642e-06, "logits/chosen": -1.3226604461669922, "logits/rejected": -0.9620486497879028, "logps/chosen": -0.7998035550117493, "logps/rejected": -1.128028392791748, "loss": 0.8536, "odds_ratio_loss": 0.5378514528274536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07998035848140717, "rewards/margins": 0.03282248228788376, "rewards/rejected": -0.11280284821987152, "sft_loss": 0.7998035550117493, "step": 1175 }, { "epoch": 0.09, "grad_norm": 14.74472427368164, "learning_rate": 9.824141264367372e-06, "logits/chosen": -1.0445303916931152, "logits/rejected": -1.044632911682129, "logps/chosen": -0.960924506187439, "logps/rejected": -1.0087189674377441, "loss": 1.0244, "odds_ratio_loss": 0.6351147294044495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09609244763851166, "rewards/margins": 0.004779445938766003, "rewards/rejected": -0.10087189823389053, "sft_loss": 0.960924506187439, "step": 1180 }, { "epoch": 0.09, "grad_norm": 4.776372909545898, "learning_rate": 9.822518897918377e-06, "logits/chosen": -1.3258750438690186, "logits/rejected": -0.4255582392215729, "logps/chosen": -1.1844384670257568, "logps/rejected": -1.5825674533843994, "loss": 1.2435, "odds_ratio_loss": 0.590488076210022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11844384670257568, "rewards/margins": 0.03981289640069008, "rewards/rejected": -0.15825673937797546, "sft_loss": 1.1844384670257568, "step": 1185 }, { "epoch": 0.09, "grad_norm": 7.216888904571533, "learning_rate": 9.820889217507184e-06, "logits/chosen": -1.306216835975647, "logits/rejected": -0.8492482304573059, "logps/chosen": -0.8745654225349426, "logps/rejected": -1.1286555528640747, "loss": 0.9287, "odds_ratio_loss": 0.5410099029541016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08745653927326202, "rewards/margins": 0.025409013032913208, "rewards/rejected": -0.11286555230617523, "sft_loss": 0.8745654225349426, "step": 1190 }, { "epoch": 0.09, "grad_norm": 16.240503311157227, "learning_rate": 9.819252225605409e-06, "logits/chosen": -1.1860531568527222, "logits/rejected": -1.108520269393921, "logps/chosen": -1.0403330326080322, "logps/rejected": -1.2271296977996826, "loss": 1.1174, "odds_ratio_loss": 0.7707337141036987, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10403330624103546, "rewards/margins": 0.018679680302739143, "rewards/rejected": -0.12271298468112946, "sft_loss": 1.0403330326080322, "step": 1195 }, { "epoch": 0.09, "grad_norm": 14.383599281311035, "learning_rate": 9.817607924695756e-06, "logits/chosen": -1.2943899631500244, "logits/rejected": -1.288130283355713, "logps/chosen": -1.0370019674301147, "logps/rejected": -1.4093286991119385, "loss": 1.0911, "odds_ratio_loss": 0.5407058596611023, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10370020568370819, "rewards/margins": 0.037232670933008194, "rewards/rejected": -0.1409328728914261, "sft_loss": 1.0370019674301147, "step": 1200 }, { "epoch": 0.09, "grad_norm": 8.22785472869873, "learning_rate": 9.81595631727202e-06, "logits/chosen": -1.1882762908935547, "logits/rejected": -0.9258209466934204, "logps/chosen": -0.8339977264404297, "logps/rejected": -1.1905823945999146, "loss": 0.8972, "odds_ratio_loss": 0.6322519779205322, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08339976519346237, "rewards/margins": 0.03565846011042595, "rewards/rejected": -0.11905822902917862, "sft_loss": 0.8339977264404297, "step": 1205 }, { "epoch": 0.09, "grad_norm": 11.784111976623535, "learning_rate": 9.81429740583907e-06, "logits/chosen": -1.306536316871643, "logits/rejected": -0.9648883938789368, "logps/chosen": -0.957537055015564, "logps/rejected": -1.3278559446334839, "loss": 1.0264, "odds_ratio_loss": 0.6885043382644653, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09575371444225311, "rewards/margins": 0.03703188896179199, "rewards/rejected": -0.1327856034040451, "sft_loss": 0.957537055015564, "step": 1210 }, { "epoch": 0.09, "grad_norm": 46.6259880065918, "learning_rate": 9.812631192912856e-06, "logits/chosen": -1.2349252700805664, "logits/rejected": -0.45999327301979065, "logps/chosen": -0.8471899032592773, "logps/rejected": -2.678170680999756, "loss": 0.8966, "odds_ratio_loss": 0.49428415298461914, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0847189873456955, "rewards/margins": 0.18309810757637024, "rewards/rejected": -0.26781708002090454, "sft_loss": 0.8471899032592773, "step": 1215 }, { "epoch": 0.09, "grad_norm": 117.32431030273438, "learning_rate": 9.810957681020404e-06, "logits/chosen": -1.10783052444458, "logits/rejected": -1.2066490650177002, "logps/chosen": -0.875723659992218, "logps/rejected": -1.6135194301605225, "loss": 0.9094, "odds_ratio_loss": 0.3370504379272461, "rewards/accuracies": 1.0, "rewards/chosen": -0.0875723659992218, "rewards/margins": 0.07377958297729492, "rewards/rejected": -0.16135194897651672, "sft_loss": 0.875723659992218, "step": 1220 }, { "epoch": 0.1, "grad_norm": 65.14192199707031, "learning_rate": 9.809276872699806e-06, "logits/chosen": -1.25543212890625, "logits/rejected": -1.2210181951522827, "logps/chosen": -0.9082130193710327, "logps/rejected": -6.193860054016113, "loss": 0.9352, "odds_ratio_loss": 0.26966392993927, "rewards/accuracies": 1.0, "rewards/chosen": -0.09082130342721939, "rewards/margins": 0.5285647511482239, "rewards/rejected": -0.6193860173225403, "sft_loss": 0.9082130193710327, "step": 1225 }, { "epoch": 0.1, "grad_norm": 6.9969987869262695, "learning_rate": 9.80758877050022e-06, "logits/chosen": -1.374524474143982, "logits/rejected": -0.625987708568573, "logps/chosen": -0.9819453954696655, "logps/rejected": -1.604069709777832, "loss": 1.0319, "odds_ratio_loss": 0.4998885989189148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09819453954696655, "rewards/margins": 0.06221244856715202, "rewards/rejected": -0.16040697693824768, "sft_loss": 0.9819453954696655, "step": 1230 }, { "epoch": 0.1, "grad_norm": 10.998051643371582, "learning_rate": 9.80589337698187e-06, "logits/chosen": -1.3153575658798218, "logits/rejected": -0.9550518989562988, "logps/chosen": -0.9396483302116394, "logps/rejected": -1.1199018955230713, "loss": 1.0017, "odds_ratio_loss": 0.6204285621643066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0939648374915123, "rewards/margins": 0.01802534982562065, "rewards/rejected": -0.11199019104242325, "sft_loss": 0.9396483302116394, "step": 1235 }, { "epoch": 0.1, "grad_norm": 15.231605529785156, "learning_rate": 9.804190694716031e-06, "logits/chosen": -1.3787672519683838, "logits/rejected": -0.8150347471237183, "logps/chosen": -1.0452436208724976, "logps/rejected": -2.0565788745880127, "loss": 1.0783, "odds_ratio_loss": 0.3310582637786865, "rewards/accuracies": 1.0, "rewards/chosen": -0.10452437400817871, "rewards/margins": 0.10113354027271271, "rewards/rejected": -0.20565791428089142, "sft_loss": 1.0452436208724976, "step": 1240 }, { "epoch": 0.1, "grad_norm": 5.462742805480957, "learning_rate": 9.802480726285041e-06, "logits/chosen": -1.3796197175979614, "logits/rejected": -0.7846873998641968, "logps/chosen": -0.6047025918960571, "logps/rejected": -5.4197773933410645, "loss": 0.6444, "odds_ratio_loss": 0.3967844843864441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06047026067972183, "rewards/margins": 0.48150748014450073, "rewards/rejected": -0.5419777631759644, "sft_loss": 0.6047025918960571, "step": 1245 }, { "epoch": 0.1, "grad_norm": 6.896554470062256, "learning_rate": 9.800763474282284e-06, "logits/chosen": -1.3147039413452148, "logits/rejected": -0.6698298454284668, "logps/chosen": -1.0624172687530518, "logps/rejected": -1.228693962097168, "loss": 1.1291, "odds_ratio_loss": 0.6668539047241211, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10624171793460846, "rewards/margins": 0.016627687960863113, "rewards/rejected": -0.12286939471960068, "sft_loss": 1.0624172687530518, "step": 1250 }, { "epoch": 0.1, "grad_norm": 31.245380401611328, "learning_rate": 9.79903894131219e-06, "logits/chosen": -1.3014583587646484, "logits/rejected": -1.0607304573059082, "logps/chosen": -1.271507740020752, "logps/rejected": -1.0553052425384521, "loss": 1.3613, "odds_ratio_loss": 0.8978258967399597, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1271507740020752, "rewards/margins": -0.02162025310099125, "rewards/rejected": -0.1055305227637291, "sft_loss": 1.271507740020752, "step": 1255 }, { "epoch": 0.1, "grad_norm": 13.336468696594238, "learning_rate": 9.797307129990227e-06, "logits/chosen": -1.0399762392044067, "logits/rejected": -1.2948471307754517, "logps/chosen": -0.8306800723075867, "logps/rejected": -1.1717512607574463, "loss": 0.8946, "odds_ratio_loss": 0.6392477750778198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08306801319122314, "rewards/margins": 0.03410711884498596, "rewards/rejected": -0.11717512458562851, "sft_loss": 0.8306800723075867, "step": 1260 }, { "epoch": 0.1, "grad_norm": 11.68088436126709, "learning_rate": 9.795568042942916e-06, "logits/chosen": -1.3981082439422607, "logits/rejected": -1.2398771047592163, "logps/chosen": -0.8838878870010376, "logps/rejected": -5.310142517089844, "loss": 0.9177, "odds_ratio_loss": 0.3379073739051819, "rewards/accuracies": 1.0, "rewards/chosen": -0.08838878571987152, "rewards/margins": 0.4426254630088806, "rewards/rejected": -0.5310143232345581, "sft_loss": 0.8838878870010376, "step": 1265 }, { "epoch": 0.1, "grad_norm": 116.3707046508789, "learning_rate": 9.793821682807797e-06, "logits/chosen": -1.3818522691726685, "logits/rejected": -0.6395906209945679, "logps/chosen": -1.6790422201156616, "logps/rejected": -2.7593679428100586, "loss": 1.7259, "odds_ratio_loss": 0.4688444137573242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16790422797203064, "rewards/margins": 0.10803258419036865, "rewards/rejected": -0.2759368121623993, "sft_loss": 1.6790422201156616, "step": 1270 }, { "epoch": 0.1, "grad_norm": 5.718632698059082, "learning_rate": 9.79206805223345e-06, "logits/chosen": -1.3850736618041992, "logits/rejected": -1.1277830600738525, "logps/chosen": -1.1699743270874023, "logps/rejected": -3.3309669494628906, "loss": 1.2019, "odds_ratio_loss": 0.3196646571159363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11699743568897247, "rewards/margins": 0.21609926223754883, "rewards/rejected": -0.3330966830253601, "sft_loss": 1.1699743270874023, "step": 1275 }, { "epoch": 0.1, "grad_norm": 7.716296195983887, "learning_rate": 9.790307153879477e-06, "logits/chosen": -1.3103944063186646, "logits/rejected": -0.6352896094322205, "logps/chosen": -1.0161986351013184, "logps/rejected": -1.074324131011963, "loss": 1.0845, "odds_ratio_loss": 0.6833962798118591, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10161986202001572, "rewards/margins": 0.005812540650367737, "rewards/rejected": -0.10743240267038345, "sft_loss": 1.0161986351013184, "step": 1280 }, { "epoch": 0.1, "grad_norm": 18.123607635498047, "learning_rate": 9.788538990416503e-06, "logits/chosen": -1.4223918914794922, "logits/rejected": -1.336246132850647, "logps/chosen": -0.8674184083938599, "logps/rejected": -1.0089197158813477, "loss": 0.9325, "odds_ratio_loss": 0.6510507464408875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08674183487892151, "rewards/margins": 0.014150148257613182, "rewards/rejected": -0.10089198499917984, "sft_loss": 0.8674184083938599, "step": 1285 }, { "epoch": 0.1, "grad_norm": 4.782289981842041, "learning_rate": 9.786763564526173e-06, "logits/chosen": -1.364534616470337, "logits/rejected": -0.5958219766616821, "logps/chosen": -0.8636929392814636, "logps/rejected": -1.2223838567733765, "loss": 0.9152, "odds_ratio_loss": 0.5149844288825989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08636929094791412, "rewards/margins": 0.035869091749191284, "rewards/rejected": -0.12223838269710541, "sft_loss": 0.8636929392814636, "step": 1290 }, { "epoch": 0.1, "grad_norm": 7.08095121383667, "learning_rate": 9.78498087890115e-06, "logits/chosen": -1.5712015628814697, "logits/rejected": -1.1497470140457153, "logps/chosen": -0.9990192651748657, "logps/rejected": -1.026200294494629, "loss": 1.0707, "odds_ratio_loss": 0.7166789770126343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09990192949771881, "rewards/margins": 0.0027181021869182587, "rewards/rejected": -0.10262002795934677, "sft_loss": 0.9990192651748657, "step": 1295 }, { "epoch": 0.1, "grad_norm": 12.181612014770508, "learning_rate": 9.783190936245096e-06, "logits/chosen": -1.2592464685440063, "logits/rejected": -0.9734852910041809, "logps/chosen": -0.7972999811172485, "logps/rejected": -1.1606394052505493, "loss": 0.8397, "odds_ratio_loss": 0.4241735339164734, "rewards/accuracies": 1.0, "rewards/chosen": -0.07972999662160873, "rewards/margins": 0.03633394464850426, "rewards/rejected": -0.11606393754482269, "sft_loss": 0.7972999811172485, "step": 1300 }, { "epoch": 0.1, "grad_norm": 7.701669216156006, "learning_rate": 9.781393739272689e-06, "logits/chosen": -1.1740028858184814, "logits/rejected": -0.9590864181518555, "logps/chosen": -1.0751943588256836, "logps/rejected": -1.5750086307525635, "loss": 1.1349, "odds_ratio_loss": 0.5967916250228882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10751944780349731, "rewards/margins": 0.049981411546468735, "rewards/rejected": -0.15750086307525635, "sft_loss": 1.0751943588256836, "step": 1305 }, { "epoch": 0.1, "grad_norm": 5.968133449554443, "learning_rate": 9.779589290709607e-06, "logits/chosen": -1.5533254146575928, "logits/rejected": -1.0931613445281982, "logps/chosen": -0.9862836003303528, "logps/rejected": -1.3846126794815063, "loss": 1.0598, "odds_ratio_loss": 0.7355043888092041, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09862836450338364, "rewards/margins": 0.039832908660173416, "rewards/rejected": -0.13846126198768616, "sft_loss": 0.9862836003303528, "step": 1310 }, { "epoch": 0.1, "grad_norm": 74.6002426147461, "learning_rate": 9.777777593292527e-06, "logits/chosen": -1.1837421655654907, "logits/rejected": -0.6344571709632874, "logps/chosen": -0.9500184059143066, "logps/rejected": -2.0003952980041504, "loss": 0.9983, "odds_ratio_loss": 0.4824226498603821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09500183165073395, "rewards/margins": 0.10503768920898438, "rewards/rejected": -0.20003953576087952, "sft_loss": 0.9500184059143066, "step": 1315 }, { "epoch": 0.1, "grad_norm": 500.04864501953125, "learning_rate": 9.775958649769117e-06, "logits/chosen": -1.3982659578323364, "logits/rejected": -1.1193958520889282, "logps/chosen": -1.9695625305175781, "logps/rejected": -1.2773897647857666, "loss": 2.1121, "odds_ratio_loss": 1.425871729850769, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.19695624709129333, "rewards/margins": -0.0692172572016716, "rewards/rejected": -0.12773898243904114, "sft_loss": 1.9695625305175781, "step": 1320 }, { "epoch": 0.1, "grad_norm": 7.868124008178711, "learning_rate": 9.774132462898033e-06, "logits/chosen": -1.336496353149414, "logits/rejected": -1.1353330612182617, "logps/chosen": -0.7955440282821655, "logps/rejected": -1.272878646850586, "loss": 0.8501, "odds_ratio_loss": 0.5457112193107605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07955440133810043, "rewards/margins": 0.047733455896377563, "rewards/rejected": -0.1272878646850586, "sft_loss": 0.7955440282821655, "step": 1325 }, { "epoch": 0.1, "grad_norm": 8.51680850982666, "learning_rate": 9.772299035448924e-06, "logits/chosen": -1.1256171464920044, "logits/rejected": -0.9005386233329773, "logps/chosen": -1.3089529275894165, "logps/rejected": -1.4084324836730957, "loss": 1.383, "odds_ratio_loss": 0.7403467297554016, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13089528679847717, "rewards/margins": 0.009947952814400196, "rewards/rejected": -0.1408432424068451, "sft_loss": 1.3089529275894165, "step": 1330 }, { "epoch": 0.1, "grad_norm": 6.1593499183654785, "learning_rate": 9.770458370202412e-06, "logits/chosen": -1.2082802057266235, "logits/rejected": -0.9753934741020203, "logps/chosen": -1.1150516271591187, "logps/rejected": -2.4671926498413086, "loss": 1.1581, "odds_ratio_loss": 0.4302564561367035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1115051656961441, "rewards/margins": 0.13521410524845123, "rewards/rejected": -0.24671927094459534, "sft_loss": 1.1150516271591187, "step": 1335 }, { "epoch": 0.1, "grad_norm": 6.688883304595947, "learning_rate": 9.7686104699501e-06, "logits/chosen": -1.2678996324539185, "logits/rejected": -0.839708149433136, "logps/chosen": -0.8351173400878906, "logps/rejected": -1.2071555852890015, "loss": 0.8955, "odds_ratio_loss": 0.6039993166923523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08351172506809235, "rewards/margins": 0.0372038260102272, "rewards/rejected": -0.12071555852890015, "sft_loss": 0.8351173400878906, "step": 1340 }, { "epoch": 0.1, "grad_norm": 5.224099159240723, "learning_rate": 9.766755337494565e-06, "logits/chosen": -1.248985767364502, "logits/rejected": -0.7827884554862976, "logps/chosen": -0.9480899572372437, "logps/rejected": -0.8805420994758606, "loss": 1.0298, "odds_ratio_loss": 0.8172227740287781, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09480900317430496, "rewards/margins": -0.006754803005605936, "rewards/rejected": -0.08805420249700546, "sft_loss": 0.9480899572372437, "step": 1345 }, { "epoch": 0.11, "grad_norm": 11.390414237976074, "learning_rate": 9.764892975649349e-06, "logits/chosen": -1.3521184921264648, "logits/rejected": -0.9650154113769531, "logps/chosen": -1.0387392044067383, "logps/rejected": -1.35995352268219, "loss": 1.0944, "odds_ratio_loss": 0.5570557713508606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10387392342090607, "rewards/margins": 0.0321214385330677, "rewards/rejected": -0.13599535822868347, "sft_loss": 1.0387392044067383, "step": 1350 }, { "epoch": 0.11, "grad_norm": 6.765865325927734, "learning_rate": 9.763023387238961e-06, "logits/chosen": -1.3084546327590942, "logits/rejected": -0.7172307968139648, "logps/chosen": -1.1213195323944092, "logps/rejected": -1.8646224737167358, "loss": 1.2094, "odds_ratio_loss": 0.8810539245605469, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11213195323944092, "rewards/margins": 0.07433030009269714, "rewards/rejected": -0.18646225333213806, "sft_loss": 1.1213195323944092, "step": 1355 }, { "epoch": 0.11, "grad_norm": 32.736473083496094, "learning_rate": 9.76114657509887e-06, "logits/chosen": -1.277092695236206, "logits/rejected": -0.8425081968307495, "logps/chosen": -0.9858977198600769, "logps/rejected": -1.1639198064804077, "loss": 1.0578, "odds_ratio_loss": 0.7190499305725098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09858977794647217, "rewards/margins": 0.017802204936742783, "rewards/rejected": -0.11639197915792465, "sft_loss": 0.9858977198600769, "step": 1360 }, { "epoch": 0.11, "grad_norm": 12.08522891998291, "learning_rate": 9.759262542075498e-06, "logits/chosen": -1.3010032176971436, "logits/rejected": -0.9239484071731567, "logps/chosen": -0.8678629994392395, "logps/rejected": -1.2071233987808228, "loss": 0.9279, "odds_ratio_loss": 0.6008371114730835, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08678629994392395, "rewards/margins": 0.03392603620886803, "rewards/rejected": -0.12071233987808228, "sft_loss": 0.8678629994392395, "step": 1365 }, { "epoch": 0.11, "grad_norm": 7.780725955963135, "learning_rate": 9.757371291026223e-06, "logits/chosen": -1.204240083694458, "logits/rejected": -0.834067702293396, "logps/chosen": -0.9842250943183899, "logps/rejected": -1.17227303981781, "loss": 1.0623, "odds_ratio_loss": 0.7805746793746948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09842251241207123, "rewards/margins": 0.01880478672683239, "rewards/rejected": -0.11722730100154877, "sft_loss": 0.9842250943183899, "step": 1370 }, { "epoch": 0.11, "grad_norm": 6.746867656707764, "learning_rate": 9.755472824819366e-06, "logits/chosen": -1.333153486251831, "logits/rejected": -1.0350208282470703, "logps/chosen": -1.1680481433868408, "logps/rejected": -1.410505771636963, "loss": 1.2276, "odds_ratio_loss": 0.5952333211898804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1168048158288002, "rewards/margins": 0.024245765060186386, "rewards/rejected": -0.1410505771636963, "sft_loss": 1.1680481433868408, "step": 1375 }, { "epoch": 0.11, "grad_norm": 109.35551452636719, "learning_rate": 9.753567146334189e-06, "logits/chosen": -1.2310402393341064, "logits/rejected": -0.8814123272895813, "logps/chosen": -1.1120350360870361, "logps/rejected": -2.6216845512390137, "loss": 1.1726, "odds_ratio_loss": 0.6051499247550964, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11120350658893585, "rewards/margins": 0.15096496045589447, "rewards/rejected": -0.2621684670448303, "sft_loss": 1.1120350360870361, "step": 1380 }, { "epoch": 0.11, "grad_norm": 82.70585632324219, "learning_rate": 9.7516542584609e-06, "logits/chosen": -1.381817102432251, "logits/rejected": -1.098077416419983, "logps/chosen": -1.1308685541152954, "logps/rejected": -1.3681640625, "loss": 1.2013, "odds_ratio_loss": 0.7040119171142578, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11308685690164566, "rewards/margins": 0.023729555308818817, "rewards/rejected": -0.13681641221046448, "sft_loss": 1.1308685541152954, "step": 1385 }, { "epoch": 0.11, "grad_norm": 27.114789962768555, "learning_rate": 9.749734164100635e-06, "logits/chosen": -0.9133816957473755, "logits/rejected": -1.0223209857940674, "logps/chosen": -0.8734237551689148, "logps/rejected": -1.461368203163147, "loss": 0.9209, "odds_ratio_loss": 0.4743104875087738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08734238147735596, "rewards/margins": 0.058794427663087845, "rewards/rejected": -0.1461368054151535, "sft_loss": 0.8734237551689148, "step": 1390 }, { "epoch": 0.11, "grad_norm": 22.410390853881836, "learning_rate": 9.74780686616546e-06, "logits/chosen": -1.1132972240447998, "logits/rejected": -1.0505969524383545, "logps/chosen": -1.0885108709335327, "logps/rejected": -2.5375404357910156, "loss": 1.1221, "odds_ratio_loss": 0.3356505036354065, "rewards/accuracies": 1.0, "rewards/chosen": -0.10885109752416611, "rewards/margins": 0.14490298926830292, "rewards/rejected": -0.2537540793418884, "sft_loss": 1.0885108709335327, "step": 1395 }, { "epoch": 0.11, "grad_norm": 11.0577974319458, "learning_rate": 9.745872367578366e-06, "logits/chosen": -1.3365861177444458, "logits/rejected": -0.8117850422859192, "logps/chosen": -1.005133032798767, "logps/rejected": -1.0743589401245117, "loss": 1.0747, "odds_ratio_loss": 0.6960892677307129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10051330178976059, "rewards/margins": 0.006922594271600246, "rewards/rejected": -0.10743590444326401, "sft_loss": 1.005133032798767, "step": 1400 }, { "epoch": 0.11, "grad_norm": 9.530302047729492, "learning_rate": 9.743930671273269e-06, "logits/chosen": -1.328739881515503, "logits/rejected": -1.3169947862625122, "logps/chosen": -1.1289832592010498, "logps/rejected": -4.339430809020996, "loss": 1.1739, "odds_ratio_loss": 0.44948825240135193, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1128983274102211, "rewards/margins": 0.3210447430610657, "rewards/rejected": -0.43394309282302856, "sft_loss": 1.1289832592010498, "step": 1405 }, { "epoch": 0.11, "grad_norm": 6.579575538635254, "learning_rate": 9.741981780194996e-06, "logits/chosen": -1.3872390985488892, "logits/rejected": -1.1063064336776733, "logps/chosen": -1.0387804508209229, "logps/rejected": -1.274841070175171, "loss": 1.1011, "odds_ratio_loss": 0.6233776807785034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10387805849313736, "rewards/margins": 0.023606054484844208, "rewards/rejected": -0.12748411297798157, "sft_loss": 1.0387804508209229, "step": 1410 }, { "epoch": 0.11, "grad_norm": 5.642371654510498, "learning_rate": 9.740025697299288e-06, "logits/chosen": -1.2045328617095947, "logits/rejected": -0.5057806968688965, "logps/chosen": -0.9965047836303711, "logps/rejected": -2.6621475219726562, "loss": 1.0185, "odds_ratio_loss": 0.22015976905822754, "rewards/accuracies": 1.0, "rewards/chosen": -0.09965048730373383, "rewards/margins": 0.16656428575515747, "rewards/rejected": -0.2662147581577301, "sft_loss": 0.9965047836303711, "step": 1415 }, { "epoch": 0.11, "grad_norm": 24.070602416992188, "learning_rate": 9.73806242555279e-06, "logits/chosen": -1.1469743251800537, "logits/rejected": -1.0040260553359985, "logps/chosen": -0.9361482858657837, "logps/rejected": -2.1627020835876465, "loss": 0.9727, "odds_ratio_loss": 0.3653944730758667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09361483156681061, "rewards/margins": 0.12265539169311523, "rewards/rejected": -0.21627020835876465, "sft_loss": 0.9361482858657837, "step": 1420 }, { "epoch": 0.11, "grad_norm": 19.208091735839844, "learning_rate": 9.736091967933058e-06, "logits/chosen": -0.9995628595352173, "logits/rejected": -1.2005598545074463, "logps/chosen": -1.0177791118621826, "logps/rejected": -4.385058879852295, "loss": 1.0733, "odds_ratio_loss": 0.5556063652038574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10177791118621826, "rewards/margins": 0.33672797679901123, "rewards/rejected": -0.4385058879852295, "sft_loss": 1.0177791118621826, "step": 1425 }, { "epoch": 0.11, "grad_norm": 8.281458854675293, "learning_rate": 9.73411432742854e-06, "logits/chosen": -1.1627318859100342, "logits/rejected": -0.9758650064468384, "logps/chosen": -1.022214651107788, "logps/rejected": -1.0216261148452759, "loss": 1.0938, "odds_ratio_loss": 0.7162176966667175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10222147405147552, "rewards/margins": -5.8867037296295166e-05, "rewards/rejected": -0.10216259956359863, "sft_loss": 1.022214651107788, "step": 1430 }, { "epoch": 0.11, "grad_norm": 6.446768283843994, "learning_rate": 9.732129507038576e-06, "logits/chosen": -1.0612138509750366, "logits/rejected": -1.085730791091919, "logps/chosen": -1.2515077590942383, "logps/rejected": -1.6711082458496094, "loss": 1.3005, "odds_ratio_loss": 0.4898054003715515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12515076994895935, "rewards/margins": 0.04196004569530487, "rewards/rejected": -0.16711083054542542, "sft_loss": 1.2515077590942383, "step": 1435 }, { "epoch": 0.11, "grad_norm": 5.9316511154174805, "learning_rate": 9.730137509773401e-06, "logits/chosen": -1.2018150091171265, "logits/rejected": -0.5067328214645386, "logps/chosen": -1.010285496711731, "logps/rejected": -2.8801932334899902, "loss": 1.0461, "odds_ratio_loss": 0.35789528489112854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10102854669094086, "rewards/margins": 0.18699078261852264, "rewards/rejected": -0.2880193293094635, "sft_loss": 1.010285496711731, "step": 1440 }, { "epoch": 0.11, "grad_norm": 5.917844772338867, "learning_rate": 9.728138338654131e-06, "logits/chosen": -1.0095337629318237, "logits/rejected": -0.9432841539382935, "logps/chosen": -1.1060152053833008, "logps/rejected": -1.2171921730041504, "loss": 1.1696, "odds_ratio_loss": 0.6360725164413452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1106015220284462, "rewards/margins": 0.011117706075310707, "rewards/rejected": -0.12171921879053116, "sft_loss": 1.1060152053833008, "step": 1445 }, { "epoch": 0.11, "grad_norm": 64.06980895996094, "learning_rate": 9.726131996712763e-06, "logits/chosen": -1.41555655002594, "logits/rejected": -1.0206005573272705, "logps/chosen": -1.3561770915985107, "logps/rejected": -3.290637493133545, "loss": 1.3955, "odds_ratio_loss": 0.393208771944046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1356177031993866, "rewards/margins": 0.1934460550546646, "rewards/rejected": -0.3290637731552124, "sft_loss": 1.3561770915985107, "step": 1450 }, { "epoch": 0.11, "grad_norm": 6.217737197875977, "learning_rate": 9.724118486992167e-06, "logits/chosen": -1.3447182178497314, "logits/rejected": -0.9803462028503418, "logps/chosen": -1.143945574760437, "logps/rejected": -1.5074526071548462, "loss": 1.2113, "odds_ratio_loss": 0.6737285852432251, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11439456045627594, "rewards/margins": 0.03635070472955704, "rewards/rejected": -0.1507452428340912, "sft_loss": 1.143945574760437, "step": 1455 }, { "epoch": 0.11, "grad_norm": 4.317702770233154, "learning_rate": 9.72209781254609e-06, "logits/chosen": -1.3004049062728882, "logits/rejected": -0.6786057353019714, "logps/chosen": -0.8832548260688782, "logps/rejected": -1.0870282649993896, "loss": 0.941, "odds_ratio_loss": 0.5778591632843018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08832548558712006, "rewards/margins": 0.020377354696393013, "rewards/rejected": -0.10870283842086792, "sft_loss": 0.8832548260688782, "step": 1460 }, { "epoch": 0.11, "grad_norm": 19.033479690551758, "learning_rate": 9.720069976439138e-06, "logits/chosen": -1.258845567703247, "logits/rejected": -0.6938650012016296, "logps/chosen": -1.0265159606933594, "logps/rejected": -2.807030439376831, "loss": 1.0617, "odds_ratio_loss": 0.3515172600746155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10265159606933594, "rewards/margins": 0.1780514419078827, "rewards/rejected": -0.28070303797721863, "sft_loss": 1.0265159606933594, "step": 1465 }, { "epoch": 0.11, "grad_norm": 5.5583295822143555, "learning_rate": 9.718034981746784e-06, "logits/chosen": -1.2468676567077637, "logits/rejected": -0.6721242070198059, "logps/chosen": -0.950405478477478, "logps/rejected": -2.5747733116149902, "loss": 1.0046, "odds_ratio_loss": 0.5416213274002075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09504055976867676, "rewards/margins": 0.16243679821491241, "rewards/rejected": -0.257477343082428, "sft_loss": 0.950405478477478, "step": 1470 }, { "epoch": 0.11, "grad_norm": 5.851568222045898, "learning_rate": 9.715992831555356e-06, "logits/chosen": -1.1191495656967163, "logits/rejected": -0.8659588694572449, "logps/chosen": -1.1071155071258545, "logps/rejected": -1.0805102586746216, "loss": 1.1782, "odds_ratio_loss": 0.7104871273040771, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11071155965328217, "rewards/margins": -0.0026605359744280577, "rewards/rejected": -0.10805102437734604, "sft_loss": 1.1071155071258545, "step": 1475 }, { "epoch": 0.12, "grad_norm": 11.510098457336426, "learning_rate": 9.713943528962031e-06, "logits/chosen": -1.4745122194290161, "logits/rejected": -1.000880479812622, "logps/chosen": -1.1318867206573486, "logps/rejected": -3.0544705390930176, "loss": 1.17, "odds_ratio_loss": 0.3810274004936218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11318866908550262, "rewards/margins": 0.19225840270519257, "rewards/rejected": -0.3054470717906952, "sft_loss": 1.1318867206573486, "step": 1480 }, { "epoch": 0.12, "grad_norm": 128.35110473632812, "learning_rate": 9.71188707707484e-06, "logits/chosen": -1.1662242412567139, "logits/rejected": -1.034181833267212, "logps/chosen": -1.1305862665176392, "logps/rejected": -1.239294409751892, "loss": 1.1962, "odds_ratio_loss": 0.6565095782279968, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.11305862665176392, "rewards/margins": 0.010870824567973614, "rewards/rejected": -0.12392944097518921, "sft_loss": 1.1305862665176392, "step": 1485 }, { "epoch": 0.12, "grad_norm": 24.923812866210938, "learning_rate": 9.709823479012652e-06, "logits/chosen": -1.242684245109558, "logits/rejected": -1.0049892663955688, "logps/chosen": -1.0039126873016357, "logps/rejected": -1.7858537435531616, "loss": 1.0471, "odds_ratio_loss": 0.43143850564956665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10039126873016357, "rewards/margins": 0.07819411903619766, "rewards/rejected": -0.17858538031578064, "sft_loss": 1.0039126873016357, "step": 1490 }, { "epoch": 0.12, "grad_norm": 7.694460868835449, "learning_rate": 9.707752737905175e-06, "logits/chosen": -1.4090046882629395, "logits/rejected": -1.0745227336883545, "logps/chosen": -0.6600391864776611, "logps/rejected": -0.9888060688972473, "loss": 0.7161, "odds_ratio_loss": 0.5601866841316223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06600391864776611, "rewards/margins": 0.03287668898701668, "rewards/rejected": -0.09888060390949249, "sft_loss": 0.6600391864776611, "step": 1495 }, { "epoch": 0.12, "grad_norm": 15.502304077148438, "learning_rate": 9.705674856892953e-06, "logits/chosen": -1.4167745113372803, "logits/rejected": -1.0435190200805664, "logps/chosen": -0.6534953713417053, "logps/rejected": -5.029236793518066, "loss": 0.6772, "odds_ratio_loss": 0.23742082715034485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0653495341539383, "rewards/margins": 0.4375740885734558, "rewards/rejected": -0.5029236078262329, "sft_loss": 0.6534953713417053, "step": 1500 }, { "epoch": 0.12, "grad_norm": 18.245086669921875, "learning_rate": 9.703589839127355e-06, "logits/chosen": -1.4975404739379883, "logits/rejected": -0.8924884796142578, "logps/chosen": -0.7357920408248901, "logps/rejected": -0.905347466468811, "loss": 0.8012, "odds_ratio_loss": 0.6539013981819153, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07357920706272125, "rewards/margins": 0.016955537721514702, "rewards/rejected": -0.0905347466468811, "sft_loss": 0.7357920408248901, "step": 1505 }, { "epoch": 0.12, "grad_norm": 10.855551719665527, "learning_rate": 9.701497687770572e-06, "logits/chosen": -1.4533500671386719, "logits/rejected": -1.1055314540863037, "logps/chosen": -1.0148468017578125, "logps/rejected": -0.8302356600761414, "loss": 1.1049, "odds_ratio_loss": 0.9005705118179321, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10148467868566513, "rewards/margins": -0.018461106345057487, "rewards/rejected": -0.0830235704779625, "sft_loss": 1.0148468017578125, "step": 1510 }, { "epoch": 0.12, "grad_norm": 23.625286102294922, "learning_rate": 9.699398405995621e-06, "logits/chosen": -1.3821407556533813, "logits/rejected": -1.2004427909851074, "logps/chosen": -1.1571322679519653, "logps/rejected": -0.9725432395935059, "loss": 1.2445, "odds_ratio_loss": 0.8739679455757141, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.1157132238149643, "rewards/margins": -0.018458906561136246, "rewards/rejected": -0.09725432842969894, "sft_loss": 1.1571322679519653, "step": 1515 }, { "epoch": 0.12, "grad_norm": 9.365283012390137, "learning_rate": 9.69729199698633e-06, "logits/chosen": -1.1097975969314575, "logits/rejected": -0.9331648945808411, "logps/chosen": -0.8817958831787109, "logps/rejected": -1.2720427513122559, "loss": 0.9337, "odds_ratio_loss": 0.5186026692390442, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0881795883178711, "rewards/margins": 0.03902468830347061, "rewards/rejected": -0.1272042840719223, "sft_loss": 0.8817958831787109, "step": 1520 }, { "epoch": 0.12, "grad_norm": 7.966055393218994, "learning_rate": 9.695178463937333e-06, "logits/chosen": -1.245266318321228, "logits/rejected": -0.7221423387527466, "logps/chosen": -1.0875027179718018, "logps/rejected": -1.6951267719268799, "loss": 1.1337, "odds_ratio_loss": 0.46217623353004456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10875026881694794, "rewards/margins": 0.06076239421963692, "rewards/rejected": -0.16951265931129456, "sft_loss": 1.0875027179718018, "step": 1525 }, { "epoch": 0.12, "grad_norm": 7.182195663452148, "learning_rate": 9.693057810054073e-06, "logits/chosen": -1.4914124011993408, "logits/rejected": -1.0812093019485474, "logps/chosen": -0.942160964012146, "logps/rejected": -3.100853204727173, "loss": 0.9657, "odds_ratio_loss": 0.235835000872612, "rewards/accuracies": 1.0, "rewards/chosen": -0.09421609342098236, "rewards/margins": 0.2158692181110382, "rewards/rejected": -0.3100852966308594, "sft_loss": 0.942160964012146, "step": 1530 }, { "epoch": 0.12, "grad_norm": 8.808905601501465, "learning_rate": 9.69093003855279e-06, "logits/chosen": -1.4138414859771729, "logits/rejected": -1.0396727323532104, "logps/chosen": -1.1173908710479736, "logps/rejected": -3.009047746658325, "loss": 1.1733, "odds_ratio_loss": 0.5594109296798706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11173909902572632, "rewards/margins": 0.18916571140289307, "rewards/rejected": -0.300904780626297, "sft_loss": 1.1173908710479736, "step": 1535 }, { "epoch": 0.12, "grad_norm": 53.27058792114258, "learning_rate": 9.68879515266052e-06, "logits/chosen": -1.5334960222244263, "logits/rejected": -1.2154088020324707, "logps/chosen": -0.9809530973434448, "logps/rejected": -1.6464436054229736, "loss": 1.024, "odds_ratio_loss": 0.43026527762413025, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09809531271457672, "rewards/margins": 0.06654904782772064, "rewards/rejected": -0.16464434564113617, "sft_loss": 0.9809530973434448, "step": 1540 }, { "epoch": 0.12, "grad_norm": 5.976083755493164, "learning_rate": 9.686653155615089e-06, "logits/chosen": -1.4377057552337646, "logits/rejected": -1.030389428138733, "logps/chosen": -0.8061229586601257, "logps/rejected": -4.470185279846191, "loss": 0.8256, "odds_ratio_loss": 0.195216566324234, "rewards/accuracies": 1.0, "rewards/chosen": -0.08061229437589645, "rewards/margins": 0.36640629172325134, "rewards/rejected": -0.447018563747406, "sft_loss": 0.8061229586601257, "step": 1545 }, { "epoch": 0.12, "grad_norm": 10.860502243041992, "learning_rate": 9.684504050665106e-06, "logits/chosen": -1.3172852993011475, "logits/rejected": -1.1030348539352417, "logps/chosen": -0.8627084493637085, "logps/rejected": -1.4381139278411865, "loss": 0.8992, "odds_ratio_loss": 0.365181028842926, "rewards/accuracies": 1.0, "rewards/chosen": -0.08627085387706757, "rewards/margins": 0.057540543377399445, "rewards/rejected": -0.1438113898038864, "sft_loss": 0.8627084493637085, "step": 1550 }, { "epoch": 0.12, "grad_norm": 8.827674865722656, "learning_rate": 9.682347841069961e-06, "logits/chosen": -1.2773463726043701, "logits/rejected": -0.7163883447647095, "logps/chosen": -0.9320128560066223, "logps/rejected": -2.8091297149658203, "loss": 0.9929, "odds_ratio_loss": 0.6083893775939941, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09320129454135895, "rewards/margins": 0.1877116858959198, "rewards/rejected": -0.28091296553611755, "sft_loss": 0.9320128560066223, "step": 1555 }, { "epoch": 0.12, "grad_norm": 11.049094200134277, "learning_rate": 9.680184530099822e-06, "logits/chosen": -1.1425576210021973, "logits/rejected": -1.0668671131134033, "logps/chosen": -1.2148054838180542, "logps/rejected": -1.9959310293197632, "loss": 1.2595, "odds_ratio_loss": 0.44650644063949585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1214805468916893, "rewards/margins": 0.07811255753040314, "rewards/rejected": -0.19959311187267303, "sft_loss": 1.2148054838180542, "step": 1560 }, { "epoch": 0.12, "grad_norm": 7.441697597503662, "learning_rate": 9.678014121035626e-06, "logits/chosen": -1.2251560688018799, "logits/rejected": -0.7328432202339172, "logps/chosen": -1.2359546422958374, "logps/rejected": -1.459623098373413, "loss": 1.2983, "odds_ratio_loss": 0.6238261461257935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1235954612493515, "rewards/margins": 0.022366849705576897, "rewards/rejected": -0.14596232771873474, "sft_loss": 1.2359546422958374, "step": 1565 }, { "epoch": 0.12, "grad_norm": 11.25999641418457, "learning_rate": 9.67583661716907e-06, "logits/chosen": -1.2264257669448853, "logits/rejected": -0.9383336901664734, "logps/chosen": -1.144769549369812, "logps/rejected": -1.3678714036941528, "loss": 1.2072, "odds_ratio_loss": 0.624315083026886, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11447696387767792, "rewards/margins": 0.022310173138976097, "rewards/rejected": -0.13678714632987976, "sft_loss": 1.144769549369812, "step": 1570 }, { "epoch": 0.12, "grad_norm": 6.60423469543457, "learning_rate": 9.673652021802615e-06, "logits/chosen": -1.313948631286621, "logits/rejected": -0.851759135723114, "logps/chosen": -0.7691382765769958, "logps/rejected": -3.526015520095825, "loss": 0.8021, "odds_ratio_loss": 0.3292834162712097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07691384106874466, "rewards/margins": 0.27568769454956055, "rewards/rejected": -0.3526015281677246, "sft_loss": 0.7691382765769958, "step": 1575 }, { "epoch": 0.12, "grad_norm": 9.645243644714355, "learning_rate": 9.671460338249481e-06, "logits/chosen": -1.3922085762023926, "logits/rejected": -1.1013203859329224, "logps/chosen": -0.712190568447113, "logps/rejected": -2.190417528152466, "loss": 0.7394, "odds_ratio_loss": 0.27252617478370667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0712190493941307, "rewards/margins": 0.14782269299030304, "rewards/rejected": -0.21904174983501434, "sft_loss": 0.712190568447113, "step": 1580 }, { "epoch": 0.12, "grad_norm": 12.494514465332031, "learning_rate": 9.669261569833632e-06, "logits/chosen": -1.3502166271209717, "logits/rejected": -1.1128828525543213, "logps/chosen": -1.1661348342895508, "logps/rejected": -1.326633095741272, "loss": 1.2252, "odds_ratio_loss": 0.5908174514770508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1166134849190712, "rewards/margins": 0.016049817204475403, "rewards/rejected": -0.132663294672966, "sft_loss": 1.1661348342895508, "step": 1585 }, { "epoch": 0.12, "grad_norm": 5.0776567459106445, "learning_rate": 9.667055719889778e-06, "logits/chosen": -1.349551796913147, "logits/rejected": -0.9270380735397339, "logps/chosen": -0.8073859214782715, "logps/rejected": -1.9800093173980713, "loss": 0.8363, "odds_ratio_loss": 0.2894620895385742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08073858916759491, "rewards/margins": 0.1172623261809349, "rewards/rejected": -0.1980009377002716, "sft_loss": 0.8073859214782715, "step": 1590 }, { "epoch": 0.12, "grad_norm": 5.410458564758301, "learning_rate": 9.664842791763374e-06, "logits/chosen": -1.4142658710479736, "logits/rejected": -0.8889997601509094, "logps/chosen": -0.9126744270324707, "logps/rejected": -2.523634910583496, "loss": 0.9385, "odds_ratio_loss": 0.25870975852012634, "rewards/accuracies": 1.0, "rewards/chosen": -0.09126743674278259, "rewards/margins": 0.16109606623649597, "rewards/rejected": -0.25236350297927856, "sft_loss": 0.9126744270324707, "step": 1595 }, { "epoch": 0.12, "grad_norm": 5.038325786590576, "learning_rate": 9.662622788810604e-06, "logits/chosen": -1.31760835647583, "logits/rejected": -0.963549017906189, "logps/chosen": -0.8750256299972534, "logps/rejected": -1.8355424404144287, "loss": 0.911, "odds_ratio_loss": 0.35989946126937866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08750256896018982, "rewards/margins": 0.09605169296264648, "rewards/rejected": -0.1835542619228363, "sft_loss": 0.8750256299972534, "step": 1600 }, { "epoch": 0.12, "grad_norm": 7.45214319229126, "learning_rate": 9.660395714398387e-06, "logits/chosen": -1.283881664276123, "logits/rejected": -0.7890142202377319, "logps/chosen": -0.9697272181510925, "logps/rejected": -1.381255865097046, "loss": 1.0188, "odds_ratio_loss": 0.49079370498657227, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09697272628545761, "rewards/margins": 0.041152846068143845, "rewards/rejected": -0.13812556862831116, "sft_loss": 0.9697272181510925, "step": 1605 }, { "epoch": 0.13, "grad_norm": 52.52288818359375, "learning_rate": 9.65816157190436e-06, "logits/chosen": -1.067137598991394, "logits/rejected": -1.3262689113616943, "logps/chosen": -1.0371512174606323, "logps/rejected": -4.629284381866455, "loss": 1.0658, "odds_ratio_loss": 0.2862391173839569, "rewards/accuracies": 1.0, "rewards/chosen": -0.10371513664722443, "rewards/margins": 0.35921329259872437, "rewards/rejected": -0.46292844414711, "sft_loss": 1.0371512174606323, "step": 1610 }, { "epoch": 0.13, "grad_norm": 17.328516006469727, "learning_rate": 9.655920364716888e-06, "logits/chosen": -1.2435493469238281, "logits/rejected": -1.1730600595474243, "logps/chosen": -1.148634433746338, "logps/rejected": -1.3713942766189575, "loss": 1.215, "odds_ratio_loss": 0.6635384559631348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11486344039440155, "rewards/margins": 0.02227597124874592, "rewards/rejected": -0.13713940978050232, "sft_loss": 1.148634433746338, "step": 1615 }, { "epoch": 0.13, "grad_norm": 8.536588668823242, "learning_rate": 9.653672096235042e-06, "logits/chosen": -1.2994670867919922, "logits/rejected": -0.5064767599105835, "logps/chosen": -0.8236101269721985, "logps/rejected": -1.6406495571136475, "loss": 0.8592, "odds_ratio_loss": 0.35558241605758667, "rewards/accuracies": 1.0, "rewards/chosen": -0.08236101269721985, "rewards/margins": 0.08170395344495773, "rewards/rejected": -0.16406495869159698, "sft_loss": 0.8236101269721985, "step": 1620 }, { "epoch": 0.13, "grad_norm": 12.32243537902832, "learning_rate": 9.651416769868611e-06, "logits/chosen": -1.3300247192382812, "logits/rejected": -0.6631767153739929, "logps/chosen": -0.9727737307548523, "logps/rejected": -1.7273757457733154, "loss": 1.0122, "odds_ratio_loss": 0.39440396428108215, "rewards/accuracies": 1.0, "rewards/chosen": -0.09727738052606583, "rewards/margins": 0.07546021044254303, "rewards/rejected": -0.17273758351802826, "sft_loss": 0.9727737307548523, "step": 1625 }, { "epoch": 0.13, "grad_norm": 32.79615783691406, "learning_rate": 9.64915438903808e-06, "logits/chosen": -1.4940061569213867, "logits/rejected": -1.1153476238250732, "logps/chosen": -1.0253182649612427, "logps/rejected": -6.928043365478516, "loss": 1.0591, "odds_ratio_loss": 0.33826178312301636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10253182798624039, "rewards/margins": 0.5902725458145142, "rewards/rejected": -0.6928043961524963, "sft_loss": 1.0253182649612427, "step": 1630 }, { "epoch": 0.13, "grad_norm": 6.1653289794921875, "learning_rate": 9.646884957174639e-06, "logits/chosen": -1.3538758754730225, "logits/rejected": -0.8061432838439941, "logps/chosen": -0.9848779439926147, "logps/rejected": -2.1277546882629395, "loss": 1.0324, "odds_ratio_loss": 0.47568100690841675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09848780184984207, "rewards/margins": 0.11428769677877426, "rewards/rejected": -0.21277549862861633, "sft_loss": 0.9848779439926147, "step": 1635 }, { "epoch": 0.13, "grad_norm": 24.2289981842041, "learning_rate": 9.64460847772017e-06, "logits/chosen": -1.1956393718719482, "logits/rejected": -1.2510998249053955, "logps/chosen": -1.0347821712493896, "logps/rejected": -9.191658020019531, "loss": 1.0668, "odds_ratio_loss": 0.3206237256526947, "rewards/accuracies": 1.0, "rewards/chosen": -0.10347823053598404, "rewards/margins": 0.8156875371932983, "rewards/rejected": -0.9191657900810242, "sft_loss": 1.0347821712493896, "step": 1640 }, { "epoch": 0.13, "grad_norm": 13.129053115844727, "learning_rate": 9.642324954127241e-06, "logits/chosen": -1.4041489362716675, "logits/rejected": -1.0233229398727417, "logps/chosen": -1.1459414958953857, "logps/rejected": -4.507222652435303, "loss": 1.1969, "odds_ratio_loss": 0.5096083283424377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11459414660930634, "rewards/margins": 0.3361281156539917, "rewards/rejected": -0.45072221755981445, "sft_loss": 1.1459414958953857, "step": 1645 }, { "epoch": 0.13, "grad_norm": 5.412563323974609, "learning_rate": 9.640034389859105e-06, "logits/chosen": -1.2122042179107666, "logits/rejected": -0.7289305925369263, "logps/chosen": -0.8540544509887695, "logps/rejected": -1.4945560693740845, "loss": 0.8913, "odds_ratio_loss": 0.37204310297966003, "rewards/accuracies": 1.0, "rewards/chosen": -0.08540545403957367, "rewards/margins": 0.06405016034841537, "rewards/rejected": -0.14945560693740845, "sft_loss": 0.8540544509887695, "step": 1650 }, { "epoch": 0.13, "grad_norm": 6.666295528411865, "learning_rate": 9.637736788389698e-06, "logits/chosen": -1.3543541431427002, "logits/rejected": -0.9869254231452942, "logps/chosen": -0.9420193433761597, "logps/rejected": -1.282476544380188, "loss": 0.9937, "odds_ratio_loss": 0.5166751146316528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09420192986726761, "rewards/margins": 0.03404572606086731, "rewards/rejected": -0.12824766337871552, "sft_loss": 0.9420193433761597, "step": 1655 }, { "epoch": 0.13, "grad_norm": 5.094629287719727, "learning_rate": 9.635432153203618e-06, "logits/chosen": -1.3278597593307495, "logits/rejected": -0.9322368502616882, "logps/chosen": -0.8363531827926636, "logps/rejected": -0.9989891052246094, "loss": 0.8974, "odds_ratio_loss": 0.6107999682426453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08363533020019531, "rewards/margins": 0.016263587400317192, "rewards/rejected": -0.09989891946315765, "sft_loss": 0.8363531827926636, "step": 1660 }, { "epoch": 0.13, "grad_norm": 6.596664905548096, "learning_rate": 9.633120487796145e-06, "logits/chosen": -1.32304048538208, "logits/rejected": -0.7720328569412231, "logps/chosen": -0.904772162437439, "logps/rejected": -1.3722435235977173, "loss": 0.9454, "odds_ratio_loss": 0.4057803750038147, "rewards/accuracies": 1.0, "rewards/chosen": -0.09047721326351166, "rewards/margins": 0.04674714058637619, "rewards/rejected": -0.13722436130046844, "sft_loss": 0.904772162437439, "step": 1665 }, { "epoch": 0.13, "grad_norm": 8.84036636352539, "learning_rate": 9.630801795673203e-06, "logits/chosen": -1.4791336059570312, "logits/rejected": -1.0405861139297485, "logps/chosen": -0.7008475065231323, "logps/rejected": -1.8262150287628174, "loss": 0.7382, "odds_ratio_loss": 0.3737823963165283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07008475810289383, "rewards/margins": 0.11253675073385239, "rewards/rejected": -0.18262150883674622, "sft_loss": 0.7008475065231323, "step": 1670 }, { "epoch": 0.13, "grad_norm": 14.353055953979492, "learning_rate": 9.628476080351392e-06, "logits/chosen": -1.5037769079208374, "logits/rejected": -1.0465939044952393, "logps/chosen": -1.0048973560333252, "logps/rejected": -1.4740705490112305, "loss": 1.0509, "odds_ratio_loss": 0.4605104923248291, "rewards/accuracies": 1.0, "rewards/chosen": -0.10048973560333252, "rewards/margins": 0.04691731557250023, "rewards/rejected": -0.14740703999996185, "sft_loss": 1.0048973560333252, "step": 1675 }, { "epoch": 0.13, "grad_norm": 6.722265720367432, "learning_rate": 9.62614334535795e-06, "logits/chosen": -1.1800585985183716, "logits/rejected": -1.0569149255752563, "logps/chosen": -0.9442615509033203, "logps/rejected": -1.088505506515503, "loss": 1.0093, "odds_ratio_loss": 0.6508314609527588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09442616999149323, "rewards/margins": 0.0144243985414505, "rewards/rejected": -0.10885056108236313, "sft_loss": 0.9442615509033203, "step": 1680 }, { "epoch": 0.13, "grad_norm": 22.2547607421875, "learning_rate": 9.623803594230768e-06, "logits/chosen": -1.3695385456085205, "logits/rejected": -1.2436089515686035, "logps/chosen": -0.7560356855392456, "logps/rejected": -1.0340205430984497, "loss": 0.8104, "odds_ratio_loss": 0.5438529253005981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07560355961322784, "rewards/margins": 0.027798492461442947, "rewards/rejected": -0.10340205579996109, "sft_loss": 0.7560356855392456, "step": 1685 }, { "epoch": 0.13, "grad_norm": 7.995831489562988, "learning_rate": 9.621456830518372e-06, "logits/chosen": -1.3292655944824219, "logits/rejected": -1.1388499736785889, "logps/chosen": -1.080413579940796, "logps/rejected": -1.0560017824172974, "loss": 1.15, "odds_ratio_loss": 0.6955240368843079, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10804136842489243, "rewards/margins": -0.002441184129565954, "rewards/rejected": -0.10560017824172974, "sft_loss": 1.080413579940796, "step": 1690 }, { "epoch": 0.13, "grad_norm": 11.818976402282715, "learning_rate": 9.61910305777993e-06, "logits/chosen": -1.2284135818481445, "logits/rejected": -0.9383662939071655, "logps/chosen": -1.0092649459838867, "logps/rejected": -1.004175066947937, "loss": 1.0855, "odds_ratio_loss": 0.7619765996932983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10092649608850479, "rewards/margins": -0.0005089893820695579, "rewards/rejected": -0.10041750967502594, "sft_loss": 1.0092649459838867, "step": 1695 }, { "epoch": 0.13, "grad_norm": 5.940454483032227, "learning_rate": 9.616742279585237e-06, "logits/chosen": -1.2692519426345825, "logits/rejected": -0.5092897415161133, "logps/chosen": -1.0339243412017822, "logps/rejected": -1.1642903089523315, "loss": 1.094, "odds_ratio_loss": 0.6005213856697083, "rewards/accuracies": 1.0, "rewards/chosen": -0.10339243710041046, "rewards/margins": 0.013036603108048439, "rewards/rejected": -0.11642904579639435, "sft_loss": 1.0339243412017822, "step": 1700 }, { "epoch": 0.13, "grad_norm": 5.760742664337158, "learning_rate": 9.614374499514712e-06, "logits/chosen": -1.2022879123687744, "logits/rejected": -0.7368927001953125, "logps/chosen": -1.0404551029205322, "logps/rejected": -0.8103219866752625, "loss": 1.1465, "odds_ratio_loss": 1.0605835914611816, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10404551029205322, "rewards/margins": -0.023013316094875336, "rewards/rejected": -0.08103219419717789, "sft_loss": 1.0404551029205322, "step": 1705 }, { "epoch": 0.13, "grad_norm": 10.959287643432617, "learning_rate": 9.611999721159397e-06, "logits/chosen": -1.2368319034576416, "logits/rejected": -1.1851609945297241, "logps/chosen": -1.2158687114715576, "logps/rejected": -3.8632235527038574, "loss": 1.256, "odds_ratio_loss": 0.4010738730430603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.121586874127388, "rewards/margins": 0.26473551988601685, "rewards/rejected": -0.38632237911224365, "sft_loss": 1.2158687114715576, "step": 1710 }, { "epoch": 0.13, "grad_norm": 8.698751449584961, "learning_rate": 9.609617948120939e-06, "logits/chosen": -1.3946199417114258, "logits/rejected": -1.0465425252914429, "logps/chosen": -0.876213550567627, "logps/rejected": -3.2105965614318848, "loss": 0.9136, "odds_ratio_loss": 0.3735765218734741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08762134611606598, "rewards/margins": 0.23343829810619354, "rewards/rejected": -0.3210596740245819, "sft_loss": 0.876213550567627, "step": 1715 }, { "epoch": 0.13, "grad_norm": 5.588376522064209, "learning_rate": 9.607229184011605e-06, "logits/chosen": -1.3224467039108276, "logits/rejected": -0.8747288584709167, "logps/chosen": -1.0286288261413574, "logps/rejected": -1.219234824180603, "loss": 1.0875, "odds_ratio_loss": 0.5890880823135376, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1028628796339035, "rewards/margins": 0.019060594961047173, "rewards/rejected": -0.12192346900701523, "sft_loss": 1.0286288261413574, "step": 1720 }, { "epoch": 0.13, "grad_norm": 11.192124366760254, "learning_rate": 9.604833432454257e-06, "logits/chosen": -1.3245986700057983, "logits/rejected": -0.7286895513534546, "logps/chosen": -1.0568475723266602, "logps/rejected": -1.8521363735198975, "loss": 1.0987, "odds_ratio_loss": 0.418095201253891, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10568475723266602, "rewards/margins": 0.07952889800071716, "rewards/rejected": -0.18521365523338318, "sft_loss": 1.0568475723266602, "step": 1725 }, { "epoch": 0.13, "grad_norm": 5.898077011108398, "learning_rate": 9.602430697082357e-06, "logits/chosen": -1.3225600719451904, "logits/rejected": -0.8455519676208496, "logps/chosen": -1.1911251544952393, "logps/rejected": -2.105005979537964, "loss": 1.2411, "odds_ratio_loss": 0.49952688813209534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11911250650882721, "rewards/margins": 0.09138808399438858, "rewards/rejected": -0.2105005979537964, "sft_loss": 1.1911251544952393, "step": 1730 }, { "epoch": 0.13, "grad_norm": 17.48480987548828, "learning_rate": 9.600020981539956e-06, "logits/chosen": -1.2910171747207642, "logits/rejected": -0.7443715929985046, "logps/chosen": -1.270747423171997, "logps/rejected": -1.583164930343628, "loss": 1.3271, "odds_ratio_loss": 0.5633386373519897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12707474827766418, "rewards/margins": 0.031241733580827713, "rewards/rejected": -0.1583164930343628, "sft_loss": 1.270747423171997, "step": 1735 }, { "epoch": 0.14, "grad_norm": 24.316553115844727, "learning_rate": 9.597604289481694e-06, "logits/chosen": -1.2967002391815186, "logits/rejected": -1.2517060041427612, "logps/chosen": -1.1744412183761597, "logps/rejected": -1.2925034761428833, "loss": 1.2822, "odds_ratio_loss": 1.077358365058899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11744412034749985, "rewards/margins": 0.011806219816207886, "rewards/rejected": -0.12925033271312714, "sft_loss": 1.1744412183761597, "step": 1740 }, { "epoch": 0.14, "grad_norm": 8.544798851013184, "learning_rate": 9.595180624572796e-06, "logits/chosen": -1.1876946687698364, "logits/rejected": -0.711455762386322, "logps/chosen": -1.0169483423233032, "logps/rejected": -1.6038730144500732, "loss": 1.0643, "odds_ratio_loss": 0.47310882806777954, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10169483721256256, "rewards/margins": 0.05869249254465103, "rewards/rejected": -0.160387322306633, "sft_loss": 1.0169483423233032, "step": 1745 }, { "epoch": 0.14, "grad_norm": 11.438017845153809, "learning_rate": 9.59274999048905e-06, "logits/chosen": -1.2468347549438477, "logits/rejected": -1.1550118923187256, "logps/chosen": -1.0963854789733887, "logps/rejected": -1.1607807874679565, "loss": 1.1889, "odds_ratio_loss": 0.9248707890510559, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10963854938745499, "rewards/margins": 0.006439529359340668, "rewards/rejected": -0.11607807874679565, "sft_loss": 1.0963854789733887, "step": 1750 }, { "epoch": 0.14, "grad_norm": 10.904440879821777, "learning_rate": 9.590312390916827e-06, "logits/chosen": -1.3679497241973877, "logits/rejected": -1.2733559608459473, "logps/chosen": -0.9566739797592163, "logps/rejected": -1.2512528896331787, "loss": 1.0128, "odds_ratio_loss": 0.5608989000320435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09566739946603775, "rewards/margins": 0.029457902535796165, "rewards/rejected": -0.12512531876564026, "sft_loss": 0.9566739797592163, "step": 1755 }, { "epoch": 0.14, "grad_norm": 83.32542419433594, "learning_rate": 9.587867829553055e-06, "logits/chosen": -1.494425654411316, "logits/rejected": -1.193587064743042, "logps/chosen": -0.7924457788467407, "logps/rejected": -1.9595781564712524, "loss": 0.8297, "odds_ratio_loss": 0.37288200855255127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07924458384513855, "rewards/margins": 0.11671324074268341, "rewards/rejected": -0.19595780968666077, "sft_loss": 0.7924457788467407, "step": 1760 }, { "epoch": 0.14, "grad_norm": 6.813757419586182, "learning_rate": 9.58541631010522e-06, "logits/chosen": -1.3695342540740967, "logits/rejected": -0.6321445107460022, "logps/chosen": -0.7915887236595154, "logps/rejected": -4.873553276062012, "loss": 0.8314, "odds_ratio_loss": 0.39797115325927734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07915887981653214, "rewards/margins": 0.40819644927978516, "rewards/rejected": -0.48735538125038147, "sft_loss": 0.7915887236595154, "step": 1765 }, { "epoch": 0.14, "grad_norm": 94.25627899169922, "learning_rate": 9.582957836291365e-06, "logits/chosen": -1.3012911081314087, "logits/rejected": -1.3341959714889526, "logps/chosen": -1.2055413722991943, "logps/rejected": -5.983010292053223, "loss": 1.2533, "odds_ratio_loss": 0.4779466986656189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12055414915084839, "rewards/margins": 0.4777469038963318, "rewards/rejected": -0.5983010530471802, "sft_loss": 1.2055413722991943, "step": 1770 }, { "epoch": 0.14, "grad_norm": 9.417245864868164, "learning_rate": 9.580492411840074e-06, "logits/chosen": -1.2206226587295532, "logits/rejected": -1.2017875909805298, "logps/chosen": -1.0882132053375244, "logps/rejected": -2.914821147918701, "loss": 1.1314, "odds_ratio_loss": 0.43143337965011597, "rewards/accuracies": 1.0, "rewards/chosen": -0.1088213175535202, "rewards/margins": 0.1826608031988144, "rewards/rejected": -0.291482150554657, "sft_loss": 1.0882132053375244, "step": 1775 }, { "epoch": 0.14, "grad_norm": 6.386961936950684, "learning_rate": 9.57802004049048e-06, "logits/chosen": -1.163529396057129, "logits/rejected": -0.9100425839424133, "logps/chosen": -0.9114856719970703, "logps/rejected": -1.1690971851348877, "loss": 0.976, "odds_ratio_loss": 0.6448505520820618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09114857017993927, "rewards/margins": 0.025761157274246216, "rewards/rejected": -0.11690972000360489, "sft_loss": 0.9114856719970703, "step": 1780 }, { "epoch": 0.14, "grad_norm": 8.349102020263672, "learning_rate": 9.575540725992247e-06, "logits/chosen": -1.204837441444397, "logits/rejected": -0.7992308139801025, "logps/chosen": -1.0231040716171265, "logps/rejected": -1.9079253673553467, "loss": 1.0584, "odds_ratio_loss": 0.3529992401599884, "rewards/accuracies": 1.0, "rewards/chosen": -0.10231040418148041, "rewards/margins": 0.08848213404417038, "rewards/rejected": -0.1907925307750702, "sft_loss": 1.0231040716171265, "step": 1785 }, { "epoch": 0.14, "grad_norm": 6.984671115875244, "learning_rate": 9.573054472105569e-06, "logits/chosen": -1.160017967224121, "logits/rejected": -0.872658908367157, "logps/chosen": -1.000333309173584, "logps/rejected": -1.3170549869537354, "loss": 1.0523, "odds_ratio_loss": 0.5200861692428589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10003332793712616, "rewards/margins": 0.031672172248363495, "rewards/rejected": -0.13170549273490906, "sft_loss": 1.000333309173584, "step": 1790 }, { "epoch": 0.14, "grad_norm": 122.16671752929688, "learning_rate": 9.570561282601167e-06, "logits/chosen": -1.2130086421966553, "logits/rejected": -0.7942731380462646, "logps/chosen": -1.1537272930145264, "logps/rejected": -1.1185890436172485, "loss": 1.3542, "odds_ratio_loss": 2.0043020248413086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11537273228168488, "rewards/margins": -0.003513835370540619, "rewards/rejected": -0.11185890436172485, "sft_loss": 1.1537272930145264, "step": 1795 }, { "epoch": 0.14, "grad_norm": 6.3150434494018555, "learning_rate": 9.568061161260278e-06, "logits/chosen": -1.4022128582000732, "logits/rejected": -1.072725772857666, "logps/chosen": -0.8498795628547668, "logps/rejected": -3.3911800384521484, "loss": 0.8751, "odds_ratio_loss": 0.25199708342552185, "rewards/accuracies": 1.0, "rewards/chosen": -0.08498796075582504, "rewards/margins": 0.2541300654411316, "rewards/rejected": -0.33911800384521484, "sft_loss": 0.8498795628547668, "step": 1800 }, { "epoch": 0.14, "grad_norm": 52.85586929321289, "learning_rate": 9.565554111874656e-06, "logits/chosen": -1.3249465227127075, "logits/rejected": -1.126591682434082, "logps/chosen": -1.2122596502304077, "logps/rejected": -2.6743364334106445, "loss": 1.2655, "odds_ratio_loss": 0.5320409536361694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12122596800327301, "rewards/margins": 0.14620766043663025, "rewards/rejected": -0.26743364334106445, "sft_loss": 1.2122596502304077, "step": 1805 }, { "epoch": 0.14, "grad_norm": 12.57426643371582, "learning_rate": 9.563040138246555e-06, "logits/chosen": -1.338123083114624, "logits/rejected": -0.9952232241630554, "logps/chosen": -1.2487857341766357, "logps/rejected": -4.27414083480835, "loss": 1.2862, "odds_ratio_loss": 0.3736916184425354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12487858533859253, "rewards/margins": 0.3025355041027069, "rewards/rejected": -0.4274141192436218, "sft_loss": 1.2487857341766357, "step": 1810 }, { "epoch": 0.14, "grad_norm": 5.040468692779541, "learning_rate": 9.560519244188741e-06, "logits/chosen": -1.1609458923339844, "logits/rejected": -1.012904405593872, "logps/chosen": -0.879071056842804, "logps/rejected": -2.6562893390655518, "loss": 0.8998, "odds_ratio_loss": 0.20734456181526184, "rewards/accuracies": 1.0, "rewards/chosen": -0.0879070982336998, "rewards/margins": 0.17772184312343597, "rewards/rejected": -0.2656289339065552, "sft_loss": 0.879071056842804, "step": 1815 }, { "epoch": 0.14, "grad_norm": 7.037989616394043, "learning_rate": 9.557991433524465e-06, "logits/chosen": -1.0672191381454468, "logits/rejected": -0.8686882257461548, "logps/chosen": -0.988175094127655, "logps/rejected": -3.527780055999756, "loss": 1.0157, "odds_ratio_loss": 0.27490872144699097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09881751239299774, "rewards/margins": 0.2539605498313904, "rewards/rejected": -0.35277801752090454, "sft_loss": 0.988175094127655, "step": 1820 }, { "epoch": 0.14, "grad_norm": 14.428661346435547, "learning_rate": 9.555456710087476e-06, "logits/chosen": -1.3363722562789917, "logits/rejected": -0.9723957777023315, "logps/chosen": -1.0374715328216553, "logps/rejected": -1.990022897720337, "loss": 1.0726, "odds_ratio_loss": 0.35115641355514526, "rewards/accuracies": 1.0, "rewards/chosen": -0.1037471666932106, "rewards/margins": 0.09525513648986816, "rewards/rejected": -0.19900229573249817, "sft_loss": 1.0374715328216553, "step": 1825 }, { "epoch": 0.14, "grad_norm": 8.201395988464355, "learning_rate": 9.552915077722002e-06, "logits/chosen": -1.3187824487686157, "logits/rejected": -0.9559124112129211, "logps/chosen": -0.977192759513855, "logps/rejected": -1.3955583572387695, "loss": 1.0304, "odds_ratio_loss": 0.5320570468902588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09771927446126938, "rewards/margins": 0.041836559772491455, "rewards/rejected": -0.13955584168434143, "sft_loss": 0.977192759513855, "step": 1830 }, { "epoch": 0.14, "grad_norm": 5.69422721862793, "learning_rate": 9.550366540282753e-06, "logits/chosen": -1.3487383127212524, "logits/rejected": -0.5610469579696655, "logps/chosen": -1.0741031169891357, "logps/rejected": -1.5127952098846436, "loss": 1.1234, "odds_ratio_loss": 0.4929905831813812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10741032660007477, "rewards/margins": 0.04386921599507332, "rewards/rejected": -0.1512795388698578, "sft_loss": 1.0741031169891357, "step": 1835 }, { "epoch": 0.14, "grad_norm": 27.743892669677734, "learning_rate": 9.54781110163491e-06, "logits/chosen": -1.2624667882919312, "logits/rejected": -0.8160993456840515, "logps/chosen": -0.9436119794845581, "logps/rejected": -1.8064286708831787, "loss": 0.9741, "odds_ratio_loss": 0.3050013780593872, "rewards/accuracies": 1.0, "rewards/chosen": -0.09436120092868805, "rewards/margins": 0.0862816870212555, "rewards/rejected": -0.18064287304878235, "sft_loss": 0.9436119794845581, "step": 1840 }, { "epoch": 0.14, "grad_norm": 21.507509231567383, "learning_rate": 9.545248765654116e-06, "logits/chosen": -1.3653714656829834, "logits/rejected": -1.1635246276855469, "logps/chosen": -1.2835218906402588, "logps/rejected": -4.833404541015625, "loss": 1.3401, "odds_ratio_loss": 0.5661024451255798, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.12835219502449036, "rewards/margins": 0.3549882471561432, "rewards/rejected": -0.48334044218063354, "sft_loss": 1.2835218906402588, "step": 1845 }, { "epoch": 0.14, "grad_norm": 5.130289077758789, "learning_rate": 9.542679536226483e-06, "logits/chosen": -1.1667282581329346, "logits/rejected": -0.9166663289070129, "logps/chosen": -1.0290402173995972, "logps/rejected": -2.8194077014923096, "loss": 1.0678, "odds_ratio_loss": 0.3873458802700043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10290402173995972, "rewards/margins": 0.17903673648834229, "rewards/rejected": -0.281940758228302, "sft_loss": 1.0290402173995972, "step": 1850 }, { "epoch": 0.14, "grad_norm": 55.149497985839844, "learning_rate": 9.540103417248572e-06, "logits/chosen": -1.3182549476623535, "logits/rejected": -0.9496681094169617, "logps/chosen": -0.9547656774520874, "logps/rejected": -1.39925217628479, "loss": 1.0033, "odds_ratio_loss": 0.485520601272583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09547658264636993, "rewards/margins": 0.04444863274693489, "rewards/rejected": -0.13992521166801453, "sft_loss": 0.9547656774520874, "step": 1855 }, { "epoch": 0.14, "grad_norm": 5.448976039886475, "learning_rate": 9.537520412627395e-06, "logits/chosen": -1.3200477361679077, "logits/rejected": -0.9220551252365112, "logps/chosen": -1.221671223640442, "logps/rejected": -3.222076416015625, "loss": 1.2677, "odds_ratio_loss": 0.4604717195034027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12216712534427643, "rewards/margins": 0.2000405341386795, "rewards/rejected": -0.32220762968063354, "sft_loss": 1.221671223640442, "step": 1860 }, { "epoch": 0.15, "grad_norm": 17.277822494506836, "learning_rate": 9.534930526280406e-06, "logits/chosen": -1.3141456842422485, "logits/rejected": -1.0591038465499878, "logps/chosen": -1.1151491403579712, "logps/rejected": -5.379051208496094, "loss": 1.1551, "odds_ratio_loss": 0.39921683073043823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11151491105556488, "rewards/margins": 0.42639026045799255, "rewards/rejected": -0.5379050970077515, "sft_loss": 1.1151491403579712, "step": 1865 }, { "epoch": 0.15, "grad_norm": 19.059919357299805, "learning_rate": 9.532333762135498e-06, "logits/chosen": -1.5041942596435547, "logits/rejected": -1.2156248092651367, "logps/chosen": -0.9729844331741333, "logps/rejected": -0.8604519963264465, "loss": 1.0619, "odds_ratio_loss": 0.8896477818489075, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09729844331741333, "rewards/margins": -0.0112532377243042, "rewards/rejected": -0.08604519814252853, "sft_loss": 0.9729844331741333, "step": 1870 }, { "epoch": 0.15, "grad_norm": 5.341587066650391, "learning_rate": 9.52973012413099e-06, "logits/chosen": -1.4371575117111206, "logits/rejected": -0.9742057919502258, "logps/chosen": -1.0960537195205688, "logps/rejected": -1.8433376550674438, "loss": 1.1446, "odds_ratio_loss": 0.48561111092567444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10960537195205688, "rewards/margins": 0.07472839951515198, "rewards/rejected": -0.18433377146720886, "sft_loss": 1.0960537195205688, "step": 1875 }, { "epoch": 0.15, "grad_norm": 13.053421974182129, "learning_rate": 9.527119616215632e-06, "logits/chosen": -1.3161237239837646, "logits/rejected": -1.1920946836471558, "logps/chosen": -0.9443743824958801, "logps/rejected": -5.979962348937988, "loss": 0.9891, "odds_ratio_loss": 0.4469161629676819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09443744271993637, "rewards/margins": 0.5035587549209595, "rewards/rejected": -0.5979962348937988, "sft_loss": 0.9443743824958801, "step": 1880 }, { "epoch": 0.15, "grad_norm": 7.233993053436279, "learning_rate": 9.524502242348592e-06, "logits/chosen": -1.0858581066131592, "logits/rejected": -0.6852259635925293, "logps/chosen": -1.0311671495437622, "logps/rejected": -1.154486894607544, "loss": 1.0932, "odds_ratio_loss": 0.6205244064331055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1031167283654213, "rewards/margins": 0.012331971898674965, "rewards/rejected": -0.11544869095087051, "sft_loss": 1.0311671495437622, "step": 1885 }, { "epoch": 0.15, "grad_norm": 6.396711826324463, "learning_rate": 9.521878006499447e-06, "logits/chosen": -1.2524656057357788, "logits/rejected": -0.5958696007728577, "logps/chosen": -1.1098922491073608, "logps/rejected": -6.758184909820557, "loss": 1.1436, "odds_ratio_loss": 0.33697599172592163, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11098922789096832, "rewards/margins": 0.5648292303085327, "rewards/rejected": -0.6758184432983398, "sft_loss": 1.1098922491073608, "step": 1890 }, { "epoch": 0.15, "grad_norm": 10.993759155273438, "learning_rate": 9.519246912648186e-06, "logits/chosen": -1.3063591718673706, "logits/rejected": -0.8458372354507446, "logps/chosen": -0.8645821809768677, "logps/rejected": -1.1725999116897583, "loss": 0.918, "odds_ratio_loss": 0.5336938500404358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.086458221077919, "rewards/margins": 0.030801767483353615, "rewards/rejected": -0.11725999414920807, "sft_loss": 0.8645821809768677, "step": 1895 }, { "epoch": 0.15, "grad_norm": 5.961607933044434, "learning_rate": 9.516608964785196e-06, "logits/chosen": -1.3859764337539673, "logits/rejected": -1.096915602684021, "logps/chosen": -0.8092068433761597, "logps/rejected": -2.0697178840637207, "loss": 0.8515, "odds_ratio_loss": 0.42318421602249146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08092068880796432, "rewards/margins": 0.12605109810829163, "rewards/rejected": -0.20697179436683655, "sft_loss": 0.8092068433761597, "step": 1900 }, { "epoch": 0.15, "grad_norm": 7.101929187774658, "learning_rate": 9.513964166911258e-06, "logits/chosen": -1.4362437725067139, "logits/rejected": -1.1196719408035278, "logps/chosen": -0.7003141641616821, "logps/rejected": -0.7816325426101685, "loss": 0.7882, "odds_ratio_loss": 0.8789259791374207, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07003141939640045, "rewards/margins": 0.008131841197609901, "rewards/rejected": -0.0781632512807846, "sft_loss": 0.7003141641616821, "step": 1905 }, { "epoch": 0.15, "grad_norm": 6.640874862670898, "learning_rate": 9.511312523037549e-06, "logits/chosen": -1.416475772857666, "logits/rejected": -0.9427222013473511, "logps/chosen": -0.9190031290054321, "logps/rejected": -0.7494014501571655, "loss": 1.0176, "odds_ratio_loss": 0.9855034947395325, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.0919003039598465, "rewards/margins": -0.01696016639471054, "rewards/rejected": -0.07494015246629715, "sft_loss": 0.9190031290054321, "step": 1910 }, { "epoch": 0.15, "grad_norm": 6.442885875701904, "learning_rate": 9.508654037185619e-06, "logits/chosen": -1.3226420879364014, "logits/rejected": -1.0659013986587524, "logps/chosen": -1.0546033382415771, "logps/rejected": -1.6016933917999268, "loss": 1.1363, "odds_ratio_loss": 0.8171154260635376, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10546033084392548, "rewards/margins": 0.054708998650312424, "rewards/rejected": -0.1601693332195282, "sft_loss": 1.0546033382415771, "step": 1915 }, { "epoch": 0.15, "grad_norm": 4.770626068115234, "learning_rate": 9.505988713387398e-06, "logits/chosen": -1.2759541273117065, "logits/rejected": -0.9872153997421265, "logps/chosen": -1.1276299953460693, "logps/rejected": -1.5661379098892212, "loss": 1.181, "odds_ratio_loss": 0.5340844392776489, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11276300996541977, "rewards/margins": 0.04385078325867653, "rewards/rejected": -0.1566137969493866, "sft_loss": 1.1276299953460693, "step": 1920 }, { "epoch": 0.15, "grad_norm": 12.970624923706055, "learning_rate": 9.503316555685194e-06, "logits/chosen": -1.363874912261963, "logits/rejected": -0.482523113489151, "logps/chosen": -1.0931895971298218, "logps/rejected": -1.794382095336914, "loss": 1.1275, "odds_ratio_loss": 0.343106746673584, "rewards/accuracies": 1.0, "rewards/chosen": -0.10931895673274994, "rewards/margins": 0.07011924684047699, "rewards/rejected": -0.17943820357322693, "sft_loss": 1.0931895971298218, "step": 1925 }, { "epoch": 0.15, "grad_norm": 7.856452465057373, "learning_rate": 9.500637568131667e-06, "logits/chosen": -1.2575715780258179, "logits/rejected": -1.1664985418319702, "logps/chosen": -0.9274052381515503, "logps/rejected": -1.0278116464614868, "loss": 0.9976, "odds_ratio_loss": 0.7020447850227356, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09274052828550339, "rewards/margins": 0.010040633380413055, "rewards/rejected": -0.10278116166591644, "sft_loss": 0.9274052381515503, "step": 1930 }, { "epoch": 0.15, "grad_norm": 11.970571517944336, "learning_rate": 9.497951754789847e-06, "logits/chosen": -1.3612158298492432, "logits/rejected": -0.9841570854187012, "logps/chosen": -1.069284439086914, "logps/rejected": -3.824751377105713, "loss": 1.1041, "odds_ratio_loss": 0.34776008129119873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10692846775054932, "rewards/margins": 0.275546669960022, "rewards/rejected": -0.3824751377105713, "sft_loss": 1.069284439086914, "step": 1935 }, { "epoch": 0.15, "grad_norm": 32.797340393066406, "learning_rate": 9.495259119733108e-06, "logits/chosen": -1.3850946426391602, "logits/rejected": -1.2886738777160645, "logps/chosen": -0.9914947748184204, "logps/rejected": -1.1922539472579956, "loss": 1.0509, "odds_ratio_loss": 0.5942121744155884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09914946556091309, "rewards/margins": 0.020075935870409012, "rewards/rejected": -0.1192254051566124, "sft_loss": 0.9914947748184204, "step": 1940 }, { "epoch": 0.15, "grad_norm": 11.763391494750977, "learning_rate": 9.492559667045174e-06, "logits/chosen": -1.273818016052246, "logits/rejected": -1.1597501039505005, "logps/chosen": -0.7548612952232361, "logps/rejected": -8.429471015930176, "loss": 0.7835, "odds_ratio_loss": 0.2863296866416931, "rewards/accuracies": 1.0, "rewards/chosen": -0.07548613101243973, "rewards/margins": 0.7674610018730164, "rewards/rejected": -0.8429471254348755, "sft_loss": 0.7548612952232361, "step": 1945 }, { "epoch": 0.15, "grad_norm": 11.239775657653809, "learning_rate": 9.489853400820106e-06, "logits/chosen": -1.252996563911438, "logits/rejected": -0.5567450523376465, "logps/chosen": -0.9976485967636108, "logps/rejected": -2.493691921234131, "loss": 1.0317, "odds_ratio_loss": 0.3405466675758362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0997648537158966, "rewards/margins": 0.14960432052612305, "rewards/rejected": -0.24936918914318085, "sft_loss": 0.9976485967636108, "step": 1950 }, { "epoch": 0.15, "grad_norm": 8.288493156433105, "learning_rate": 9.487140325162303e-06, "logits/chosen": -1.3895283937454224, "logits/rejected": -1.0272207260131836, "logps/chosen": -0.6770743131637573, "logps/rejected": -3.0709948539733887, "loss": 0.7256, "odds_ratio_loss": 0.48501911759376526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06770743429660797, "rewards/margins": 0.23939207196235657, "rewards/rejected": -0.30709946155548096, "sft_loss": 0.6770743131637573, "step": 1955 }, { "epoch": 0.15, "grad_norm": 41.745906829833984, "learning_rate": 9.484420444186486e-06, "logits/chosen": -1.0695884227752686, "logits/rejected": -0.9039579629898071, "logps/chosen": -0.7909213304519653, "logps/rejected": -0.9298604130744934, "loss": 0.8515, "odds_ratio_loss": 0.6061114072799683, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07909214496612549, "rewards/margins": 0.013893899507820606, "rewards/rejected": -0.09298603981733322, "sft_loss": 0.7909213304519653, "step": 1960 }, { "epoch": 0.15, "grad_norm": 5.356940746307373, "learning_rate": 9.481693762017702e-06, "logits/chosen": -1.3417937755584717, "logits/rejected": -0.7301517724990845, "logps/chosen": -1.1014509201049805, "logps/rejected": -1.3265063762664795, "loss": 1.1633, "odds_ratio_loss": 0.6179971694946289, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11014509201049805, "rewards/margins": 0.022505560889840126, "rewards/rejected": -0.13265065848827362, "sft_loss": 1.1014509201049805, "step": 1965 }, { "epoch": 0.15, "grad_norm": 33.3004035949707, "learning_rate": 9.47896028279131e-06, "logits/chosen": -1.3251553773880005, "logits/rejected": -0.9696298837661743, "logps/chosen": -1.0699702501296997, "logps/rejected": -0.8503786325454712, "loss": 1.1599, "odds_ratio_loss": 0.8989558219909668, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10699701309204102, "rewards/margins": -0.021959159523248672, "rewards/rejected": -0.08503787219524384, "sft_loss": 1.0699702501296997, "step": 1970 }, { "epoch": 0.15, "grad_norm": 8.4285888671875, "learning_rate": 9.476220010652978e-06, "logits/chosen": -1.1229819059371948, "logits/rejected": -0.9317318201065063, "logps/chosen": -0.8251851797103882, "logps/rejected": -0.8069744110107422, "loss": 0.9044, "odds_ratio_loss": 0.7926350831985474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08251851797103882, "rewards/margins": -0.0018210865091532469, "rewards/rejected": -0.0806974321603775, "sft_loss": 0.8251851797103882, "step": 1975 }, { "epoch": 0.15, "grad_norm": 8.010242462158203, "learning_rate": 9.473472949758677e-06, "logits/chosen": -1.1807481050491333, "logits/rejected": -0.8190022706985474, "logps/chosen": -1.0934984683990479, "logps/rejected": -1.292314887046814, "loss": 1.1505, "odds_ratio_loss": 0.5697231292724609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10934984683990479, "rewards/margins": 0.019881639629602432, "rewards/rejected": -0.1292314976453781, "sft_loss": 1.0934984683990479, "step": 1980 }, { "epoch": 0.15, "grad_norm": 5.987873554229736, "learning_rate": 9.470719104274675e-06, "logits/chosen": -1.3005092144012451, "logits/rejected": -0.6768913269042969, "logps/chosen": -1.0687867403030396, "logps/rejected": -1.6101669073104858, "loss": 1.1172, "odds_ratio_loss": 0.4846063554286957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10687867552042007, "rewards/margins": 0.054138004779815674, "rewards/rejected": -0.16101667284965515, "sft_loss": 1.0687867403030396, "step": 1985 }, { "epoch": 0.15, "grad_norm": 15.15396785736084, "learning_rate": 9.467958478377525e-06, "logits/chosen": -1.2687232494354248, "logits/rejected": -0.8056892156600952, "logps/chosen": -1.0249241590499878, "logps/rejected": -1.5329560041427612, "loss": 1.07, "odds_ratio_loss": 0.4506935477256775, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10249241441488266, "rewards/margins": 0.05080317705869675, "rewards/rejected": -0.1532956063747406, "sft_loss": 1.0249241590499878, "step": 1990 }, { "epoch": 0.16, "grad_norm": 33.13812255859375, "learning_rate": 9.465191076254067e-06, "logits/chosen": -1.2532131671905518, "logits/rejected": -1.1098170280456543, "logps/chosen": -0.9342479705810547, "logps/rejected": -1.725606918334961, "loss": 0.9789, "odds_ratio_loss": 0.44632625579833984, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09342481195926666, "rewards/margins": 0.07913590222597122, "rewards/rejected": -0.1725607067346573, "sft_loss": 0.9342479705810547, "step": 1995 }, { "epoch": 0.16, "grad_norm": 5.594306468963623, "learning_rate": 9.462416902101422e-06, "logits/chosen": -1.0992923974990845, "logits/rejected": -0.8452315330505371, "logps/chosen": -1.4404175281524658, "logps/rejected": -2.1999869346618652, "loss": 1.502, "odds_ratio_loss": 0.6154400110244751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1440417468547821, "rewards/margins": 0.07595695555210114, "rewards/rejected": -0.21999871730804443, "sft_loss": 1.4404175281524658, "step": 2000 }, { "epoch": 0.16, "grad_norm": 15.01916217803955, "learning_rate": 9.459635960126973e-06, "logits/chosen": -1.1773990392684937, "logits/rejected": -0.8374426960945129, "logps/chosen": -0.9732074737548828, "logps/rejected": -1.0846574306488037, "loss": 1.073, "odds_ratio_loss": 0.9976279139518738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09732075035572052, "rewards/margins": 0.011144992895424366, "rewards/rejected": -0.10846574604511261, "sft_loss": 0.9732074737548828, "step": 2005 }, { "epoch": 0.16, "grad_norm": 3.13708233833313, "learning_rate": 9.456848254548373e-06, "logits/chosen": -1.4408618211746216, "logits/rejected": -0.8943861722946167, "logps/chosen": -1.1539796590805054, "logps/rejected": -3.8530449867248535, "loss": 1.1918, "odds_ratio_loss": 0.37784868478775024, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11539797484874725, "rewards/margins": 0.26990655064582825, "rewards/rejected": -0.3853045105934143, "sft_loss": 1.1539796590805054, "step": 2010 }, { "epoch": 0.16, "grad_norm": 10.954537391662598, "learning_rate": 9.454053789593532e-06, "logits/chosen": -1.28227698802948, "logits/rejected": -0.8783136606216431, "logps/chosen": -0.9702832102775574, "logps/rejected": -2.25543475151062, "loss": 1.0171, "odds_ratio_loss": 0.4677762985229492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09702832996845245, "rewards/margins": 0.12851516902446747, "rewards/rejected": -0.22554349899291992, "sft_loss": 0.9702832102775574, "step": 2015 }, { "epoch": 0.16, "grad_norm": 5.139076232910156, "learning_rate": 9.451252569500609e-06, "logits/chosen": -1.4350674152374268, "logits/rejected": -1.1181226968765259, "logps/chosen": -1.0790382623672485, "logps/rejected": -2.726478099822998, "loss": 1.1174, "odds_ratio_loss": 0.38367173075675964, "rewards/accuracies": 1.0, "rewards/chosen": -0.10790381580591202, "rewards/margins": 0.16474398970603943, "rewards/rejected": -0.27264779806137085, "sft_loss": 1.0790382623672485, "step": 2020 }, { "epoch": 0.16, "grad_norm": 17.504884719848633, "learning_rate": 9.448444598518013e-06, "logits/chosen": -1.28285813331604, "logits/rejected": -0.5891727209091187, "logps/chosen": -1.0441431999206543, "logps/rejected": -1.1051876544952393, "loss": 1.1253, "odds_ratio_loss": 0.8114008903503418, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10441432148218155, "rewards/margins": 0.006104458123445511, "rewards/rejected": -0.11051877588033676, "sft_loss": 1.0441431999206543, "step": 2025 }, { "epoch": 0.16, "grad_norm": 8.165739059448242, "learning_rate": 9.445629880904386e-06, "logits/chosen": -1.2738823890686035, "logits/rejected": -1.1569932699203491, "logps/chosen": -0.8925234079360962, "logps/rejected": -1.6296312808990479, "loss": 0.925, "odds_ratio_loss": 0.3249626159667969, "rewards/accuracies": 1.0, "rewards/chosen": -0.08925233781337738, "rewards/margins": 0.07371079176664352, "rewards/rejected": -0.1629631370306015, "sft_loss": 0.8925234079360962, "step": 2030 }, { "epoch": 0.16, "grad_norm": 7.672553539276123, "learning_rate": 9.442808420928606e-06, "logits/chosen": -1.3439207077026367, "logits/rejected": -0.768078625202179, "logps/chosen": -0.9887423515319824, "logps/rejected": -2.255394220352173, "loss": 1.0238, "odds_ratio_loss": 0.3510589003562927, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09887423366308212, "rewards/margins": 0.1266651749610901, "rewards/rejected": -0.2255394011735916, "sft_loss": 0.9887423515319824, "step": 2035 }, { "epoch": 0.16, "grad_norm": 5.961903095245361, "learning_rate": 9.439980222869774e-06, "logits/chosen": -1.3768771886825562, "logits/rejected": -0.7793500423431396, "logps/chosen": -1.2942712306976318, "logps/rejected": -1.8602949380874634, "loss": 1.3402, "odds_ratio_loss": 0.4595089852809906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12942712008953094, "rewards/margins": 0.056602366268634796, "rewards/rejected": -0.18602950870990753, "sft_loss": 1.2942712306976318, "step": 2040 }, { "epoch": 0.16, "grad_norm": 33.816226959228516, "learning_rate": 9.437145291017213e-06, "logits/chosen": -1.2508987188339233, "logits/rejected": -1.352567434310913, "logps/chosen": -0.686786413192749, "logps/rejected": -1.241456151008606, "loss": 0.7285, "odds_ratio_loss": 0.4171048700809479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06867863982915878, "rewards/margins": 0.05546698719263077, "rewards/rejected": -0.12414562702178955, "sft_loss": 0.686786413192749, "step": 2045 }, { "epoch": 0.16, "grad_norm": 5.645260810852051, "learning_rate": 9.434303629670456e-06, "logits/chosen": -1.3583166599273682, "logits/rejected": -0.9741853475570679, "logps/chosen": -0.840481162071228, "logps/rejected": -1.6484066247940063, "loss": 0.9071, "odds_ratio_loss": 0.6664353609085083, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08404812216758728, "rewards/margins": 0.08079256117343903, "rewards/rejected": -0.1648406684398651, "sft_loss": 0.840481162071228, "step": 2050 }, { "epoch": 0.16, "grad_norm": 26.574750900268555, "learning_rate": 9.431455243139242e-06, "logits/chosen": -1.1145718097686768, "logits/rejected": -1.1763341426849365, "logps/chosen": -1.0447132587432861, "logps/rejected": -0.8651531934738159, "loss": 1.1318, "odds_ratio_loss": 0.8704781532287598, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10447131097316742, "rewards/margins": -0.017955996096134186, "rewards/rejected": -0.08651532232761383, "sft_loss": 1.0447132587432861, "step": 2055 }, { "epoch": 0.16, "grad_norm": 84.78226470947266, "learning_rate": 9.428600135743514e-06, "logits/chosen": -1.399448037147522, "logits/rejected": -0.9923950433731079, "logps/chosen": -0.8076363801956177, "logps/rejected": -1.329791784286499, "loss": 0.8522, "odds_ratio_loss": 0.4460652470588684, "rewards/accuracies": 1.0, "rewards/chosen": -0.08076363801956177, "rewards/margins": 0.05221554636955261, "rewards/rejected": -0.13297918438911438, "sft_loss": 0.8076363801956177, "step": 2060 }, { "epoch": 0.16, "grad_norm": 11.356202125549316, "learning_rate": 9.425738311813403e-06, "logits/chosen": -1.3921433687210083, "logits/rejected": -1.068137764930725, "logps/chosen": -1.303889513015747, "logps/rejected": -1.587685227394104, "loss": 1.373, "odds_ratio_loss": 0.6907719373703003, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13038896024227142, "rewards/margins": 0.02837957814335823, "rewards/rejected": -0.15876853466033936, "sft_loss": 1.303889513015747, "step": 2065 }, { "epoch": 0.16, "grad_norm": 26.550935745239258, "learning_rate": 9.422869775689227e-06, "logits/chosen": -1.0806728601455688, "logits/rejected": -1.7661387920379639, "logps/chosen": -1.0194052457809448, "logps/rejected": -7.546328544616699, "loss": 1.0475, "odds_ratio_loss": 0.28096455335617065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10194053500890732, "rewards/margins": 0.6526923179626465, "rewards/rejected": -0.754632830619812, "sft_loss": 1.0194052457809448, "step": 2070 }, { "epoch": 0.16, "grad_norm": 7.983142375946045, "learning_rate": 9.419994531721488e-06, "logits/chosen": -1.3471333980560303, "logits/rejected": -1.0253812074661255, "logps/chosen": -0.8576242327690125, "logps/rejected": -1.83051335811615, "loss": 0.888, "odds_ratio_loss": 0.30377694964408875, "rewards/accuracies": 1.0, "rewards/chosen": -0.08576242625713348, "rewards/margins": 0.09728892892599106, "rewards/rejected": -0.18305133283138275, "sft_loss": 0.8576242327690125, "step": 2075 }, { "epoch": 0.16, "grad_norm": 7.150165557861328, "learning_rate": 9.417112584270858e-06, "logits/chosen": -1.5308114290237427, "logits/rejected": -1.1309223175048828, "logps/chosen": -1.0225160121917725, "logps/rejected": -5.807229042053223, "loss": 1.0527, "odds_ratio_loss": 0.3023206293582916, "rewards/accuracies": 1.0, "rewards/chosen": -0.10225160419940948, "rewards/margins": 0.4784712791442871, "rewards/rejected": -0.5807229280471802, "sft_loss": 1.0225160121917725, "step": 2080 }, { "epoch": 0.16, "grad_norm": 36.25994110107422, "learning_rate": 9.414223937708175e-06, "logits/chosen": -1.3622469902038574, "logits/rejected": -1.109290361404419, "logps/chosen": -0.9507700204849243, "logps/rejected": -1.2247527837753296, "loss": 1.0037, "odds_ratio_loss": 0.5290084481239319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09507700055837631, "rewards/margins": 0.027398282662034035, "rewards/rejected": -0.1224752888083458, "sft_loss": 0.9507700204849243, "step": 2085 }, { "epoch": 0.16, "grad_norm": 6.515985488891602, "learning_rate": 9.411328596414439e-06, "logits/chosen": -1.2988791465759277, "logits/rejected": -1.144550085067749, "logps/chosen": -1.0585949420928955, "logps/rejected": -1.748295783996582, "loss": 1.1064, "odds_ratio_loss": 0.4781731963157654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10585949569940567, "rewards/margins": 0.06897006928920746, "rewards/rejected": -0.17482957243919373, "sft_loss": 1.0585949420928955, "step": 2090 }, { "epoch": 0.16, "grad_norm": 53.535892486572266, "learning_rate": 9.4084265647808e-06, "logits/chosen": -1.3115184307098389, "logits/rejected": -1.0028080940246582, "logps/chosen": -0.8936856389045715, "logps/rejected": -2.9408774375915527, "loss": 0.9272, "odds_ratio_loss": 0.3351721465587616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0893685594201088, "rewards/margins": 0.2047191858291626, "rewards/rejected": -0.2940877377986908, "sft_loss": 0.8936856389045715, "step": 2095 }, { "epoch": 0.16, "grad_norm": 4.780445098876953, "learning_rate": 9.405517847208562e-06, "logits/chosen": -1.1145271062850952, "logits/rejected": -0.7654793858528137, "logps/chosen": -1.0037806034088135, "logps/rejected": -1.2181943655014038, "loss": 1.0649, "odds_ratio_loss": 0.6109730005264282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10037805885076523, "rewards/margins": 0.021441373974084854, "rewards/rejected": -0.12181943655014038, "sft_loss": 1.0037806034088135, "step": 2100 }, { "epoch": 0.16, "grad_norm": 20.804689407348633, "learning_rate": 9.402602448109163e-06, "logits/chosen": -1.0638505220413208, "logits/rejected": -0.7821773290634155, "logps/chosen": -1.1113550662994385, "logps/rejected": -1.3830729722976685, "loss": 1.1701, "odds_ratio_loss": 0.587774932384491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11113552004098892, "rewards/margins": 0.027171779423952103, "rewards/rejected": -0.13830728828907013, "sft_loss": 1.1113550662994385, "step": 2105 }, { "epoch": 0.16, "grad_norm": 16.03834342956543, "learning_rate": 9.399680371904174e-06, "logits/chosen": -0.8229387998580933, "logits/rejected": -1.0971095561981201, "logps/chosen": -0.6263498067855835, "logps/rejected": -1.5988829135894775, "loss": 0.6522, "odds_ratio_loss": 0.25828564167022705, "rewards/accuracies": 1.0, "rewards/chosen": -0.06263498961925507, "rewards/margins": 0.09725330770015717, "rewards/rejected": -0.15988829731941223, "sft_loss": 0.6263498067855835, "step": 2110 }, { "epoch": 0.16, "grad_norm": 52.247398376464844, "learning_rate": 9.396751623025297e-06, "logits/chosen": -1.418639898300171, "logits/rejected": -0.9515444040298462, "logps/chosen": -1.0927083492279053, "logps/rejected": -3.6967151165008545, "loss": 1.1338, "odds_ratio_loss": 0.4113377630710602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10927083343267441, "rewards/margins": 0.2604006826877594, "rewards/rejected": -0.3696715235710144, "sft_loss": 1.0927083492279053, "step": 2115 }, { "epoch": 0.16, "grad_norm": 15.996710777282715, "learning_rate": 9.393816205914348e-06, "logits/chosen": -1.182966947555542, "logits/rejected": -0.7741319537162781, "logps/chosen": -0.864666759967804, "logps/rejected": -1.3445454835891724, "loss": 0.9136, "odds_ratio_loss": 0.48978710174560547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08646667748689651, "rewards/margins": 0.04798787087202072, "rewards/rejected": -0.13445454835891724, "sft_loss": 0.864666759967804, "step": 2120 }, { "epoch": 0.17, "grad_norm": 5.824068546295166, "learning_rate": 9.390874125023265e-06, "logits/chosen": -1.319427490234375, "logits/rejected": -0.6739305257797241, "logps/chosen": -1.122070074081421, "logps/rejected": -1.3879320621490479, "loss": 1.178, "odds_ratio_loss": 0.5590441823005676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11220701038837433, "rewards/margins": 0.026586193591356277, "rewards/rejected": -0.1387932002544403, "sft_loss": 1.122070074081421, "step": 2125 }, { "epoch": 0.17, "grad_norm": 16.000850677490234, "learning_rate": 9.387925384814083e-06, "logits/chosen": -1.3848741054534912, "logits/rejected": -1.0673903226852417, "logps/chosen": -1.0577576160430908, "logps/rejected": -4.389899253845215, "loss": 1.1129, "odds_ratio_loss": 0.5515421032905579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10577578842639923, "rewards/margins": 0.33321413397789, "rewards/rejected": -0.43898987770080566, "sft_loss": 1.0577576160430908, "step": 2130 }, { "epoch": 0.17, "grad_norm": 39.926612854003906, "learning_rate": 9.384969989758942e-06, "logits/chosen": -1.1923563480377197, "logits/rejected": -0.7410465478897095, "logps/chosen": -1.1001938581466675, "logps/rejected": -1.2848488092422485, "loss": 1.1697, "odds_ratio_loss": 0.695135772228241, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11001940071582794, "rewards/margins": 0.01846548728644848, "rewards/rejected": -0.12848487496376038, "sft_loss": 1.1001938581466675, "step": 2135 }, { "epoch": 0.17, "grad_norm": 11.095691680908203, "learning_rate": 9.382007944340075e-06, "logits/chosen": -1.323016881942749, "logits/rejected": -0.8939719200134277, "logps/chosen": -1.0650465488433838, "logps/rejected": -0.9738779067993164, "loss": 1.148, "odds_ratio_loss": 0.8292935490608215, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1065046414732933, "rewards/margins": -0.009116856381297112, "rewards/rejected": -0.09738779067993164, "sft_loss": 1.0650465488433838, "step": 2140 }, { "epoch": 0.17, "grad_norm": 4.360654830932617, "learning_rate": 9.379039253049798e-06, "logits/chosen": -1.243947148323059, "logits/rejected": -0.8853607177734375, "logps/chosen": -1.1823415756225586, "logps/rejected": -1.3976190090179443, "loss": 1.2821, "odds_ratio_loss": 0.9972108006477356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11823415756225586, "rewards/margins": 0.021527742967009544, "rewards/rejected": -0.13976189494132996, "sft_loss": 1.1823415756225586, "step": 2145 }, { "epoch": 0.17, "grad_norm": 4.95953369140625, "learning_rate": 9.376063920390509e-06, "logits/chosen": -1.208055853843689, "logits/rejected": -0.985071063041687, "logps/chosen": -0.8940626978874207, "logps/rejected": -1.6095507144927979, "loss": 0.9281, "odds_ratio_loss": 0.34006446599960327, "rewards/accuracies": 1.0, "rewards/chosen": -0.08940626680850983, "rewards/margins": 0.07154880464076996, "rewards/rejected": -0.16095507144927979, "sft_loss": 0.8940626978874207, "step": 2150 }, { "epoch": 0.17, "grad_norm": 52.542457580566406, "learning_rate": 9.373081950874678e-06, "logits/chosen": -1.2736427783966064, "logits/rejected": -0.7791846990585327, "logps/chosen": -1.151737093925476, "logps/rejected": -7.746405601501465, "loss": 1.243, "odds_ratio_loss": 0.9124053120613098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11517371982336044, "rewards/margins": 0.6594668626785278, "rewards/rejected": -0.7746405601501465, "sft_loss": 1.151737093925476, "step": 2155 }, { "epoch": 0.17, "grad_norm": 9.984672546386719, "learning_rate": 9.370093349024842e-06, "logits/chosen": -1.3483428955078125, "logits/rejected": -1.1995983123779297, "logps/chosen": -1.030102014541626, "logps/rejected": -5.066445827484131, "loss": 1.0975, "odds_ratio_loss": 0.6737374067306519, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10301021486520767, "rewards/margins": 0.40363436937332153, "rewards/rejected": -0.506644606590271, "sft_loss": 1.030102014541626, "step": 2160 }, { "epoch": 0.17, "grad_norm": 8.059449195861816, "learning_rate": 9.367098119373592e-06, "logits/chosen": -1.26102614402771, "logits/rejected": -0.9494965672492981, "logps/chosen": -0.9010990262031555, "logps/rejected": -1.2991758584976196, "loss": 0.9497, "odds_ratio_loss": 0.48647040128707886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09010989964008331, "rewards/margins": 0.03980768844485283, "rewards/rejected": -0.12991759181022644, "sft_loss": 0.9010990262031555, "step": 2165 }, { "epoch": 0.17, "grad_norm": 7.4642815589904785, "learning_rate": 9.364096266463577e-06, "logits/chosen": -1.4591320753097534, "logits/rejected": -1.0818897485733032, "logps/chosen": -0.9885584115982056, "logps/rejected": -1.1860976219177246, "loss": 1.051, "odds_ratio_loss": 0.6242485642433167, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09885583817958832, "rewards/margins": 0.019753912463784218, "rewards/rejected": -0.11860976368188858, "sft_loss": 0.9885584115982056, "step": 2170 }, { "epoch": 0.17, "grad_norm": 8.31343936920166, "learning_rate": 9.361087794847485e-06, "logits/chosen": -1.3828623294830322, "logits/rejected": -1.0185619592666626, "logps/chosen": -0.8504158854484558, "logps/rejected": -1.4988811016082764, "loss": 0.9257, "odds_ratio_loss": 0.7529224157333374, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0850415974855423, "rewards/margins": 0.06484649330377579, "rewards/rejected": -0.14988809823989868, "sft_loss": 0.8504158854484558, "step": 2175 }, { "epoch": 0.17, "grad_norm": 7.262338638305664, "learning_rate": 9.358072709088046e-06, "logits/chosen": -1.2764170169830322, "logits/rejected": -1.0059707164764404, "logps/chosen": -0.6619844436645508, "logps/rejected": -7.922823905944824, "loss": 0.6878, "odds_ratio_loss": 0.2583473324775696, "rewards/accuracies": 1.0, "rewards/chosen": -0.0661984458565712, "rewards/margins": 0.7260838747024536, "rewards/rejected": -0.7922824025154114, "sft_loss": 0.6619844436645508, "step": 2180 }, { "epoch": 0.17, "grad_norm": 26.565011978149414, "learning_rate": 9.355051013758023e-06, "logits/chosen": -1.3052898645401, "logits/rejected": -1.0967880487442017, "logps/chosen": -0.8731967210769653, "logps/rejected": -1.453786849975586, "loss": 0.9169, "odds_ratio_loss": 0.43690699338912964, "rewards/accuracies": 1.0, "rewards/chosen": -0.08731966465711594, "rewards/margins": 0.05805901437997818, "rewards/rejected": -0.14537867903709412, "sft_loss": 0.8731967210769653, "step": 2185 }, { "epoch": 0.17, "grad_norm": 33.61377716064453, "learning_rate": 9.352022713440198e-06, "logits/chosen": -1.225475549697876, "logits/rejected": -1.052595853805542, "logps/chosen": -0.8050892949104309, "logps/rejected": -0.9161909222602844, "loss": 0.8753, "odds_ratio_loss": 0.7018290758132935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08050892502069473, "rewards/margins": 0.011110165156424046, "rewards/rejected": -0.0916190966963768, "sft_loss": 0.8050892949104309, "step": 2190 }, { "epoch": 0.17, "grad_norm": 8.001112937927246, "learning_rate": 9.348987812727375e-06, "logits/chosen": -1.2792341709136963, "logits/rejected": -0.8740849494934082, "logps/chosen": -1.581862211227417, "logps/rejected": -1.0725221633911133, "loss": 1.7016, "odds_ratio_loss": 1.1973588466644287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15818621218204498, "rewards/margins": -0.050933998078107834, "rewards/rejected": -0.10725221782922745, "sft_loss": 1.581862211227417, "step": 2195 }, { "epoch": 0.17, "grad_norm": 11.436029434204102, "learning_rate": 9.345946316222365e-06, "logits/chosen": -1.3950872421264648, "logits/rejected": -1.0151147842407227, "logps/chosen": -0.9989234209060669, "logps/rejected": -0.9934768676757812, "loss": 1.0715, "odds_ratio_loss": 0.7261602282524109, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09989233314990997, "rewards/margins": -0.0005446464056149125, "rewards/rejected": -0.09934769570827484, "sft_loss": 0.9989234209060669, "step": 2200 }, { "epoch": 0.17, "grad_norm": 30.081104278564453, "learning_rate": 9.342898228537983e-06, "logits/chosen": -1.218121886253357, "logits/rejected": -1.3421285152435303, "logps/chosen": -1.7342615127563477, "logps/rejected": -1.285917043685913, "loss": 1.8419, "odds_ratio_loss": 1.0760008096694946, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.17342616617679596, "rewards/margins": -0.04483447223901749, "rewards/rejected": -0.12859170138835907, "sft_loss": 1.7342615127563477, "step": 2205 }, { "epoch": 0.17, "grad_norm": 9.11589241027832, "learning_rate": 9.339843554297042e-06, "logits/chosen": -1.4177284240722656, "logits/rejected": -0.9231816530227661, "logps/chosen": -0.9941714406013489, "logps/rejected": -1.2196388244628906, "loss": 1.0538, "odds_ratio_loss": 0.5962220430374146, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09941715002059937, "rewards/margins": 0.02254674583673477, "rewards/rejected": -0.12196389585733414, "sft_loss": 0.9941714406013489, "step": 2210 }, { "epoch": 0.17, "grad_norm": 10.992576599121094, "learning_rate": 9.33678229813234e-06, "logits/chosen": -1.3786756992340088, "logits/rejected": -1.3493220806121826, "logps/chosen": -0.9669061899185181, "logps/rejected": -1.7660000324249268, "loss": 1.0027, "odds_ratio_loss": 0.3575536608695984, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09669061005115509, "rewards/margins": 0.07990939170122147, "rewards/rejected": -0.17660000920295715, "sft_loss": 0.9669061899185181, "step": 2215 }, { "epoch": 0.17, "grad_norm": 8.534214973449707, "learning_rate": 9.333714464686668e-06, "logits/chosen": -1.202580213546753, "logits/rejected": -1.1010208129882812, "logps/chosen": -1.1651172637939453, "logps/rejected": -1.1718440055847168, "loss": 1.2364, "odds_ratio_loss": 0.7130334973335266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11651172488927841, "rewards/margins": 0.0006726846331730485, "rewards/rejected": -0.11718441545963287, "sft_loss": 1.1651172637939453, "step": 2220 }, { "epoch": 0.17, "grad_norm": 15.713763236999512, "learning_rate": 9.330640058612777e-06, "logits/chosen": -1.5429356098175049, "logits/rejected": -0.8350217938423157, "logps/chosen": -1.4011694192886353, "logps/rejected": -1.835808515548706, "loss": 1.469, "odds_ratio_loss": 0.6785898804664612, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.14011695981025696, "rewards/margins": 0.043463896960020065, "rewards/rejected": -0.18358086049556732, "sft_loss": 1.4011694192886353, "step": 2225 }, { "epoch": 0.17, "grad_norm": 7.8334126472473145, "learning_rate": 9.327559084573399e-06, "logits/chosen": -1.346500277519226, "logits/rejected": -1.1512044668197632, "logps/chosen": -1.2371644973754883, "logps/rejected": -1.9674861431121826, "loss": 1.2897, "odds_ratio_loss": 0.5257105827331543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12371645122766495, "rewards/margins": 0.07303217798471451, "rewards/rejected": -0.19674862921237946, "sft_loss": 1.2371644973754883, "step": 2230 }, { "epoch": 0.17, "grad_norm": 10.72829818725586, "learning_rate": 9.32447154724122e-06, "logits/chosen": -1.3767178058624268, "logits/rejected": -1.1266549825668335, "logps/chosen": -1.1838889122009277, "logps/rejected": -3.7046380043029785, "loss": 1.2507, "odds_ratio_loss": 0.6676278114318848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11838889122009277, "rewards/margins": 0.2520748972892761, "rewards/rejected": -0.3704637885093689, "sft_loss": 1.1838889122009277, "step": 2235 }, { "epoch": 0.17, "grad_norm": 5.583121299743652, "learning_rate": 9.321377451298886e-06, "logits/chosen": -1.2876561880111694, "logits/rejected": -1.1324607133865356, "logps/chosen": -0.8333019018173218, "logps/rejected": -1.9429174661636353, "loss": 0.8603, "odds_ratio_loss": 0.27039480209350586, "rewards/accuracies": 1.0, "rewards/chosen": -0.0833301991224289, "rewards/margins": 0.11096155643463135, "rewards/rejected": -0.19429175555706024, "sft_loss": 0.8333019018173218, "step": 2240 }, { "epoch": 0.17, "grad_norm": 6.8710856437683105, "learning_rate": 9.318276801438981e-06, "logits/chosen": -1.5717952251434326, "logits/rejected": -0.9774179458618164, "logps/chosen": -0.9509202837944031, "logps/rejected": -1.7160125970840454, "loss": 1.0017, "odds_ratio_loss": 0.5077941417694092, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0950920358300209, "rewards/margins": 0.07650923728942871, "rewards/rejected": -0.17160126566886902, "sft_loss": 0.9509202837944031, "step": 2245 }, { "epoch": 0.18, "grad_norm": 26.359830856323242, "learning_rate": 9.315169602364038e-06, "logits/chosen": -1.438494324684143, "logits/rejected": -1.2772901058197021, "logps/chosen": -0.963545024394989, "logps/rejected": -1.3646507263183594, "loss": 1.018, "odds_ratio_loss": 0.5441502928733826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09635450690984726, "rewards/margins": 0.04011055827140808, "rewards/rejected": -0.13646505773067474, "sft_loss": 0.963545024394989, "step": 2250 }, { "epoch": 0.18, "grad_norm": 11.73903751373291, "learning_rate": 9.312055858786517e-06, "logits/chosen": -1.4635202884674072, "logits/rejected": -1.1016645431518555, "logps/chosen": -0.8518376350402832, "logps/rejected": -1.5293066501617432, "loss": 0.8947, "odds_ratio_loss": 0.42870035767555237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0851837620139122, "rewards/margins": 0.06774689257144928, "rewards/rejected": -0.15293066203594208, "sft_loss": 0.8518376350402832, "step": 2255 }, { "epoch": 0.18, "grad_norm": 18.999237060546875, "learning_rate": 9.308935575428808e-06, "logits/chosen": -1.4547194242477417, "logits/rejected": -1.178485631942749, "logps/chosen": -1.1227895021438599, "logps/rejected": -6.287472724914551, "loss": 1.1341, "odds_ratio_loss": 0.11270429193973541, "rewards/accuracies": 1.0, "rewards/chosen": -0.11227895319461823, "rewards/margins": 0.5164682269096375, "rewards/rejected": -0.6287471652030945, "sft_loss": 1.1227895021438599, "step": 2260 }, { "epoch": 0.18, "grad_norm": 8.228962898254395, "learning_rate": 9.305808757023213e-06, "logits/chosen": -1.4437748193740845, "logits/rejected": -0.9789659380912781, "logps/chosen": -1.308319330215454, "logps/rejected": -4.175393104553223, "loss": 1.3224, "odds_ratio_loss": 0.14052268862724304, "rewards/accuracies": 1.0, "rewards/chosen": -0.13083192706108093, "rewards/margins": 0.28670734167099, "rewards/rejected": -0.4175392985343933, "sft_loss": 1.308319330215454, "step": 2265 }, { "epoch": 0.18, "grad_norm": 10.382689476013184, "learning_rate": 9.302675408311953e-06, "logits/chosen": -1.3628900051116943, "logits/rejected": -1.0054218769073486, "logps/chosen": -0.9282622337341309, "logps/rejected": -1.898972511291504, "loss": 0.9669, "odds_ratio_loss": 0.38678181171417236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0928262248635292, "rewards/margins": 0.09707103669643402, "rewards/rejected": -0.18989725410938263, "sft_loss": 0.9282622337341309, "step": 2270 }, { "epoch": 0.18, "grad_norm": 5.989121437072754, "learning_rate": 9.299535534047145e-06, "logits/chosen": -1.4441674947738647, "logits/rejected": -1.094054937362671, "logps/chosen": -0.8899669647216797, "logps/rejected": -1.7062129974365234, "loss": 0.9281, "odds_ratio_loss": 0.3816324770450592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08899669349193573, "rewards/margins": 0.0816246047616005, "rewards/rejected": -0.17062130570411682, "sft_loss": 0.8899669647216797, "step": 2275 }, { "epoch": 0.18, "grad_norm": 12.697397232055664, "learning_rate": 9.296389138990812e-06, "logits/chosen": -1.1660821437835693, "logits/rejected": -1.038727045059204, "logps/chosen": -1.1280267238616943, "logps/rejected": -1.5812492370605469, "loss": 1.1791, "odds_ratio_loss": 0.5110870003700256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11280268430709839, "rewards/margins": 0.045322246849536896, "rewards/rejected": -0.1581249237060547, "sft_loss": 1.1280267238616943, "step": 2280 }, { "epoch": 0.18, "grad_norm": 17.71702766418457, "learning_rate": 9.293236227914856e-06, "logits/chosen": -1.3867652416229248, "logits/rejected": -0.7695621252059937, "logps/chosen": -1.1080429553985596, "logps/rejected": -1.8552356958389282, "loss": 1.1608, "odds_ratio_loss": 0.5275697708129883, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11080429702997208, "rewards/margins": 0.07471926510334015, "rewards/rejected": -0.18552353978157043, "sft_loss": 1.1080429553985596, "step": 2285 }, { "epoch": 0.18, "grad_norm": 6.90964937210083, "learning_rate": 9.290076805601071e-06, "logits/chosen": -1.4995615482330322, "logits/rejected": -0.8228855133056641, "logps/chosen": -1.0126750469207764, "logps/rejected": -1.122462511062622, "loss": 1.0914, "odds_ratio_loss": 0.7877473831176758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10126751661300659, "rewards/margins": 0.010978740639984608, "rewards/rejected": -0.11224625259637833, "sft_loss": 1.0126750469207764, "step": 2290 }, { "epoch": 0.18, "grad_norm": 5.955761432647705, "learning_rate": 9.286910876841122e-06, "logits/chosen": -1.4855306148529053, "logits/rejected": -1.0339566469192505, "logps/chosen": -1.0420253276824951, "logps/rejected": -1.2085959911346436, "loss": 1.105, "odds_ratio_loss": 0.629336953163147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10420253127813339, "rewards/margins": 0.016657069325447083, "rewards/rejected": -0.12085960805416107, "sft_loss": 1.0420253276824951, "step": 2295 }, { "epoch": 0.18, "grad_norm": 65.14389038085938, "learning_rate": 9.28373844643654e-06, "logits/chosen": -1.0751597881317139, "logits/rejected": -1.0112690925598145, "logps/chosen": -0.9169561266899109, "logps/rejected": -1.040330171585083, "loss": 0.9989, "odds_ratio_loss": 0.8191441297531128, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.09169561415910721, "rewards/margins": 0.012337413616478443, "rewards/rejected": -0.10403303056955338, "sft_loss": 0.9169561266899109, "step": 2300 }, { "epoch": 0.18, "grad_norm": 17.593547821044922, "learning_rate": 9.28055951919872e-06, "logits/chosen": -1.4363292455673218, "logits/rejected": -1.0941526889801025, "logps/chosen": -0.7269886136054993, "logps/rejected": -1.8268687725067139, "loss": 0.7623, "odds_ratio_loss": 0.3534363806247711, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07269886881113052, "rewards/margins": 0.1099880188703537, "rewards/rejected": -0.18268688023090363, "sft_loss": 0.7269886136054993, "step": 2305 }, { "epoch": 0.18, "grad_norm": 7.644922256469727, "learning_rate": 9.277374099948908e-06, "logits/chosen": -1.32466721534729, "logits/rejected": -0.9058082699775696, "logps/chosen": -1.083791732788086, "logps/rejected": -2.586423873901367, "loss": 1.1446, "odds_ratio_loss": 0.6080135107040405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10837917029857635, "rewards/margins": 0.1502632200717926, "rewards/rejected": -0.25864237546920776, "sft_loss": 1.083791732788086, "step": 2310 }, { "epoch": 0.18, "grad_norm": 6.352106094360352, "learning_rate": 9.274182193518195e-06, "logits/chosen": -1.3612511157989502, "logits/rejected": -1.0367705821990967, "logps/chosen": -0.9755949974060059, "logps/rejected": -1.4311678409576416, "loss": 1.0182, "odds_ratio_loss": 0.42629605531692505, "rewards/accuracies": 1.0, "rewards/chosen": -0.09755949676036835, "rewards/margins": 0.045557279139757156, "rewards/rejected": -0.1431167870759964, "sft_loss": 0.9755949974060059, "step": 2315 }, { "epoch": 0.18, "grad_norm": 8.281800270080566, "learning_rate": 9.270983804747516e-06, "logits/chosen": -1.256667137145996, "logits/rejected": -0.8662792444229126, "logps/chosen": -1.1143862009048462, "logps/rejected": -0.9090407490730286, "loss": 1.2097, "odds_ratio_loss": 0.9536363482475281, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.11143863201141357, "rewards/margins": -0.02053455263376236, "rewards/rejected": -0.09090407192707062, "sft_loss": 1.1143862009048462, "step": 2320 }, { "epoch": 0.18, "grad_norm": 6.306697845458984, "learning_rate": 9.267778938487633e-06, "logits/chosen": -1.4480441808700562, "logits/rejected": -0.8130139112472534, "logps/chosen": -1.0359063148498535, "logps/rejected": -1.931692123413086, "loss": 1.0838, "odds_ratio_loss": 0.47853994369506836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10359062999486923, "rewards/margins": 0.08957858383655548, "rewards/rejected": -0.19316920638084412, "sft_loss": 1.0359063148498535, "step": 2325 }, { "epoch": 0.18, "grad_norm": 53.15751266479492, "learning_rate": 9.264567599599129e-06, "logits/chosen": -1.4305864572525024, "logits/rejected": -1.1423676013946533, "logps/chosen": -1.1684757471084595, "logps/rejected": -1.9025484323501587, "loss": 1.2308, "odds_ratio_loss": 0.6231135129928589, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.11684757471084595, "rewards/margins": 0.07340727746486664, "rewards/rejected": -0.19025485217571259, "sft_loss": 1.1684757471084595, "step": 2330 }, { "epoch": 0.18, "grad_norm": 14.97216510772705, "learning_rate": 9.26134979295241e-06, "logits/chosen": -1.1160290241241455, "logits/rejected": -0.7148804664611816, "logps/chosen": -0.8799691200256348, "logps/rejected": -1.4380155801773071, "loss": 0.9225, "odds_ratio_loss": 0.4255514144897461, "rewards/accuracies": 1.0, "rewards/chosen": -0.08799691498279572, "rewards/margins": 0.05580463260412216, "rewards/rejected": -0.14380155503749847, "sft_loss": 0.8799691200256348, "step": 2335 }, { "epoch": 0.18, "grad_norm": 5.440201282501221, "learning_rate": 9.25812552342769e-06, "logits/chosen": -1.1734294891357422, "logits/rejected": -0.7869594693183899, "logps/chosen": -1.125270128250122, "logps/rejected": -0.9226048588752747, "loss": 1.217, "odds_ratio_loss": 0.9175472259521484, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1125270277261734, "rewards/margins": -0.020266540348529816, "rewards/rejected": -0.09226048737764359, "sft_loss": 1.125270128250122, "step": 2340 }, { "epoch": 0.18, "grad_norm": 6.525335788726807, "learning_rate": 9.254894795914979e-06, "logits/chosen": -1.0414087772369385, "logits/rejected": -1.1698774099349976, "logps/chosen": -1.0201406478881836, "logps/rejected": -1.202549695968628, "loss": 1.0814, "odds_ratio_loss": 0.6127591133117676, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10201406478881836, "rewards/margins": 0.018240921199321747, "rewards/rejected": -0.12025497853755951, "sft_loss": 1.0201406478881836, "step": 2345 }, { "epoch": 0.18, "grad_norm": 10.264086723327637, "learning_rate": 9.251657615314088e-06, "logits/chosen": -1.3329168558120728, "logits/rejected": -1.095649003982544, "logps/chosen": -0.8811469078063965, "logps/rejected": -1.50531804561615, "loss": 0.9181, "odds_ratio_loss": 0.369753360748291, "rewards/accuracies": 1.0, "rewards/chosen": -0.08811469376087189, "rewards/margins": 0.06241711229085922, "rewards/rejected": -0.1505317986011505, "sft_loss": 0.8811469078063965, "step": 2350 }, { "epoch": 0.18, "grad_norm": 7.230742931365967, "learning_rate": 9.248413986534612e-06, "logits/chosen": -1.4222882986068726, "logits/rejected": -1.0978469848632812, "logps/chosen": -0.7870491743087769, "logps/rejected": -2.3897218704223633, "loss": 0.8448, "odds_ratio_loss": 0.577102780342102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07870491594076157, "rewards/margins": 0.16026729345321655, "rewards/rejected": -0.23897221684455872, "sft_loss": 0.7870491743087769, "step": 2355 }, { "epoch": 0.18, "grad_norm": 7.951897621154785, "learning_rate": 9.245163914495926e-06, "logits/chosen": -1.176154613494873, "logits/rejected": -0.6637752056121826, "logps/chosen": -1.075390100479126, "logps/rejected": -2.4893295764923096, "loss": 1.1414, "odds_ratio_loss": 0.6604292988777161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10753902047872543, "rewards/margins": 0.14139392971992493, "rewards/rejected": -0.24893295764923096, "sft_loss": 1.075390100479126, "step": 2360 }, { "epoch": 0.18, "grad_norm": 9.111823081970215, "learning_rate": 9.241907404127176e-06, "logits/chosen": -1.1562446355819702, "logits/rejected": -0.9415414929389954, "logps/chosen": -0.654162585735321, "logps/rejected": -2.4974708557128906, "loss": 0.6713, "odds_ratio_loss": 0.17174024879932404, "rewards/accuracies": 1.0, "rewards/chosen": -0.06541626155376434, "rewards/margins": 0.18433082103729248, "rewards/rejected": -0.2497471123933792, "sft_loss": 0.654162585735321, "step": 2365 }, { "epoch": 0.18, "grad_norm": 7.270651340484619, "learning_rate": 9.238644460367274e-06, "logits/chosen": -1.2659519910812378, "logits/rejected": -0.7697997093200684, "logps/chosen": -0.9518246650695801, "logps/rejected": -2.2645275592803955, "loss": 0.9776, "odds_ratio_loss": 0.2575392723083496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09518247097730637, "rewards/margins": 0.13127027451992035, "rewards/rejected": -0.2264527529478073, "sft_loss": 0.9518246650695801, "step": 2370 }, { "epoch": 0.18, "grad_norm": 12.318182945251465, "learning_rate": 9.235375088164891e-06, "logits/chosen": -1.2376809120178223, "logits/rejected": -0.8642924427986145, "logps/chosen": -1.155277967453003, "logps/rejected": -1.1210805177688599, "loss": 1.2366, "odds_ratio_loss": 0.8130849599838257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11552779376506805, "rewards/margins": -0.0034197538625448942, "rewards/rejected": -0.11210803687572479, "sft_loss": 1.155277967453003, "step": 2375 }, { "epoch": 0.19, "grad_norm": 15.424654960632324, "learning_rate": 9.23209929247844e-06, "logits/chosen": -1.2814226150512695, "logits/rejected": -0.8389566540718079, "logps/chosen": -0.8976057767868042, "logps/rejected": -1.801372766494751, "loss": 0.9418, "odds_ratio_loss": 0.4419097304344177, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08976057916879654, "rewards/margins": 0.09037669003009796, "rewards/rejected": -0.1801372617483139, "sft_loss": 0.8976057767868042, "step": 2380 }, { "epoch": 0.19, "grad_norm": 6.769630432128906, "learning_rate": 9.228817078276084e-06, "logits/chosen": -1.233152151107788, "logits/rejected": -0.6566184759140015, "logps/chosen": -1.1040191650390625, "logps/rejected": -1.585508108139038, "loss": 1.1519, "odds_ratio_loss": 0.47855883836746216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11040191352367401, "rewards/margins": 0.04814889281988144, "rewards/rejected": -0.15855081379413605, "sft_loss": 1.1040191650390625, "step": 2385 }, { "epoch": 0.19, "grad_norm": 9.030645370483398, "learning_rate": 9.225528450535718e-06, "logits/chosen": -1.408438801765442, "logits/rejected": -0.8707016706466675, "logps/chosen": -1.164117693901062, "logps/rejected": -2.1822171211242676, "loss": 1.2221, "odds_ratio_loss": 0.5799840092658997, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11641178280115128, "rewards/margins": 0.10180990397930145, "rewards/rejected": -0.21822169423103333, "sft_loss": 1.164117693901062, "step": 2390 }, { "epoch": 0.19, "grad_norm": 5.576879501342773, "learning_rate": 9.222233414244963e-06, "logits/chosen": -1.548218011856079, "logits/rejected": -1.1314003467559814, "logps/chosen": -0.889999270439148, "logps/rejected": -1.3307504653930664, "loss": 0.9389, "odds_ratio_loss": 0.48932284116744995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08899993449449539, "rewards/margins": 0.044075123965740204, "rewards/rejected": -0.1330750584602356, "sft_loss": 0.889999270439148, "step": 2395 }, { "epoch": 0.19, "grad_norm": 30.624387741088867, "learning_rate": 9.218931974401158e-06, "logits/chosen": -1.4214167594909668, "logits/rejected": -0.949417769908905, "logps/chosen": -1.1055647134780884, "logps/rejected": -1.0172256231307983, "loss": 1.1906, "odds_ratio_loss": 0.850671112537384, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11055648326873779, "rewards/margins": -0.008833914995193481, "rewards/rejected": -0.10172256082296371, "sft_loss": 1.1055647134780884, "step": 2400 }, { "epoch": 0.19, "grad_norm": 4.737048149108887, "learning_rate": 9.21562413601136e-06, "logits/chosen": -1.3176019191741943, "logits/rejected": -0.5910056829452515, "logps/chosen": -0.7685356140136719, "logps/rejected": -1.1007511615753174, "loss": 0.819, "odds_ratio_loss": 0.5050622224807739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07685355842113495, "rewards/margins": 0.03322155401110649, "rewards/rejected": -0.11007511615753174, "sft_loss": 0.7685356140136719, "step": 2405 }, { "epoch": 0.19, "grad_norm": 11.416131973266602, "learning_rate": 9.21230990409232e-06, "logits/chosen": -1.397815227508545, "logits/rejected": -1.1700330972671509, "logps/chosen": -0.7989002466201782, "logps/rejected": -2.4671411514282227, "loss": 0.8525, "odds_ratio_loss": 0.5361326336860657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07989002764225006, "rewards/margins": 0.1668240875005722, "rewards/rejected": -0.24671411514282227, "sft_loss": 0.7989002466201782, "step": 2410 }, { "epoch": 0.19, "grad_norm": 70.81454467773438, "learning_rate": 9.208989283670498e-06, "logits/chosen": -1.350322961807251, "logits/rejected": -1.126625895500183, "logps/chosen": -1.1136561632156372, "logps/rejected": -1.5111583471298218, "loss": 1.161, "odds_ratio_loss": 0.4734528660774231, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11136561632156372, "rewards/margins": 0.039750225841999054, "rewards/rejected": -0.15111584961414337, "sft_loss": 1.1136561632156372, "step": 2415 }, { "epoch": 0.19, "grad_norm": 11.097872734069824, "learning_rate": 9.20566227978203e-06, "logits/chosen": -1.2976523637771606, "logits/rejected": -0.7848803997039795, "logps/chosen": -1.0063698291778564, "logps/rejected": -0.9979653358459473, "loss": 1.0846, "odds_ratio_loss": 0.7826443910598755, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10063699632883072, "rewards/margins": -0.0008404649561271071, "rewards/rejected": -0.09979653358459473, "sft_loss": 1.0063698291778564, "step": 2420 }, { "epoch": 0.19, "grad_norm": 10.51763916015625, "learning_rate": 9.202328897472746e-06, "logits/chosen": -1.3862828016281128, "logits/rejected": -1.381044626235962, "logps/chosen": -0.9244590997695923, "logps/rejected": -1.8402154445648193, "loss": 0.9667, "odds_ratio_loss": 0.42251425981521606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09244590997695923, "rewards/margins": 0.09157565236091614, "rewards/rejected": -0.18402156233787537, "sft_loss": 0.9244590997695923, "step": 2425 }, { "epoch": 0.19, "grad_norm": 48.170528411865234, "learning_rate": 9.198989141798138e-06, "logits/chosen": -1.2233293056488037, "logits/rejected": -0.9823969006538391, "logps/chosen": -1.0954564809799194, "logps/rejected": -1.1875765323638916, "loss": 1.1602, "odds_ratio_loss": 0.6473854184150696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10954566299915314, "rewards/margins": 0.009211997501552105, "rewards/rejected": -0.11875765025615692, "sft_loss": 1.0954564809799194, "step": 2430 }, { "epoch": 0.19, "grad_norm": 6.970828533172607, "learning_rate": 9.195643017823374e-06, "logits/chosen": -1.1448237895965576, "logits/rejected": -1.0005139112472534, "logps/chosen": -1.251326084136963, "logps/rejected": -4.398923873901367, "loss": 1.3175, "odds_ratio_loss": 0.6618432998657227, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12513260543346405, "rewards/margins": 0.3147597908973694, "rewards/rejected": -0.43989238142967224, "sft_loss": 1.251326084136963, "step": 2435 }, { "epoch": 0.19, "grad_norm": 11.294906616210938, "learning_rate": 9.192290530623274e-06, "logits/chosen": -1.1162798404693604, "logits/rejected": -0.7499058842658997, "logps/chosen": -1.2006309032440186, "logps/rejected": -1.4552767276763916, "loss": 1.2631, "odds_ratio_loss": 0.6247986555099487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12006310373544693, "rewards/margins": 0.025464575737714767, "rewards/rejected": -0.1455276906490326, "sft_loss": 1.2006309032440186, "step": 2440 }, { "epoch": 0.19, "grad_norm": 72.77327728271484, "learning_rate": 9.18893168528231e-06, "logits/chosen": -1.0885790586471558, "logits/rejected": -0.7597593665122986, "logps/chosen": -1.6269006729125977, "logps/rejected": -1.4666858911514282, "loss": 1.7196, "odds_ratio_loss": 0.9270656704902649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16269007325172424, "rewards/margins": -0.016021480783820152, "rewards/rejected": -0.14666858315467834, "sft_loss": 1.6269006729125977, "step": 2445 }, { "epoch": 0.19, "grad_norm": 10.69701862335205, "learning_rate": 9.185566486894597e-06, "logits/chosen": -1.1007755994796753, "logits/rejected": -1.287514090538025, "logps/chosen": -0.8829814791679382, "logps/rejected": -1.010026216506958, "loss": 0.9429, "odds_ratio_loss": 0.5991403460502625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08829815685749054, "rewards/margins": 0.012704456225037575, "rewards/rejected": -0.10100261121988297, "sft_loss": 0.8829814791679382, "step": 2450 }, { "epoch": 0.19, "grad_norm": 13.927633285522461, "learning_rate": 9.182194940563887e-06, "logits/chosen": -0.9834834933280945, "logits/rejected": -1.1958738565444946, "logps/chosen": -0.699195921421051, "logps/rejected": -1.6200673580169678, "loss": 0.741, "odds_ratio_loss": 0.4177609086036682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06991958618164062, "rewards/margins": 0.09208716452121735, "rewards/rejected": -0.16200676560401917, "sft_loss": 0.699195921421051, "step": 2455 }, { "epoch": 0.19, "grad_norm": 8.999503135681152, "learning_rate": 9.178817051403556e-06, "logits/chosen": -1.2043085098266602, "logits/rejected": -0.9950903058052063, "logps/chosen": -0.9628440141677856, "logps/rejected": -1.3881338834762573, "loss": 1.0108, "odds_ratio_loss": 0.47996312379837036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0962844043970108, "rewards/margins": 0.042528990656137466, "rewards/rejected": -0.13881340622901917, "sft_loss": 0.9628440141677856, "step": 2460 }, { "epoch": 0.19, "grad_norm": 26.092164993286133, "learning_rate": 9.175432824536604e-06, "logits/chosen": -1.362648606300354, "logits/rejected": -1.0125370025634766, "logps/chosen": -1.0991069078445435, "logps/rejected": -1.0862895250320435, "loss": 1.1755, "odds_ratio_loss": 0.7640587687492371, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10991068929433823, "rewards/margins": -0.0012817397946491838, "rewards/rejected": -0.10862895101308823, "sft_loss": 1.0991069078445435, "step": 2465 }, { "epoch": 0.19, "grad_norm": 32.62111282348633, "learning_rate": 9.17204226509564e-06, "logits/chosen": -1.08524751663208, "logits/rejected": -0.9146364331245422, "logps/chosen": -1.142344355583191, "logps/rejected": -0.99430912733078, "loss": 1.232, "odds_ratio_loss": 0.8967956304550171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11423443257808685, "rewards/margins": -0.01480352133512497, "rewards/rejected": -0.09943092614412308, "sft_loss": 1.142344355583191, "step": 2470 }, { "epoch": 0.19, "grad_norm": 6.865599155426025, "learning_rate": 9.16864537822288e-06, "logits/chosen": -1.104943871498108, "logits/rejected": -1.0251874923706055, "logps/chosen": -0.8822401762008667, "logps/rejected": -0.8117051124572754, "loss": 0.9565, "odds_ratio_loss": 0.7424853444099426, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08822402358055115, "rewards/margins": -0.007053514011204243, "rewards/rejected": -0.08117051422595978, "sft_loss": 0.8822401762008667, "step": 2475 }, { "epoch": 0.19, "grad_norm": 11.853043556213379, "learning_rate": 9.165242169070129e-06, "logits/chosen": -1.2353646755218506, "logits/rejected": -0.8373567461967468, "logps/chosen": -0.9056793451309204, "logps/rejected": -1.1309173107147217, "loss": 0.9665, "odds_ratio_loss": 0.6080256104469299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0905679315328598, "rewards/margins": 0.02252379059791565, "rewards/rejected": -0.11309172958135605, "sft_loss": 0.9056793451309204, "step": 2480 }, { "epoch": 0.19, "grad_norm": 7.211149215698242, "learning_rate": 9.16183264279879e-06, "logits/chosen": -1.17579984664917, "logits/rejected": -0.982610821723938, "logps/chosen": -0.7722324728965759, "logps/rejected": -2.1497650146484375, "loss": 0.8077, "odds_ratio_loss": 0.35467711091041565, "rewards/accuracies": 1.0, "rewards/chosen": -0.07722325623035431, "rewards/margins": 0.13775324821472168, "rewards/rejected": -0.214976504445076, "sft_loss": 0.7722324728965759, "step": 2485 }, { "epoch": 0.19, "grad_norm": 8.517918586730957, "learning_rate": 9.158416804579841e-06, "logits/chosen": -1.2026476860046387, "logits/rejected": -0.8483761548995972, "logps/chosen": -1.1936149597167969, "logps/rejected": -1.2055301666259766, "loss": 1.2698, "odds_ratio_loss": 0.7616353034973145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1193614974617958, "rewards/margins": 0.0011915117502212524, "rewards/rejected": -0.12055301666259766, "sft_loss": 1.1936149597167969, "step": 2490 }, { "epoch": 0.19, "grad_norm": 7.186039447784424, "learning_rate": 9.154994659593836e-06, "logits/chosen": -1.1555768251419067, "logits/rejected": -0.8785859942436218, "logps/chosen": -0.7830021381378174, "logps/rejected": -1.2585175037384033, "loss": 0.8254, "odds_ratio_loss": 0.4238009452819824, "rewards/accuracies": 1.0, "rewards/chosen": -0.07830022275447845, "rewards/margins": 0.04755154997110367, "rewards/rejected": -0.12585176527500153, "sft_loss": 0.7830021381378174, "step": 2495 }, { "epoch": 0.19, "grad_norm": 45.5631217956543, "learning_rate": 9.151566213030891e-06, "logits/chosen": -1.4677820205688477, "logits/rejected": -1.2527766227722168, "logps/chosen": -1.0443228483200073, "logps/rejected": -3.552145004272461, "loss": 1.0872, "odds_ratio_loss": 0.428349107503891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10443229973316193, "rewards/margins": 0.25078222155570984, "rewards/rejected": -0.35521453619003296, "sft_loss": 1.0443228483200073, "step": 2500 }, { "epoch": 0.19, "grad_norm": 22.740446090698242, "learning_rate": 9.14813147009068e-06, "logits/chosen": -1.301882266998291, "logits/rejected": -1.150469183921814, "logps/chosen": -0.7994235754013062, "logps/rejected": -1.0263789892196655, "loss": 0.855, "odds_ratio_loss": 0.5557239055633545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07994236052036285, "rewards/margins": 0.022695545107126236, "rewards/rejected": -0.10263790935277939, "sft_loss": 0.7994235754013062, "step": 2505 }, { "epoch": 0.2, "grad_norm": 2.394989252090454, "learning_rate": 9.144690435982427e-06, "logits/chosen": -1.2863742113113403, "logits/rejected": -0.9021209478378296, "logps/chosen": -1.0736862421035767, "logps/rejected": -2.3909831047058105, "loss": 1.1261, "odds_ratio_loss": 0.5244934558868408, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10736862570047379, "rewards/margins": 0.13172969222068787, "rewards/rejected": -0.23909831047058105, "sft_loss": 1.0736862421035767, "step": 2510 }, { "epoch": 0.2, "grad_norm": 37.13337326049805, "learning_rate": 9.141243115924898e-06, "logits/chosen": -1.3705681562423706, "logits/rejected": -0.9290679693222046, "logps/chosen": -0.7407183647155762, "logps/rejected": -1.197664499282837, "loss": 0.8096, "odds_ratio_loss": 0.6890398859977722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07407183945178986, "rewards/margins": 0.04569462686777115, "rewards/rejected": -0.11976645886898041, "sft_loss": 0.7407183647155762, "step": 2515 }, { "epoch": 0.2, "grad_norm": 10.020195960998535, "learning_rate": 9.13778951514639e-06, "logits/chosen": -1.2581623792648315, "logits/rejected": -0.8416546583175659, "logps/chosen": -1.1621432304382324, "logps/rejected": -2.305757522583008, "loss": 1.2145, "odds_ratio_loss": 0.5237524509429932, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.116214320063591, "rewards/margins": 0.11436142027378082, "rewards/rejected": -0.23057575523853302, "sft_loss": 1.1621432304382324, "step": 2520 }, { "epoch": 0.2, "grad_norm": 2.6576640605926514, "learning_rate": 9.134329638884729e-06, "logits/chosen": -1.2983006238937378, "logits/rejected": -0.7410690784454346, "logps/chosen": -0.8040630221366882, "logps/rejected": -2.982477903366089, "loss": 0.8338, "odds_ratio_loss": 0.2976140081882477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0804063007235527, "rewards/margins": 0.2178414762020111, "rewards/rejected": -0.2982478141784668, "sft_loss": 0.8040630221366882, "step": 2525 }, { "epoch": 0.2, "grad_norm": 18.113977432250977, "learning_rate": 9.130863492387254e-06, "logits/chosen": -1.171718716621399, "logits/rejected": -1.1481155157089233, "logps/chosen": -1.0631039142608643, "logps/rejected": -1.413201928138733, "loss": 1.1453, "odds_ratio_loss": 0.8220809698104858, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1063103899359703, "rewards/margins": 0.03500979021191597, "rewards/rejected": -0.14132019877433777, "sft_loss": 1.0631039142608643, "step": 2530 }, { "epoch": 0.2, "grad_norm": 6.383808612823486, "learning_rate": 9.12739108091082e-06, "logits/chosen": -1.3552782535552979, "logits/rejected": -0.7787104249000549, "logps/chosen": -1.0695326328277588, "logps/rejected": -1.128353238105774, "loss": 1.1368, "odds_ratio_loss": 0.6724019050598145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10695326328277588, "rewards/margins": 0.005882044322788715, "rewards/rejected": -0.11283531039953232, "sft_loss": 1.0695326328277588, "step": 2535 }, { "epoch": 0.2, "grad_norm": 94.40227508544922, "learning_rate": 9.123912409721777e-06, "logits/chosen": -1.1765425205230713, "logits/rejected": -0.8706881403923035, "logps/chosen": -0.9728943109512329, "logps/rejected": -0.8993405103683472, "loss": 1.0499, "odds_ratio_loss": 0.7700883150100708, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09728943556547165, "rewards/margins": -0.007355389650911093, "rewards/rejected": -0.08993404358625412, "sft_loss": 0.9728943109512329, "step": 2540 }, { "epoch": 0.2, "grad_norm": 13.701937675476074, "learning_rate": 9.120427484095972e-06, "logits/chosen": -1.2000676393508911, "logits/rejected": -0.7381768822669983, "logps/chosen": -0.7090678215026855, "logps/rejected": -1.2819874286651611, "loss": 0.743, "odds_ratio_loss": 0.3396037220954895, "rewards/accuracies": 1.0, "rewards/chosen": -0.07090678811073303, "rewards/margins": 0.05729196220636368, "rewards/rejected": -0.1281987428665161, "sft_loss": 0.7090678215026855, "step": 2545 }, { "epoch": 0.2, "grad_norm": 5.354995250701904, "learning_rate": 9.116936309318739e-06, "logits/chosen": -1.2523038387298584, "logits/rejected": -0.9723421335220337, "logps/chosen": -0.747650146484375, "logps/rejected": -1.4843276739120483, "loss": 0.7891, "odds_ratio_loss": 0.4141773581504822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07476501911878586, "rewards/margins": 0.07366775721311569, "rewards/rejected": -0.14843276143074036, "sft_loss": 0.747650146484375, "step": 2550 }, { "epoch": 0.2, "grad_norm": 6.774299621582031, "learning_rate": 9.113438890684886e-06, "logits/chosen": -1.3876330852508545, "logits/rejected": -0.5892521142959595, "logps/chosen": -0.8726062774658203, "logps/rejected": -1.361816167831421, "loss": 0.9561, "odds_ratio_loss": 0.8344847559928894, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08726062625646591, "rewards/margins": 0.048920996487140656, "rewards/rejected": -0.13618160784244537, "sft_loss": 0.8726062774658203, "step": 2555 }, { "epoch": 0.2, "grad_norm": 6.449535369873047, "learning_rate": 9.10993523349869e-06, "logits/chosen": -1.2600324153900146, "logits/rejected": -0.6542048454284668, "logps/chosen": -1.1139378547668457, "logps/rejected": -2.1798148155212402, "loss": 1.1481, "odds_ratio_loss": 0.3412316143512726, "rewards/accuracies": 1.0, "rewards/chosen": -0.1113937720656395, "rewards/margins": 0.10658769309520721, "rewards/rejected": -0.2179814875125885, "sft_loss": 1.1139378547668457, "step": 2560 }, { "epoch": 0.2, "grad_norm": 12.766084671020508, "learning_rate": 9.106425343073897e-06, "logits/chosen": -1.1841919422149658, "logits/rejected": -1.0210702419281006, "logps/chosen": -0.8251067996025085, "logps/rejected": -1.2712657451629639, "loss": 0.873, "odds_ratio_loss": 0.47851771116256714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08251067250967026, "rewards/margins": 0.04461590573191643, "rewards/rejected": -0.12712658941745758, "sft_loss": 0.8251067996025085, "step": 2565 }, { "epoch": 0.2, "grad_norm": 11.313969612121582, "learning_rate": 9.1029092247337e-06, "logits/chosen": -1.2296737432479858, "logits/rejected": -0.9485149383544922, "logps/chosen": -0.928150475025177, "logps/rejected": -2.3807272911071777, "loss": 0.9677, "odds_ratio_loss": 0.3950374722480774, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09281504154205322, "rewards/margins": 0.14525768160820007, "rewards/rejected": -0.2380727082490921, "sft_loss": 0.928150475025177, "step": 2570 }, { "epoch": 0.2, "grad_norm": 12.160032272338867, "learning_rate": 9.099386883810736e-06, "logits/chosen": -1.254393219947815, "logits/rejected": -0.8775162696838379, "logps/chosen": -1.248295545578003, "logps/rejected": -1.8063290119171143, "loss": 1.3072, "odds_ratio_loss": 0.5886574983596802, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.12482955306768417, "rewards/margins": 0.05580334737896919, "rewards/rejected": -0.18063290417194366, "sft_loss": 1.248295545578003, "step": 2575 }, { "epoch": 0.2, "grad_norm": 12.303729057312012, "learning_rate": 9.095858325647084e-06, "logits/chosen": -1.2073808908462524, "logits/rejected": -0.872015118598938, "logps/chosen": -1.0176329612731934, "logps/rejected": -1.3849756717681885, "loss": 1.0668, "odds_ratio_loss": 0.49167561531066895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1017632931470871, "rewards/margins": 0.03673427551984787, "rewards/rejected": -0.13849757611751556, "sft_loss": 1.0176329612731934, "step": 2580 }, { "epoch": 0.2, "grad_norm": 7.670443534851074, "learning_rate": 9.092323555594254e-06, "logits/chosen": -1.3451400995254517, "logits/rejected": -1.0177253484725952, "logps/chosen": -1.0468345880508423, "logps/rejected": -1.6150470972061157, "loss": 1.1329, "odds_ratio_loss": 0.8601625561714172, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10468345880508423, "rewards/margins": 0.056821249425411224, "rewards/rejected": -0.16150471568107605, "sft_loss": 1.0468345880508423, "step": 2585 }, { "epoch": 0.2, "grad_norm": 17.628047943115234, "learning_rate": 9.088782579013167e-06, "logits/chosen": -1.3400976657867432, "logits/rejected": -0.9388322830200195, "logps/chosen": -0.7216960787773132, "logps/rejected": -0.7665186524391174, "loss": 0.8021, "odds_ratio_loss": 0.8036432266235352, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07216961681842804, "rewards/margins": 0.004482255782932043, "rewards/rejected": -0.07665187120437622, "sft_loss": 0.7216960787773132, "step": 2590 }, { "epoch": 0.2, "grad_norm": 15.716256141662598, "learning_rate": 9.08523540127417e-06, "logits/chosen": -1.2134828567504883, "logits/rejected": -1.1170094013214111, "logps/chosen": -0.9080262184143066, "logps/rejected": -1.2125194072723389, "loss": 0.9598, "odds_ratio_loss": 0.5175421237945557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0908026248216629, "rewards/margins": 0.030449315905570984, "rewards/rejected": -0.12125194072723389, "sft_loss": 0.9080262184143066, "step": 2595 }, { "epoch": 0.2, "grad_norm": 16.95549201965332, "learning_rate": 9.081682027757001e-06, "logits/chosen": -1.1575896739959717, "logits/rejected": -0.7381139993667603, "logps/chosen": -0.824720025062561, "logps/rejected": -4.392837047576904, "loss": 0.8607, "odds_ratio_loss": 0.3601614534854889, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08247199654579163, "rewards/margins": 0.3568117022514343, "rewards/rejected": -0.4392837584018707, "sft_loss": 0.824720025062561, "step": 2600 }, { "epoch": 0.2, "grad_norm": 16.999509811401367, "learning_rate": 9.07812246385081e-06, "logits/chosen": -1.338104248046875, "logits/rejected": -0.9916426539421082, "logps/chosen": -1.3346434831619263, "logps/rejected": -4.51638650894165, "loss": 1.4249, "odds_ratio_loss": 0.9022731781005859, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.13346433639526367, "rewards/margins": 0.31817427277565, "rewards/rejected": -0.4516386091709137, "sft_loss": 1.3346434831619263, "step": 2605 }, { "epoch": 0.2, "grad_norm": 95.34765625, "learning_rate": 9.074556714954121e-06, "logits/chosen": -1.1742122173309326, "logits/rejected": -0.8790088891983032, "logps/chosen": -0.996563732624054, "logps/rejected": -1.647459626197815, "loss": 1.0471, "odds_ratio_loss": 0.5054280757904053, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0996563732624054, "rewards/margins": 0.06508957594633102, "rewards/rejected": -0.16474595665931702, "sft_loss": 0.996563732624054, "step": 2610 }, { "epoch": 0.2, "grad_norm": 9.325736045837402, "learning_rate": 9.07098478647485e-06, "logits/chosen": -1.3385982513427734, "logits/rejected": -0.8874040842056274, "logps/chosen": -1.1331145763397217, "logps/rejected": -1.6950451135635376, "loss": 1.1968, "odds_ratio_loss": 0.6363669633865356, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11331145465373993, "rewards/margins": 0.056193046271800995, "rewards/rejected": -0.16950449347496033, "sft_loss": 1.1331145763397217, "step": 2615 }, { "epoch": 0.2, "grad_norm": 22.75711441040039, "learning_rate": 9.067406683830278e-06, "logits/chosen": -1.2250087261199951, "logits/rejected": -0.9773575663566589, "logps/chosen": -1.0626041889190674, "logps/rejected": -1.3051557540893555, "loss": 1.1219, "odds_ratio_loss": 0.5928469896316528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10626041889190674, "rewards/margins": 0.02425515279173851, "rewards/rejected": -0.13051557540893555, "sft_loss": 1.0626041889190674, "step": 2620 }, { "epoch": 0.2, "grad_norm": 6.514204978942871, "learning_rate": 9.06382241244705e-06, "logits/chosen": -1.1473455429077148, "logits/rejected": -0.6742517352104187, "logps/chosen": -1.03928804397583, "logps/rejected": -3.828688144683838, "loss": 1.0807, "odds_ratio_loss": 0.41433659195899963, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10392878949642181, "rewards/margins": 0.27894002199172974, "rewards/rejected": -0.38286882638931274, "sft_loss": 1.03928804397583, "step": 2625 }, { "epoch": 0.2, "grad_norm": 81.91191864013672, "learning_rate": 9.060231977761173e-06, "logits/chosen": -1.4000599384307861, "logits/rejected": -0.8785271644592285, "logps/chosen": -0.8743749856948853, "logps/rejected": -4.196818828582764, "loss": 0.9417, "odds_ratio_loss": 0.673501193523407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08743749558925629, "rewards/margins": 0.3322443962097168, "rewards/rejected": -0.4196818470954895, "sft_loss": 0.8743749856948853, "step": 2630 }, { "epoch": 0.2, "grad_norm": 25.8214111328125, "learning_rate": 9.056635385217994e-06, "logits/chosen": -1.306239366531372, "logits/rejected": -1.0091142654418945, "logps/chosen": -0.8158831596374512, "logps/rejected": -1.0368363857269287, "loss": 0.8713, "odds_ratio_loss": 0.5539509057998657, "rewards/accuracies": 1.0, "rewards/chosen": -0.08158832043409348, "rewards/margins": 0.022095322608947754, "rewards/rejected": -0.10368363559246063, "sft_loss": 0.8158831596374512, "step": 2635 }, { "epoch": 0.21, "grad_norm": 6.849333763122559, "learning_rate": 9.053032640272202e-06, "logits/chosen": -1.411442756652832, "logits/rejected": -1.0196300745010376, "logps/chosen": -0.845949649810791, "logps/rejected": -1.260571002960205, "loss": 0.8894, "odds_ratio_loss": 0.43468767404556274, "rewards/accuracies": 1.0, "rewards/chosen": -0.0845949649810791, "rewards/margins": 0.041462142020463943, "rewards/rejected": -0.12605710327625275, "sft_loss": 0.845949649810791, "step": 2640 }, { "epoch": 0.21, "grad_norm": 14.452634811401367, "learning_rate": 9.049423748387819e-06, "logits/chosen": -1.3134500980377197, "logits/rejected": -0.8436886668205261, "logps/chosen": -1.2949182987213135, "logps/rejected": -4.814360618591309, "loss": 1.337, "odds_ratio_loss": 0.4204350411891937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12949183583259583, "rewards/margins": 0.351944237947464, "rewards/rejected": -0.4814360737800598, "sft_loss": 1.2949182987213135, "step": 2645 }, { "epoch": 0.21, "grad_norm": 29.886714935302734, "learning_rate": 9.045808715038184e-06, "logits/chosen": -1.2806081771850586, "logits/rejected": -0.842139720916748, "logps/chosen": -0.9247692227363586, "logps/rejected": -1.1130586862564087, "loss": 0.982, "odds_ratio_loss": 0.5720853209495544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09247691929340363, "rewards/margins": 0.018828941509127617, "rewards/rejected": -0.1113058552145958, "sft_loss": 0.9247692227363586, "step": 2650 }, { "epoch": 0.21, "grad_norm": 33.8629035949707, "learning_rate": 9.04218754570596e-06, "logits/chosen": -1.1466368436813354, "logits/rejected": -0.9131369590759277, "logps/chosen": -1.3172786235809326, "logps/rejected": -3.089834690093994, "loss": 1.3603, "odds_ratio_loss": 0.43001121282577515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13172785937786102, "rewards/margins": 0.17725564539432526, "rewards/rejected": -0.3089835047721863, "sft_loss": 1.3172786235809326, "step": 2655 }, { "epoch": 0.21, "grad_norm": 15.455634117126465, "learning_rate": 9.038560245883105e-06, "logits/chosen": -1.412030577659607, "logits/rejected": -1.102386236190796, "logps/chosen": -1.2090859413146973, "logps/rejected": -1.5070087909698486, "loss": 1.2694, "odds_ratio_loss": 0.6028513312339783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12090860307216644, "rewards/margins": 0.02979227900505066, "rewards/rejected": -0.1507008820772171, "sft_loss": 1.2090859413146973, "step": 2660 }, { "epoch": 0.21, "grad_norm": 5.274667739868164, "learning_rate": 9.034926821070883e-06, "logits/chosen": -1.2878334522247314, "logits/rejected": -0.8525265455245972, "logps/chosen": -1.1998794078826904, "logps/rejected": -1.4124677181243896, "loss": 1.2567, "odds_ratio_loss": 0.5681849718093872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11998794227838516, "rewards/margins": 0.02125883661210537, "rewards/rejected": -0.1412467658519745, "sft_loss": 1.1998794078826904, "step": 2665 }, { "epoch": 0.21, "grad_norm": 17.122329711914062, "learning_rate": 9.03128727677984e-06, "logits/chosen": -1.2906736135482788, "logits/rejected": -0.612545907497406, "logps/chosen": -1.0701696872711182, "logps/rejected": -1.8918075561523438, "loss": 1.1072, "odds_ratio_loss": 0.370770126581192, "rewards/accuracies": 1.0, "rewards/chosen": -0.10701696574687958, "rewards/margins": 0.08216379582881927, "rewards/rejected": -0.18918077647686005, "sft_loss": 1.0701696872711182, "step": 2670 }, { "epoch": 0.21, "grad_norm": 13.244214057922363, "learning_rate": 9.027641618529813e-06, "logits/chosen": -1.4152615070343018, "logits/rejected": -1.1327699422836304, "logps/chosen": -1.1863515377044678, "logps/rejected": -1.7167526483535767, "loss": 1.2365, "odds_ratio_loss": 0.5016016960144043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1186351552605629, "rewards/margins": 0.05304010584950447, "rewards/rejected": -0.17167526483535767, "sft_loss": 1.1863515377044678, "step": 2675 }, { "epoch": 0.21, "grad_norm": 6.127563953399658, "learning_rate": 9.023989851849899e-06, "logits/chosen": -1.3840445280075073, "logits/rejected": -1.0111591815948486, "logps/chosen": -1.281292200088501, "logps/rejected": -2.281388282775879, "loss": 1.3298, "odds_ratio_loss": 0.4846586287021637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12812921404838562, "rewards/margins": 0.10000962018966675, "rewards/rejected": -0.22813884913921356, "sft_loss": 1.281292200088501, "step": 2680 }, { "epoch": 0.21, "grad_norm": 6.400108814239502, "learning_rate": 9.02033198227847e-06, "logits/chosen": -1.47408127784729, "logits/rejected": -0.6259415745735168, "logps/chosen": -0.9586564898490906, "logps/rejected": -4.168588161468506, "loss": 0.9967, "odds_ratio_loss": 0.38054126501083374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0958656519651413, "rewards/margins": 0.32099318504333496, "rewards/rejected": -0.41685882210731506, "sft_loss": 0.9586564898490906, "step": 2685 }, { "epoch": 0.21, "grad_norm": 27.631380081176758, "learning_rate": 9.01666801536315e-06, "logits/chosen": -1.2730721235275269, "logits/rejected": -1.0189874172210693, "logps/chosen": -1.5083661079406738, "logps/rejected": -2.5043952465057373, "loss": 1.5848, "odds_ratio_loss": 0.7645183205604553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15083661675453186, "rewards/margins": 0.09960292279720306, "rewards/rejected": -0.25043952465057373, "sft_loss": 1.5083661079406738, "step": 2690 }, { "epoch": 0.21, "grad_norm": 48.11613082885742, "learning_rate": 9.012997956660807e-06, "logits/chosen": -1.2310947179794312, "logits/rejected": -0.9574726819992065, "logps/chosen": -0.809436023235321, "logps/rejected": -1.5579251050949097, "loss": 0.8551, "odds_ratio_loss": 0.45699796080589294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08094360679388046, "rewards/margins": 0.07484889030456543, "rewards/rejected": -0.1557925045490265, "sft_loss": 0.809436023235321, "step": 2695 }, { "epoch": 0.21, "grad_norm": 8.816434860229492, "learning_rate": 9.009321811737553e-06, "logits/chosen": -1.2568022012710571, "logits/rejected": -0.9708755612373352, "logps/chosen": -0.8139735460281372, "logps/rejected": -1.1872222423553467, "loss": 0.868, "odds_ratio_loss": 0.5405431985855103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08139735460281372, "rewards/margins": 0.03732486814260483, "rewards/rejected": -0.11872222274541855, "sft_loss": 0.8139735460281372, "step": 2700 }, { "epoch": 0.21, "grad_norm": 21.41962242126465, "learning_rate": 9.005639586168728e-06, "logits/chosen": -1.076005458831787, "logits/rejected": -1.0931252241134644, "logps/chosen": -0.6707652807235718, "logps/rejected": -2.1130595207214355, "loss": 0.6878, "odds_ratio_loss": 0.16990558803081512, "rewards/accuracies": 1.0, "rewards/chosen": -0.06707652658224106, "rewards/margins": 0.14422942698001862, "rewards/rejected": -0.21130594611167908, "sft_loss": 0.6707652807235718, "step": 2705 }, { "epoch": 0.21, "grad_norm": 31.207767486572266, "learning_rate": 9.001951285538897e-06, "logits/chosen": -1.1946260929107666, "logits/rejected": -0.9534111022949219, "logps/chosen": -1.2287012338638306, "logps/rejected": -2.725151538848877, "loss": 1.2922, "odds_ratio_loss": 0.634853720664978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12287012487649918, "rewards/margins": 0.14964503049850464, "rewards/rejected": -0.2725151479244232, "sft_loss": 1.2287012338638306, "step": 2710 }, { "epoch": 0.21, "grad_norm": 36.257164001464844, "learning_rate": 8.998256915441831e-06, "logits/chosen": -1.2600607872009277, "logits/rejected": -0.5680662393569946, "logps/chosen": -0.9931353330612183, "logps/rejected": -2.1860270500183105, "loss": 1.0308, "odds_ratio_loss": 0.37645816802978516, "rewards/accuracies": 1.0, "rewards/chosen": -0.09931354224681854, "rewards/margins": 0.11928915977478027, "rewards/rejected": -0.21860270202159882, "sft_loss": 0.9931353330612183, "step": 2715 }, { "epoch": 0.21, "grad_norm": 9.290372848510742, "learning_rate": 8.994556481480517e-06, "logits/chosen": -0.9715999364852905, "logits/rejected": -0.8543729782104492, "logps/chosen": -0.6672913432121277, "logps/rejected": -1.5520168542861938, "loss": 0.6947, "odds_ratio_loss": 0.2744672894477844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06672913581132889, "rewards/margins": 0.08847255259752274, "rewards/rejected": -0.15520168840885162, "sft_loss": 0.6672913432121277, "step": 2720 }, { "epoch": 0.21, "grad_norm": 29.265583038330078, "learning_rate": 8.990849989267127e-06, "logits/chosen": -1.2258832454681396, "logits/rejected": -1.1225849390029907, "logps/chosen": -0.988205075263977, "logps/rejected": -0.9803763628005981, "loss": 1.063, "odds_ratio_loss": 0.7477890849113464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0988205075263977, "rewards/margins": -0.0007828705129213631, "rewards/rejected": -0.09803762286901474, "sft_loss": 0.988205075263977, "step": 2725 }, { "epoch": 0.21, "grad_norm": 30.148481369018555, "learning_rate": 8.987137444423033e-06, "logits/chosen": -1.217212438583374, "logits/rejected": -0.7578709125518799, "logps/chosen": -0.9597604870796204, "logps/rejected": -1.3635588884353638, "loss": 1.0054, "odds_ratio_loss": 0.4561527669429779, "rewards/accuracies": 1.0, "rewards/chosen": -0.09597603976726532, "rewards/margins": 0.0403798371553421, "rewards/rejected": -0.13635587692260742, "sft_loss": 0.9597604870796204, "step": 2730 }, { "epoch": 0.21, "grad_norm": 6.763095378875732, "learning_rate": 8.983418852578776e-06, "logits/chosen": -1.2369617223739624, "logits/rejected": -0.9568303823471069, "logps/chosen": -1.1668567657470703, "logps/rejected": -1.3419616222381592, "loss": 1.2252, "odds_ratio_loss": 0.5838689804077148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11668567359447479, "rewards/margins": 0.01751049794256687, "rewards/rejected": -0.1341961771249771, "sft_loss": 1.1668567657470703, "step": 2735 }, { "epoch": 0.21, "grad_norm": 6.214346885681152, "learning_rate": 8.979694219374076e-06, "logits/chosen": -1.202335238456726, "logits/rejected": -1.1979544162750244, "logps/chosen": -0.9494991302490234, "logps/rejected": -2.23736834526062, "loss": 0.9914, "odds_ratio_loss": 0.4188031256198883, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09494991600513458, "rewards/margins": 0.12878692150115967, "rewards/rejected": -0.22373683750629425, "sft_loss": 0.9494991302490234, "step": 2740 }, { "epoch": 0.21, "grad_norm": 6.824653625488281, "learning_rate": 8.975963550457809e-06, "logits/chosen": -1.456319808959961, "logits/rejected": -1.2307958602905273, "logps/chosen": -0.7797650694847107, "logps/rejected": -1.6499179601669312, "loss": 0.8556, "odds_ratio_loss": 0.7585657835006714, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07797650992870331, "rewards/margins": 0.087015300989151, "rewards/rejected": -0.1649918109178543, "sft_loss": 0.7797650694847107, "step": 2745 }, { "epoch": 0.21, "grad_norm": 12.145956993103027, "learning_rate": 8.97222685148801e-06, "logits/chosen": -1.3519519567489624, "logits/rejected": -1.1805908679962158, "logps/chosen": -1.2413402795791626, "logps/rejected": -3.3706047534942627, "loss": 1.284, "odds_ratio_loss": 0.4268109202384949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12413404136896133, "rewards/margins": 0.2129264622926712, "rewards/rejected": -0.33706048130989075, "sft_loss": 1.2413402795791626, "step": 2750 }, { "epoch": 0.21, "grad_norm": 11.599871635437012, "learning_rate": 8.968484128131858e-06, "logits/chosen": -1.253807544708252, "logits/rejected": -0.717187225818634, "logps/chosen": -1.1322736740112305, "logps/rejected": -4.610743045806885, "loss": 1.1829, "odds_ratio_loss": 0.5058093070983887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11322736740112305, "rewards/margins": 0.34784695506095886, "rewards/rejected": -0.4610742926597595, "sft_loss": 1.1322736740112305, "step": 2755 }, { "epoch": 0.21, "grad_norm": 5.39821720123291, "learning_rate": 8.964735386065669e-06, "logits/chosen": -1.196275234222412, "logits/rejected": -0.8260028958320618, "logps/chosen": -1.1362543106079102, "logps/rejected": -1.7856251001358032, "loss": 1.1734, "odds_ratio_loss": 0.3714643120765686, "rewards/accuracies": 1.0, "rewards/chosen": -0.11362544447183609, "rewards/margins": 0.06493707001209259, "rewards/rejected": -0.1785624921321869, "sft_loss": 1.1362543106079102, "step": 2760 }, { "epoch": 0.22, "grad_norm": 14.481295585632324, "learning_rate": 8.960980630974881e-06, "logits/chosen": -0.6916632056236267, "logits/rejected": -1.1656701564788818, "logps/chosen": -1.1203669309616089, "logps/rejected": -1.38091242313385, "loss": 1.1758, "odds_ratio_loss": 0.5547033548355103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11203669011592865, "rewards/margins": 0.026054542511701584, "rewards/rejected": -0.13809125125408173, "sft_loss": 1.1203669309616089, "step": 2765 }, { "epoch": 0.22, "grad_norm": 5.397483825683594, "learning_rate": 8.957219868554064e-06, "logits/chosen": -1.1858707666397095, "logits/rejected": -0.7736166715621948, "logps/chosen": -0.6137800812721252, "logps/rejected": -2.991201877593994, "loss": 0.6472, "odds_ratio_loss": 0.3342156410217285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06137801334261894, "rewards/margins": 0.23774215579032898, "rewards/rejected": -0.2991201877593994, "sft_loss": 0.6137800812721252, "step": 2770 }, { "epoch": 0.22, "grad_norm": 25.549015045166016, "learning_rate": 8.953453104506886e-06, "logits/chosen": -1.0718984603881836, "logits/rejected": -1.097247838973999, "logps/chosen": -1.0510852336883545, "logps/rejected": -1.8239879608154297, "loss": 1.0958, "odds_ratio_loss": 0.4471181035041809, "rewards/accuracies": 1.0, "rewards/chosen": -0.10510852187871933, "rewards/margins": 0.07729027420282364, "rewards/rejected": -0.18239879608154297, "sft_loss": 1.0510852336883545, "step": 2775 }, { "epoch": 0.22, "grad_norm": 24.154788970947266, "learning_rate": 8.949680344546125e-06, "logits/chosen": -1.3380266427993774, "logits/rejected": -1.0320308208465576, "logps/chosen": -1.1754895448684692, "logps/rejected": -1.2273073196411133, "loss": 1.2414, "odds_ratio_loss": 0.6591736674308777, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11754894256591797, "rewards/margins": 0.0051817819476127625, "rewards/rejected": -0.12273073196411133, "sft_loss": 1.1754895448684692, "step": 2780 }, { "epoch": 0.22, "grad_norm": 6.844038009643555, "learning_rate": 8.94590159439365e-06, "logits/chosen": -1.368798851966858, "logits/rejected": -0.6834617257118225, "logps/chosen": -1.0847288370132446, "logps/rejected": -1.476498007774353, "loss": 1.1346, "odds_ratio_loss": 0.49874448776245117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10847288370132446, "rewards/margins": 0.03917691111564636, "rewards/rejected": -0.14764979481697083, "sft_loss": 1.0847288370132446, "step": 2785 }, { "epoch": 0.22, "grad_norm": 16.110488891601562, "learning_rate": 8.942116859780416e-06, "logits/chosen": -1.2514593601226807, "logits/rejected": -1.0386936664581299, "logps/chosen": -0.8438073396682739, "logps/rejected": -3.5511767864227295, "loss": 0.88, "odds_ratio_loss": 0.3621874451637268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08438073098659515, "rewards/margins": 0.270736962556839, "rewards/rejected": -0.35511770844459534, "sft_loss": 0.8438073396682739, "step": 2790 }, { "epoch": 0.22, "grad_norm": 6.961461067199707, "learning_rate": 8.938326146446455e-06, "logits/chosen": -1.1863247156143188, "logits/rejected": -0.8467245101928711, "logps/chosen": -1.0176557302474976, "logps/rejected": -1.874770164489746, "loss": 1.0542, "odds_ratio_loss": 0.3654174208641052, "rewards/accuracies": 1.0, "rewards/chosen": -0.10176558792591095, "rewards/margins": 0.08571141958236694, "rewards/rejected": -0.1874770075082779, "sft_loss": 1.0176557302474976, "step": 2795 }, { "epoch": 0.22, "grad_norm": 5.351832389831543, "learning_rate": 8.934529460140864e-06, "logits/chosen": -1.2538938522338867, "logits/rejected": -0.8942509889602661, "logps/chosen": -0.9106897115707397, "logps/rejected": -2.4164175987243652, "loss": 0.94, "odds_ratio_loss": 0.2929394841194153, "rewards/accuracies": 1.0, "rewards/chosen": -0.09106897562742233, "rewards/margins": 0.1505727767944336, "rewards/rejected": -0.24164175987243652, "sft_loss": 0.9106897115707397, "step": 2800 }, { "epoch": 0.22, "grad_norm": 15.171923637390137, "learning_rate": 8.930726806621797e-06, "logits/chosen": -1.1682040691375732, "logits/rejected": -0.8433181643486023, "logps/chosen": -1.0888586044311523, "logps/rejected": -4.776035785675049, "loss": 1.1557, "odds_ratio_loss": 0.6680582761764526, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.10888586193323135, "rewards/margins": 0.3687177300453186, "rewards/rejected": -0.47760358452796936, "sft_loss": 1.0888586044311523, "step": 2805 }, { "epoch": 0.22, "grad_norm": 18.85120391845703, "learning_rate": 8.926918191656465e-06, "logits/chosen": -1.31667160987854, "logits/rejected": -1.06211256980896, "logps/chosen": -1.033383846282959, "logps/rejected": -3.6757144927978516, "loss": 1.072, "odds_ratio_loss": 0.3861328065395355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10333838313817978, "rewards/margins": 0.2642330825328827, "rewards/rejected": -0.36757150292396545, "sft_loss": 1.033383846282959, "step": 2810 }, { "epoch": 0.22, "grad_norm": 50.54446792602539, "learning_rate": 8.923103621021114e-06, "logits/chosen": -1.196171522140503, "logits/rejected": -1.0727007389068604, "logps/chosen": -1.1526124477386475, "logps/rejected": -1.1024290323257446, "loss": 1.2311, "odds_ratio_loss": 0.7850225567817688, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1152612566947937, "rewards/margins": -0.0050183548592031, "rewards/rejected": -0.11024288833141327, "sft_loss": 1.1526124477386475, "step": 2815 }, { "epoch": 0.22, "grad_norm": 5.652276515960693, "learning_rate": 8.919283100501025e-06, "logits/chosen": -1.1610941886901855, "logits/rejected": -0.6269224882125854, "logps/chosen": -0.9240479469299316, "logps/rejected": -1.1618537902832031, "loss": 0.9806, "odds_ratio_loss": 0.5652657151222229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0924047976732254, "rewards/margins": 0.02378058061003685, "rewards/rejected": -0.11618538200855255, "sft_loss": 0.9240479469299316, "step": 2820 }, { "epoch": 0.22, "grad_norm": 6.214611530303955, "learning_rate": 8.915456635890503e-06, "logits/chosen": -1.1411340236663818, "logits/rejected": -0.859778881072998, "logps/chosen": -0.7868792414665222, "logps/rejected": -5.329236030578613, "loss": 0.8132, "odds_ratio_loss": 0.26342785358428955, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07868792116641998, "rewards/margins": 0.45423564314842224, "rewards/rejected": -0.5329235792160034, "sft_loss": 0.7868792414665222, "step": 2825 }, { "epoch": 0.22, "grad_norm": 14.912748336791992, "learning_rate": 8.911624232992867e-06, "logits/chosen": -1.376704454421997, "logits/rejected": -0.46073848009109497, "logps/chosen": -1.1049047708511353, "logps/rejected": -1.2518284320831299, "loss": 1.1677, "odds_ratio_loss": 0.6280218362808228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11049047857522964, "rewards/margins": 0.014692360535264015, "rewards/rejected": -0.1251828372478485, "sft_loss": 1.1049047708511353, "step": 2830 }, { "epoch": 0.22, "grad_norm": 12.590785026550293, "learning_rate": 8.90778589762044e-06, "logits/chosen": -1.136904001235962, "logits/rejected": -0.8878492116928101, "logps/chosen": -1.616204023361206, "logps/rejected": -1.5891444683074951, "loss": 1.7234, "odds_ratio_loss": 1.0716639757156372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1616203933954239, "rewards/margins": -0.0027059554122388363, "rewards/rejected": -0.1589144468307495, "sft_loss": 1.616204023361206, "step": 2835 }, { "epoch": 0.22, "grad_norm": 8.816327095031738, "learning_rate": 8.90394163559455e-06, "logits/chosen": -1.3756773471832275, "logits/rejected": -1.093867301940918, "logps/chosen": -0.8899661302566528, "logps/rejected": -5.7786455154418945, "loss": 0.9543, "odds_ratio_loss": 0.6429457664489746, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08899661153554916, "rewards/margins": 0.48886799812316895, "rewards/rejected": -0.5778645277023315, "sft_loss": 0.8899661302566528, "step": 2840 }, { "epoch": 0.22, "grad_norm": 7.251855373382568, "learning_rate": 8.900091452745506e-06, "logits/chosen": -1.2806169986724854, "logits/rejected": -1.050431251525879, "logps/chosen": -0.8310653567314148, "logps/rejected": -0.6103538274765015, "loss": 0.9344, "odds_ratio_loss": 1.0331388711929321, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.08310654014348984, "rewards/margins": -0.02207115665078163, "rewards/rejected": -0.06103537604212761, "sft_loss": 0.8310653567314148, "step": 2845 }, { "epoch": 0.22, "grad_norm": 10.528026580810547, "learning_rate": 8.896235354912597e-06, "logits/chosen": -1.1204943656921387, "logits/rejected": -1.0685988664627075, "logps/chosen": -1.2023366689682007, "logps/rejected": -1.5722219944000244, "loss": 1.2622, "odds_ratio_loss": 0.598936140537262, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.12023366987705231, "rewards/margins": 0.03698853403329849, "rewards/rejected": -0.1572222113609314, "sft_loss": 1.2023366689682007, "step": 2850 }, { "epoch": 0.22, "grad_norm": 74.59951782226562, "learning_rate": 8.892373347944088e-06, "logits/chosen": -1.1229302883148193, "logits/rejected": -0.9611026644706726, "logps/chosen": -1.1025960445404053, "logps/rejected": -2.4425206184387207, "loss": 1.1763, "odds_ratio_loss": 0.7375203967094421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11025960743427277, "rewards/margins": 0.13399241864681244, "rewards/rejected": -0.2442520409822464, "sft_loss": 1.1025960445404053, "step": 2855 }, { "epoch": 0.22, "grad_norm": 16.174827575683594, "learning_rate": 8.888505437697201e-06, "logits/chosen": -1.128024697303772, "logits/rejected": -0.9245980381965637, "logps/chosen": -0.8279545903205872, "logps/rejected": -3.0395867824554443, "loss": 0.8703, "odds_ratio_loss": 0.4236716330051422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08279546350240707, "rewards/margins": 0.22116322815418243, "rewards/rejected": -0.3039587140083313, "sft_loss": 0.8279545903205872, "step": 2860 }, { "epoch": 0.22, "grad_norm": 22.010711669921875, "learning_rate": 8.884631630038117e-06, "logits/chosen": -1.1404074430465698, "logits/rejected": -0.6384499669075012, "logps/chosen": -1.0815197229385376, "logps/rejected": -2.512594699859619, "loss": 1.111, "odds_ratio_loss": 0.29465144872665405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10815198719501495, "rewards/margins": 0.14310748875141144, "rewards/rejected": -0.2512594759464264, "sft_loss": 1.0815197229385376, "step": 2865 }, { "epoch": 0.22, "grad_norm": 7.166597843170166, "learning_rate": 8.88075193084195e-06, "logits/chosen": -1.188714861869812, "logits/rejected": -0.8500461578369141, "logps/chosen": -0.9702251553535461, "logps/rejected": -1.9410117864608765, "loss": 1.0235, "odds_ratio_loss": 0.5329502820968628, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09702251851558685, "rewards/margins": 0.09707866609096527, "rewards/rejected": -0.19410118460655212, "sft_loss": 0.9702251553535461, "step": 2870 }, { "epoch": 0.22, "grad_norm": 7.927453994750977, "learning_rate": 8.876866345992762e-06, "logits/chosen": -1.1141269207000732, "logits/rejected": -0.5961653590202332, "logps/chosen": -0.9098241925239563, "logps/rejected": -1.1537498235702515, "loss": 0.9654, "odds_ratio_loss": 0.5554467439651489, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09098242223262787, "rewards/margins": 0.024392560124397278, "rewards/rejected": -0.11537498235702515, "sft_loss": 0.9098241925239563, "step": 2875 }, { "epoch": 0.22, "grad_norm": 7.518460750579834, "learning_rate": 8.872974881383535e-06, "logits/chosen": -1.0662617683410645, "logits/rejected": -1.094089388847351, "logps/chosen": -0.6162663698196411, "logps/rejected": -1.763864517211914, "loss": 0.6453, "odds_ratio_loss": 0.2900220453739166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06162664294242859, "rewards/margins": 0.11475982517004013, "rewards/rejected": -0.17638644576072693, "sft_loss": 0.6162663698196411, "step": 2880 }, { "epoch": 0.22, "grad_norm": 5.836972236633301, "learning_rate": 8.869077542916167e-06, "logits/chosen": -1.1317639350891113, "logits/rejected": -0.6082301735877991, "logps/chosen": -0.9128414392471313, "logps/rejected": -1.3924942016601562, "loss": 0.9544, "odds_ratio_loss": 0.4153948426246643, "rewards/accuracies": 1.0, "rewards/chosen": -0.0912841409444809, "rewards/margins": 0.04796527698636055, "rewards/rejected": -0.13924942910671234, "sft_loss": 0.9128414392471313, "step": 2885 }, { "epoch": 0.22, "grad_norm": 8.127260208129883, "learning_rate": 8.86517433650147e-06, "logits/chosen": -1.1355172395706177, "logits/rejected": -0.9076636433601379, "logps/chosen": -0.9803677797317505, "logps/rejected": -0.9864256978034973, "loss": 1.0539, "odds_ratio_loss": 0.7349095344543457, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09803678095340729, "rewards/margins": 0.0006057933205738664, "rewards/rejected": -0.09864256531000137, "sft_loss": 0.9803677797317505, "step": 2890 }, { "epoch": 0.23, "grad_norm": 33.14106750488281, "learning_rate": 8.86126526805915e-06, "logits/chosen": -1.4334921836853027, "logits/rejected": -1.1608220338821411, "logps/chosen": -0.9514063000679016, "logps/rejected": -1.1313843727111816, "loss": 1.0146, "odds_ratio_loss": 0.6314960718154907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09514062106609344, "rewards/margins": 0.017997819930315018, "rewards/rejected": -0.11313845217227936, "sft_loss": 0.9514063000679016, "step": 2895 }, { "epoch": 0.23, "grad_norm": 5.668753147125244, "learning_rate": 8.857350343517804e-06, "logits/chosen": -1.2259176969528198, "logits/rejected": -0.9829230308532715, "logps/chosen": -0.9202004671096802, "logps/rejected": -1.180841326713562, "loss": 0.9764, "odds_ratio_loss": 0.5624373555183411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09202004969120026, "rewards/margins": 0.02606409788131714, "rewards/rejected": -0.1180841475725174, "sft_loss": 0.9202004671096802, "step": 2900 }, { "epoch": 0.23, "grad_norm": 19.66217803955078, "learning_rate": 8.853429568814913e-06, "logits/chosen": -1.197495698928833, "logits/rejected": -0.9274722933769226, "logps/chosen": -0.8191972970962524, "logps/rejected": -1.4239537715911865, "loss": 0.8657, "odds_ratio_loss": 0.4648253917694092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08191972225904465, "rewards/margins": 0.060475654900074005, "rewards/rejected": -0.14239537715911865, "sft_loss": 0.8191972970962524, "step": 2905 }, { "epoch": 0.23, "grad_norm": 6.289087295532227, "learning_rate": 8.849502949896831e-06, "logits/chosen": -1.064294695854187, "logits/rejected": -1.1289174556732178, "logps/chosen": -0.6857664585113525, "logps/rejected": -4.31264591217041, "loss": 0.7191, "odds_ratio_loss": 0.3333652913570404, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06857664883136749, "rewards/margins": 0.362687885761261, "rewards/rejected": -0.4312645494937897, "sft_loss": 0.6857664585113525, "step": 2910 }, { "epoch": 0.23, "grad_norm": 17.237995147705078, "learning_rate": 8.845570492718776e-06, "logits/chosen": -1.1497989892959595, "logits/rejected": -1.2945317029953003, "logps/chosen": -0.995174765586853, "logps/rejected": -1.2497352361679077, "loss": 1.0528, "odds_ratio_loss": 0.5761274099349976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09951747953891754, "rewards/margins": 0.02545604668557644, "rewards/rejected": -0.12497353553771973, "sft_loss": 0.995174765586853, "step": 2915 }, { "epoch": 0.23, "grad_norm": 6.107137203216553, "learning_rate": 8.841632203244813e-06, "logits/chosen": -1.2816154956817627, "logits/rejected": -0.6775996088981628, "logps/chosen": -0.8471586108207703, "logps/rejected": -4.0009355545043945, "loss": 0.8722, "odds_ratio_loss": 0.2508474588394165, "rewards/accuracies": 1.0, "rewards/chosen": -0.08471586555242538, "rewards/margins": 0.3153776526451111, "rewards/rejected": -0.40009355545043945, "sft_loss": 0.8471586108207703, "step": 2920 }, { "epoch": 0.23, "grad_norm": 10.699955940246582, "learning_rate": 8.837688087447862e-06, "logits/chosen": -1.3370736837387085, "logits/rejected": -0.5626960396766663, "logps/chosen": -1.0836195945739746, "logps/rejected": -9.543293952941895, "loss": 1.1216, "odds_ratio_loss": 0.3799092769622803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10836195945739746, "rewards/margins": 0.8459674715995789, "rewards/rejected": -0.9543293714523315, "sft_loss": 1.0836195945739746, "step": 2925 }, { "epoch": 0.23, "grad_norm": 12.650343894958496, "learning_rate": 8.833738151309677e-06, "logits/chosen": -1.4565495252609253, "logits/rejected": -1.268577218055725, "logps/chosen": -0.9817711710929871, "logps/rejected": -4.19735860824585, "loss": 1.0108, "odds_ratio_loss": 0.2902795970439911, "rewards/accuracies": 1.0, "rewards/chosen": -0.09817712008953094, "rewards/margins": 0.32155877351760864, "rewards/rejected": -0.4197359085083008, "sft_loss": 0.9817711710929871, "step": 2930 }, { "epoch": 0.23, "grad_norm": 9.384466171264648, "learning_rate": 8.829782400820833e-06, "logits/chosen": -1.376010537147522, "logits/rejected": -0.5250366926193237, "logps/chosen": -0.8629180192947388, "logps/rejected": -1.7529948949813843, "loss": 0.8961, "odds_ratio_loss": 0.33201155066490173, "rewards/accuracies": 1.0, "rewards/chosen": -0.08629179745912552, "rewards/margins": 0.08900769799947739, "rewards/rejected": -0.1752994954586029, "sft_loss": 0.8629180192947388, "step": 2935 }, { "epoch": 0.23, "grad_norm": 6.656403541564941, "learning_rate": 8.825820841980729e-06, "logits/chosen": -1.4098700284957886, "logits/rejected": -1.2165441513061523, "logps/chosen": -0.8539711833000183, "logps/rejected": -2.8836426734924316, "loss": 0.8861, "odds_ratio_loss": 0.321702778339386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08539711683988571, "rewards/margins": 0.20296712219715118, "rewards/rejected": -0.2883642315864563, "sft_loss": 0.8539711833000183, "step": 2940 }, { "epoch": 0.23, "grad_norm": 12.5855073928833, "learning_rate": 8.821853480797574e-06, "logits/chosen": -1.4244425296783447, "logits/rejected": -1.0870120525360107, "logps/chosen": -1.1766611337661743, "logps/rejected": -1.3328752517700195, "loss": 1.2367, "odds_ratio_loss": 0.6007108688354492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11766611039638519, "rewards/margins": 0.015621413476765156, "rewards/rejected": -0.13328751921653748, "sft_loss": 1.1766611337661743, "step": 2945 }, { "epoch": 0.23, "grad_norm": 7.5176310539245605, "learning_rate": 8.817880323288376e-06, "logits/chosen": -1.2485527992248535, "logits/rejected": -1.1338937282562256, "logps/chosen": -1.0665452480316162, "logps/rejected": -1.3328628540039062, "loss": 1.1206, "odds_ratio_loss": 0.5406354665756226, "rewards/accuracies": 1.0, "rewards/chosen": -0.10665452480316162, "rewards/margins": 0.026631761342287064, "rewards/rejected": -0.13328629732131958, "sft_loss": 1.0665452480316162, "step": 2950 }, { "epoch": 0.23, "grad_norm": 10.515802383422852, "learning_rate": 8.813901375478928e-06, "logits/chosen": -1.373665452003479, "logits/rejected": -1.2379958629608154, "logps/chosen": -0.9441508054733276, "logps/rejected": -3.6124789714813232, "loss": 0.9766, "odds_ratio_loss": 0.32453179359436035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.094415083527565, "rewards/margins": 0.2668328285217285, "rewards/rejected": -0.3612478971481323, "sft_loss": 0.9441508054733276, "step": 2955 }, { "epoch": 0.23, "grad_norm": 8.03735065460205, "learning_rate": 8.809916643403813e-06, "logits/chosen": -1.3460569381713867, "logits/rejected": -0.6389718651771545, "logps/chosen": -1.1100437641143799, "logps/rejected": -1.21809983253479, "loss": 1.1721, "odds_ratio_loss": 0.6203465461730957, "rewards/accuracies": 1.0, "rewards/chosen": -0.11100438982248306, "rewards/margins": 0.010805593803524971, "rewards/rejected": -0.12180998176336288, "sft_loss": 1.1100437641143799, "step": 2960 }, { "epoch": 0.23, "grad_norm": 6.766620635986328, "learning_rate": 8.805926133106382e-06, "logits/chosen": -1.2804914712905884, "logits/rejected": -0.7814501523971558, "logps/chosen": -1.7104488611221313, "logps/rejected": -5.756571292877197, "loss": 1.7352, "odds_ratio_loss": 0.24745836853981018, "rewards/accuracies": 1.0, "rewards/chosen": -0.17104490101337433, "rewards/margins": 0.4046122431755066, "rewards/rejected": -0.5756571888923645, "sft_loss": 1.7104488611221313, "step": 2965 }, { "epoch": 0.23, "grad_norm": 16.29014778137207, "learning_rate": 8.80192985063875e-06, "logits/chosen": -1.2973188161849976, "logits/rejected": -1.2357347011566162, "logps/chosen": -0.8642646670341492, "logps/rejected": -1.2275454998016357, "loss": 0.9122, "odds_ratio_loss": 0.4792478680610657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08642647415399551, "rewards/margins": 0.036328066140413284, "rewards/rejected": -0.1227545365691185, "sft_loss": 0.8642646670341492, "step": 2970 }, { "epoch": 0.23, "grad_norm": 6.046884059906006, "learning_rate": 8.797927802061791e-06, "logits/chosen": -1.42294442653656, "logits/rejected": -0.8688791990280151, "logps/chosen": -0.9156146049499512, "logps/rejected": -0.9524520635604858, "loss": 0.9983, "odds_ratio_loss": 0.8266631364822388, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09156147390604019, "rewards/margins": 0.0036837488878518343, "rewards/rejected": -0.09524521231651306, "sft_loss": 0.9156146049499512, "step": 2975 }, { "epoch": 0.23, "grad_norm": 5.413948059082031, "learning_rate": 8.793919993445114e-06, "logits/chosen": -1.3873004913330078, "logits/rejected": -0.7676628828048706, "logps/chosen": -1.1006882190704346, "logps/rejected": -1.3490431308746338, "loss": 1.1669, "odds_ratio_loss": 0.6618945002555847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11006882041692734, "rewards/margins": 0.024835485965013504, "rewards/rejected": -0.13490431010723114, "sft_loss": 1.1006882190704346, "step": 2980 }, { "epoch": 0.23, "grad_norm": 19.28611183166504, "learning_rate": 8.789906430867073e-06, "logits/chosen": -1.3977916240692139, "logits/rejected": -1.0037000179290771, "logps/chosen": -2.9157166481018066, "logps/rejected": -4.102760314941406, "loss": 3.0109, "odds_ratio_loss": 0.9522919654846191, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2915716767311096, "rewards/margins": 0.11870436370372772, "rewards/rejected": -0.41027602553367615, "sft_loss": 2.9157166481018066, "step": 2985 }, { "epoch": 0.23, "grad_norm": 6.863065719604492, "learning_rate": 8.785887120414744e-06, "logits/chosen": -1.468731164932251, "logits/rejected": -0.7206265330314636, "logps/chosen": -1.1206532716751099, "logps/rejected": -1.4424384832382202, "loss": 1.1806, "odds_ratio_loss": 0.5996042490005493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11206533014774323, "rewards/margins": 0.03217853978276253, "rewards/rejected": -0.14424386620521545, "sft_loss": 1.1206532716751099, "step": 2990 }, { "epoch": 0.23, "grad_norm": 25.038049697875977, "learning_rate": 8.781862068183922e-06, "logits/chosen": -1.0874309539794922, "logits/rejected": -1.0442769527435303, "logps/chosen": -0.9091469049453735, "logps/rejected": -8.371574401855469, "loss": 0.9341, "odds_ratio_loss": 0.2496640980243683, "rewards/accuracies": 1.0, "rewards/chosen": -0.09091468900442123, "rewards/margins": 0.7462427020072937, "rewards/rejected": -0.8371574282646179, "sft_loss": 0.9091469049453735, "step": 2995 }, { "epoch": 0.23, "grad_norm": 8.39319133758545, "learning_rate": 8.77783128027911e-06, "logits/chosen": -1.457811713218689, "logits/rejected": -0.886703610420227, "logps/chosen": -1.1325379610061646, "logps/rejected": -1.4151691198349, "loss": 1.1911, "odds_ratio_loss": 0.5853082537651062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11325379461050034, "rewards/margins": 0.02826312743127346, "rewards/rejected": -0.14151692390441895, "sft_loss": 1.1325379610061646, "step": 3000 }, { "epoch": 0.23, "grad_norm": 9.52746868133545, "learning_rate": 8.773794762813507e-06, "logits/chosen": -1.4067108631134033, "logits/rejected": -1.0598171949386597, "logps/chosen": -0.8269651532173157, "logps/rejected": -1.1110790967941284, "loss": 0.877, "odds_ratio_loss": 0.5001311302185059, "rewards/accuracies": 1.0, "rewards/chosen": -0.08269651234149933, "rewards/margins": 0.02841140702366829, "rewards/rejected": -0.11110792309045792, "sft_loss": 0.8269651532173157, "step": 3005 }, { "epoch": 0.23, "grad_norm": 12.703961372375488, "learning_rate": 8.76975252190901e-06, "logits/chosen": -1.2674983739852905, "logits/rejected": -1.1456859111785889, "logps/chosen": -1.1914570331573486, "logps/rejected": -10.400805473327637, "loss": 1.2233, "odds_ratio_loss": 0.3181864321231842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1191457062959671, "rewards/margins": 0.9209348559379578, "rewards/rejected": -1.0400805473327637, "sft_loss": 1.1914570331573486, "step": 3010 }, { "epoch": 0.23, "grad_norm": 5.381165027618408, "learning_rate": 8.765704563696187e-06, "logits/chosen": -1.3665794134140015, "logits/rejected": -0.9956024885177612, "logps/chosen": -1.2555207014083862, "logps/rejected": -1.8276020288467407, "loss": 1.3062, "odds_ratio_loss": 0.5069686770439148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12555207312107086, "rewards/margins": 0.05720812827348709, "rewards/rejected": -0.18276020884513855, "sft_loss": 1.2555207014083862, "step": 3015 }, { "epoch": 0.23, "grad_norm": 13.621868133544922, "learning_rate": 8.761650894314278e-06, "logits/chosen": -1.299678921699524, "logits/rejected": -0.7859451174736023, "logps/chosen": -1.0665977001190186, "logps/rejected": -2.9078383445739746, "loss": 1.0893, "odds_ratio_loss": 0.22742700576782227, "rewards/accuracies": 1.0, "rewards/chosen": -0.10665978491306305, "rewards/margins": 0.18412408232688904, "rewards/rejected": -0.2907838523387909, "sft_loss": 1.0665977001190186, "step": 3020 }, { "epoch": 0.24, "grad_norm": 25.114072799682617, "learning_rate": 8.757591519911192e-06, "logits/chosen": -1.3486571311950684, "logits/rejected": -1.2607526779174805, "logps/chosen": -0.8685344457626343, "logps/rejected": -1.245876431465149, "loss": 0.9133, "odds_ratio_loss": 0.4474593698978424, "rewards/accuracies": 1.0, "rewards/chosen": -0.08685345202684402, "rewards/margins": 0.03773418813943863, "rewards/rejected": -0.12458764016628265, "sft_loss": 0.8685344457626343, "step": 3025 }, { "epoch": 0.24, "grad_norm": 16.62470054626465, "learning_rate": 8.753526446643483e-06, "logits/chosen": -1.3854665756225586, "logits/rejected": -0.668252170085907, "logps/chosen": -0.8873245120048523, "logps/rejected": -9.595453262329102, "loss": 0.9257, "odds_ratio_loss": 0.3835451602935791, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08873245120048523, "rewards/margins": 0.8708128929138184, "rewards/rejected": -0.9595453143119812, "sft_loss": 0.8873245120048523, "step": 3030 }, { "epoch": 0.24, "grad_norm": 9.757149696350098, "learning_rate": 8.74945568067635e-06, "logits/chosen": -1.3147366046905518, "logits/rejected": -1.037381649017334, "logps/chosen": -1.0702860355377197, "logps/rejected": -5.288527011871338, "loss": 1.1103, "odds_ratio_loss": 0.3997670114040375, "rewards/accuracies": 1.0, "rewards/chosen": -0.10702860355377197, "rewards/margins": 0.4218241274356842, "rewards/rejected": -0.5288527011871338, "sft_loss": 1.0702860355377197, "step": 3035 }, { "epoch": 0.24, "grad_norm": 8.116015434265137, "learning_rate": 8.74537922818363e-06, "logits/chosen": -1.3341821432113647, "logits/rejected": -0.8699405789375305, "logps/chosen": -1.1210837364196777, "logps/rejected": -5.426255702972412, "loss": 1.1637, "odds_ratio_loss": 0.42615023255348206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11210837215185165, "rewards/margins": 0.4305172562599182, "rewards/rejected": -0.5426255464553833, "sft_loss": 1.1210837364196777, "step": 3040 }, { "epoch": 0.24, "grad_norm": 17.718124389648438, "learning_rate": 8.741297095347779e-06, "logits/chosen": -1.3567863702774048, "logits/rejected": -0.9880765676498413, "logps/chosen": -1.135987639427185, "logps/rejected": -1.6871782541275024, "loss": 1.2054, "odds_ratio_loss": 0.6937232613563538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1135987788438797, "rewards/margins": 0.055119067430496216, "rewards/rejected": -0.16871783137321472, "sft_loss": 1.135987639427185, "step": 3045 }, { "epoch": 0.24, "grad_norm": 4.6651716232299805, "learning_rate": 8.737209288359868e-06, "logits/chosen": -1.4298700094223022, "logits/rejected": -0.8255916833877563, "logps/chosen": -0.9301006197929382, "logps/rejected": -1.5776050090789795, "loss": 0.9788, "odds_ratio_loss": 0.48722043633461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0930100604891777, "rewards/margins": 0.06475045531988144, "rewards/rejected": -0.15776051580905914, "sft_loss": 0.9301006197929382, "step": 3050 }, { "epoch": 0.24, "grad_norm": 9.877421379089355, "learning_rate": 8.733115813419575e-06, "logits/chosen": -1.4384605884552002, "logits/rejected": -1.1984599828720093, "logps/chosen": -0.6977513432502747, "logps/rejected": -1.4935929775238037, "loss": 0.732, "odds_ratio_loss": 0.3421218693256378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06977514922618866, "rewards/margins": 0.07958415895700455, "rewards/rejected": -0.1493593007326126, "sft_loss": 0.6977513432502747, "step": 3055 }, { "epoch": 0.24, "grad_norm": 62.607574462890625, "learning_rate": 8.729016676735179e-06, "logits/chosen": -1.392884373664856, "logits/rejected": -1.0296696424484253, "logps/chosen": -0.9279215931892395, "logps/rejected": -1.295037031173706, "loss": 0.9826, "odds_ratio_loss": 0.5466240644454956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09279216080904007, "rewards/margins": 0.03671155124902725, "rewards/rejected": -0.12950369715690613, "sft_loss": 0.9279215931892395, "step": 3060 }, { "epoch": 0.24, "grad_norm": 25.62282943725586, "learning_rate": 8.724911884523537e-06, "logits/chosen": -1.4473998546600342, "logits/rejected": -0.888154149055481, "logps/chosen": -1.002514123916626, "logps/rejected": -1.8997751474380493, "loss": 1.0388, "odds_ratio_loss": 0.3630914092063904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10025142133235931, "rewards/margins": 0.08972609043121338, "rewards/rejected": -0.1899775117635727, "sft_loss": 1.002514123916626, "step": 3065 }, { "epoch": 0.24, "grad_norm": 28.85422706604004, "learning_rate": 8.720801443010089e-06, "logits/chosen": -1.3426685333251953, "logits/rejected": -0.944126307964325, "logps/chosen": -1.0599967241287231, "logps/rejected": -1.6001322269439697, "loss": 1.0998, "odds_ratio_loss": 0.3978647291660309, "rewards/accuracies": 1.0, "rewards/chosen": -0.1059996709227562, "rewards/margins": 0.054013561457395554, "rewards/rejected": -0.16001322865486145, "sft_loss": 1.0599967241287231, "step": 3070 }, { "epoch": 0.24, "grad_norm": 10.21401309967041, "learning_rate": 8.71668535842884e-06, "logits/chosen": -1.4288160800933838, "logits/rejected": -1.0084912776947021, "logps/chosen": -1.1480839252471924, "logps/rejected": -1.619964838027954, "loss": 1.1981, "odds_ratio_loss": 0.4998813569545746, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11480840295553207, "rewards/margins": 0.047188084572553635, "rewards/rejected": -0.1619964838027954, "sft_loss": 1.1480839252471924, "step": 3075 }, { "epoch": 0.24, "grad_norm": 7.576767921447754, "learning_rate": 8.712563637022357e-06, "logits/chosen": -1.3043601512908936, "logits/rejected": -1.127294898033142, "logps/chosen": -0.9208580851554871, "logps/rejected": -7.681375026702881, "loss": 0.9638, "odds_ratio_loss": 0.42907315492630005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0920858085155487, "rewards/margins": 0.6760517358779907, "rewards/rejected": -0.7681375741958618, "sft_loss": 0.9208580851554871, "step": 3080 }, { "epoch": 0.24, "grad_norm": 6.175205230712891, "learning_rate": 8.708436285041755e-06, "logits/chosen": -1.278227686882019, "logits/rejected": -0.896086573600769, "logps/chosen": -0.9899203181266785, "logps/rejected": -1.603487253189087, "loss": 1.0352, "odds_ratio_loss": 0.45269575715065, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09899203479290009, "rewards/margins": 0.06135668605566025, "rewards/rejected": -0.16034871339797974, "sft_loss": 0.9899203181266785, "step": 3085 }, { "epoch": 0.24, "grad_norm": 9.487650871276855, "learning_rate": 8.704303308746684e-06, "logits/chosen": -1.3824490308761597, "logits/rejected": -0.6026290655136108, "logps/chosen": -1.0155322551727295, "logps/rejected": -1.7905362844467163, "loss": 1.0567, "odds_ratio_loss": 0.4116109311580658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10155323892831802, "rewards/margins": 0.07750038802623749, "rewards/rejected": -0.1790536344051361, "sft_loss": 1.0155322551727295, "step": 3090 }, { "epoch": 0.24, "grad_norm": 235.64552307128906, "learning_rate": 8.700164714405328e-06, "logits/chosen": -1.2964307069778442, "logits/rejected": -0.9890910983085632, "logps/chosen": -1.991641640663147, "logps/rejected": -2.6005265712738037, "loss": 2.0885, "odds_ratio_loss": 0.968436062335968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19916416704654694, "rewards/margins": 0.06088850647211075, "rewards/rejected": -0.2600526511669159, "sft_loss": 1.991641640663147, "step": 3095 }, { "epoch": 0.24, "grad_norm": 7.392255783081055, "learning_rate": 8.696020508294391e-06, "logits/chosen": -1.440700888633728, "logits/rejected": -0.8366183042526245, "logps/chosen": -0.9562617540359497, "logps/rejected": -1.2703158855438232, "loss": 1.0096, "odds_ratio_loss": 0.5336211919784546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09562616795301437, "rewards/margins": 0.03140542656183243, "rewards/rejected": -0.1270315945148468, "sft_loss": 0.9562617540359497, "step": 3100 }, { "epoch": 0.24, "grad_norm": 9.974418640136719, "learning_rate": 8.69187069669909e-06, "logits/chosen": -1.4482066631317139, "logits/rejected": -1.3463423252105713, "logps/chosen": -1.2112513780593872, "logps/rejected": -1.9238742589950562, "loss": 1.2708, "odds_ratio_loss": 0.595005989074707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12112513929605484, "rewards/margins": 0.07126231491565704, "rewards/rejected": -0.1923874318599701, "sft_loss": 1.2112513780593872, "step": 3105 }, { "epoch": 0.24, "grad_norm": 20.881736755371094, "learning_rate": 8.687715285913138e-06, "logits/chosen": -1.4098975658416748, "logits/rejected": -0.9576647877693176, "logps/chosen": -0.7991948127746582, "logps/rejected": -1.363541841506958, "loss": 0.8475, "odds_ratio_loss": 0.4834299683570862, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0799194872379303, "rewards/margins": 0.05643470212817192, "rewards/rejected": -0.13635417819023132, "sft_loss": 0.7991948127746582, "step": 3110 }, { "epoch": 0.24, "grad_norm": 23.569698333740234, "learning_rate": 8.683554282238746e-06, "logits/chosen": -1.1757612228393555, "logits/rejected": -1.2875800132751465, "logps/chosen": -1.008616328239441, "logps/rejected": -1.3941243886947632, "loss": 1.0608, "odds_ratio_loss": 0.5214596390724182, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10086163133382797, "rewards/margins": 0.03855080530047417, "rewards/rejected": -0.13941244781017303, "sft_loss": 1.008616328239441, "step": 3115 }, { "epoch": 0.24, "grad_norm": 10.971246719360352, "learning_rate": 8.6793876919866e-06, "logits/chosen": -1.4144350290298462, "logits/rejected": -1.1336814165115356, "logps/chosen": -1.0561202764511108, "logps/rejected": -8.628296852111816, "loss": 1.1081, "odds_ratio_loss": 0.5193870663642883, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10561203956604004, "rewards/margins": 0.7572176456451416, "rewards/rejected": -0.8628296852111816, "sft_loss": 1.0561202764511108, "step": 3120 }, { "epoch": 0.24, "grad_norm": 9.794310569763184, "learning_rate": 8.675215521475868e-06, "logits/chosen": -1.2344005107879639, "logits/rejected": -1.2249268293380737, "logps/chosen": -0.9627717733383179, "logps/rejected": -1.757939100265503, "loss": 0.9977, "odds_ratio_loss": 0.34921079874038696, "rewards/accuracies": 1.0, "rewards/chosen": -0.09627718478441238, "rewards/margins": 0.07951673120260239, "rewards/rejected": -0.17579391598701477, "sft_loss": 0.9627717733383179, "step": 3125 }, { "epoch": 0.24, "grad_norm": 6.747648239135742, "learning_rate": 8.671037777034173e-06, "logits/chosen": -1.3628339767456055, "logits/rejected": -1.1204272508621216, "logps/chosen": -1.2489184141159058, "logps/rejected": -6.822798728942871, "loss": 1.2942, "odds_ratio_loss": 0.4523259997367859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12489183992147446, "rewards/margins": 0.5573880076408386, "rewards/rejected": -0.6822798848152161, "sft_loss": 1.2489184141159058, "step": 3130 }, { "epoch": 0.24, "grad_norm": 16.021392822265625, "learning_rate": 8.666854464997596e-06, "logits/chosen": -1.3786375522613525, "logits/rejected": -0.7975913286209106, "logps/chosen": -0.9592100381851196, "logps/rejected": -2.387404203414917, "loss": 1.0103, "odds_ratio_loss": 0.5111249685287476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09592099487781525, "rewards/margins": 0.14281943440437317, "rewards/rejected": -0.2387404441833496, "sft_loss": 0.9592100381851196, "step": 3135 }, { "epoch": 0.24, "grad_norm": 24.078792572021484, "learning_rate": 8.662665591710661e-06, "logits/chosen": -1.2728703022003174, "logits/rejected": -0.894081711769104, "logps/chosen": -0.9345654249191284, "logps/rejected": -2.127600908279419, "loss": 0.9621, "odds_ratio_loss": 0.27542421221733093, "rewards/accuracies": 1.0, "rewards/chosen": -0.09345654398202896, "rewards/margins": 0.11930353939533234, "rewards/rejected": -0.2127600908279419, "sft_loss": 0.9345654249191284, "step": 3140 }, { "epoch": 0.24, "grad_norm": 208.22991943359375, "learning_rate": 8.658471163526327e-06, "logits/chosen": -1.2844762802124023, "logits/rejected": -1.1959311962127686, "logps/chosen": -1.1571857929229736, "logps/rejected": -1.600608468055725, "loss": 1.2147, "odds_ratio_loss": 0.5752911567687988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11571858078241348, "rewards/margins": 0.0443422757089138, "rewards/rejected": -0.160060852766037, "sft_loss": 1.1571857929229736, "step": 3145 }, { "epoch": 0.25, "grad_norm": 9.317758560180664, "learning_rate": 8.654271186805974e-06, "logits/chosen": -1.3937506675720215, "logits/rejected": -0.9512116312980652, "logps/chosen": -0.7833994626998901, "logps/rejected": -1.622886300086975, "loss": 0.817, "odds_ratio_loss": 0.33581018447875977, "rewards/accuracies": 1.0, "rewards/chosen": -0.07833994925022125, "rewards/margins": 0.08394867926836014, "rewards/rejected": -0.1622886210680008, "sft_loss": 0.7833994626998901, "step": 3150 }, { "epoch": 0.25, "grad_norm": 10.393624305725098, "learning_rate": 8.650065667919402e-06, "logits/chosen": -1.2336599826812744, "logits/rejected": -1.0274940729141235, "logps/chosen": -0.948900580406189, "logps/rejected": -1.548330545425415, "loss": 0.9981, "odds_ratio_loss": 0.49193769693374634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09489007294178009, "rewards/margins": 0.05994298309087753, "rewards/rejected": -0.15483304858207703, "sft_loss": 0.948900580406189, "step": 3155 }, { "epoch": 0.25, "grad_norm": 5.0471272468566895, "learning_rate": 8.645854613244817e-06, "logits/chosen": -1.374237298965454, "logits/rejected": -0.5605775117874146, "logps/chosen": -0.8872843980789185, "logps/rejected": -1.1447935104370117, "loss": 0.9446, "odds_ratio_loss": 0.5729075074195862, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08872843533754349, "rewards/margins": 0.025750914588570595, "rewards/rejected": -0.11447934806346893, "sft_loss": 0.8872843980789185, "step": 3160 }, { "epoch": 0.25, "grad_norm": 11.45893669128418, "learning_rate": 8.641638029168812e-06, "logits/chosen": -1.2924164533615112, "logits/rejected": -1.0458002090454102, "logps/chosen": -0.9896078109741211, "logps/rejected": -1.7145917415618896, "loss": 1.0336, "odds_ratio_loss": 0.43977856636047363, "rewards/accuracies": 1.0, "rewards/chosen": -0.09896077960729599, "rewards/margins": 0.0724983960390091, "rewards/rejected": -0.1714591681957245, "sft_loss": 0.9896078109741211, "step": 3165 }, { "epoch": 0.25, "grad_norm": 5.857996463775635, "learning_rate": 8.637415922086377e-06, "logits/chosen": -1.2714940309524536, "logits/rejected": -0.8355283737182617, "logps/chosen": -1.2923920154571533, "logps/rejected": -2.2515740394592285, "loss": 1.3668, "odds_ratio_loss": 0.7443500757217407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12923920154571533, "rewards/margins": 0.0959181934595108, "rewards/rejected": -0.22515738010406494, "sft_loss": 1.2923920154571533, "step": 3170 }, { "epoch": 0.25, "grad_norm": 8.047304153442383, "learning_rate": 8.633188298400872e-06, "logits/chosen": -1.4827300310134888, "logits/rejected": -1.222534418106079, "logps/chosen": -0.8443825840950012, "logps/rejected": -1.3143736124038696, "loss": 0.911, "odds_ratio_loss": 0.6657195687294006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0844382643699646, "rewards/margins": 0.04699909687042236, "rewards/rejected": -0.13143734633922577, "sft_loss": 0.8443825840950012, "step": 3175 }, { "epoch": 0.25, "grad_norm": 6.139329433441162, "learning_rate": 8.628955164524024e-06, "logits/chosen": -1.412903904914856, "logits/rejected": -0.6597784757614136, "logps/chosen": -1.1135563850402832, "logps/rejected": -1.4750511646270752, "loss": 1.1709, "odds_ratio_loss": 0.5732403993606567, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11135563999414444, "rewards/margins": 0.03614946827292442, "rewards/rejected": -0.14750510454177856, "sft_loss": 1.1135563850402832, "step": 3180 }, { "epoch": 0.25, "grad_norm": 7.063668727874756, "learning_rate": 8.62471652687592e-06, "logits/chosen": -1.3726282119750977, "logits/rejected": -0.7232956886291504, "logps/chosen": -0.9939814805984497, "logps/rejected": -1.3079441785812378, "loss": 1.0462, "odds_ratio_loss": 0.5219636559486389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09939815104007721, "rewards/margins": 0.03139626979827881, "rewards/rejected": -0.1307944357395172, "sft_loss": 0.9939814805984497, "step": 3185 }, { "epoch": 0.25, "grad_norm": 22.926136016845703, "learning_rate": 8.62047239188499e-06, "logits/chosen": -1.2752636671066284, "logits/rejected": -0.9812234044075012, "logps/chosen": -0.9917522668838501, "logps/rejected": -1.4488723278045654, "loss": 1.0384, "odds_ratio_loss": 0.46636566519737244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09917521476745605, "rewards/margins": 0.04571203142404556, "rewards/rejected": -0.14488725364208221, "sft_loss": 0.9917522668838501, "step": 3190 }, { "epoch": 0.25, "grad_norm": 9.83376693725586, "learning_rate": 8.616222765988006e-06, "logits/chosen": -1.2776237726211548, "logits/rejected": -1.3784388303756714, "logps/chosen": -0.9015542268753052, "logps/rejected": -5.178496360778809, "loss": 0.9514, "odds_ratio_loss": 0.498735249042511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09015541523694992, "rewards/margins": 0.427694171667099, "rewards/rejected": -0.5178496241569519, "sft_loss": 0.9015542268753052, "step": 3195 }, { "epoch": 0.25, "grad_norm": 5.757855415344238, "learning_rate": 8.611967655630062e-06, "logits/chosen": -1.3458263874053955, "logits/rejected": -0.8651115298271179, "logps/chosen": -1.189414381980896, "logps/rejected": -9.91465950012207, "loss": 1.2112, "odds_ratio_loss": 0.21760015189647675, "rewards/accuracies": 1.0, "rewards/chosen": -0.11894144117832184, "rewards/margins": 0.872524619102478, "rewards/rejected": -0.9914659261703491, "sft_loss": 1.189414381980896, "step": 3200 }, { "epoch": 0.25, "grad_norm": 8.271770477294922, "learning_rate": 8.607707067264577e-06, "logits/chosen": -1.2520349025726318, "logits/rejected": -0.9718472361564636, "logps/chosen": -1.1497652530670166, "logps/rejected": -1.762926459312439, "loss": 1.212, "odds_ratio_loss": 0.622275710105896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11497652530670166, "rewards/margins": 0.06131613999605179, "rewards/rejected": -0.17629265785217285, "sft_loss": 1.1497652530670166, "step": 3205 }, { "epoch": 0.25, "grad_norm": 29.484317779541016, "learning_rate": 8.603441007353271e-06, "logits/chosen": -1.3252924680709839, "logits/rejected": -1.2272132635116577, "logps/chosen": -1.0914915800094604, "logps/rejected": -1.3656795024871826, "loss": 1.1633, "odds_ratio_loss": 0.7178690433502197, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10914917290210724, "rewards/margins": 0.027418773621320724, "rewards/rejected": -0.13656795024871826, "sft_loss": 1.0914915800094604, "step": 3210 }, { "epoch": 0.25, "grad_norm": 10.599020957946777, "learning_rate": 8.599169482366167e-06, "logits/chosen": -1.3540681600570679, "logits/rejected": -1.0985815525054932, "logps/chosen": -0.8688950538635254, "logps/rejected": -1.479015588760376, "loss": 0.9184, "odds_ratio_loss": 0.495300829410553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08688951283693314, "rewards/margins": 0.06101206690073013, "rewards/rejected": -0.14790156483650208, "sft_loss": 0.8688950538635254, "step": 3215 }, { "epoch": 0.25, "grad_norm": 13.34872817993164, "learning_rate": 8.594892498781574e-06, "logits/chosen": -1.3374733924865723, "logits/rejected": -0.5042542815208435, "logps/chosen": -1.165950059890747, "logps/rejected": -12.713995933532715, "loss": 1.1996, "odds_ratio_loss": 0.33608299493789673, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11659500747919083, "rewards/margins": 1.1548045873641968, "rewards/rejected": -1.271399736404419, "sft_loss": 1.165950059890747, "step": 3220 }, { "epoch": 0.25, "grad_norm": 35.187660217285156, "learning_rate": 8.590610063086082e-06, "logits/chosen": -1.0710675716400146, "logits/rejected": -0.9008657336235046, "logps/chosen": -1.040565013885498, "logps/rejected": -1.2980402708053589, "loss": 1.0972, "odds_ratio_loss": 0.5668312907218933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10405649244785309, "rewards/margins": 0.025747528299689293, "rewards/rejected": -0.12980404496192932, "sft_loss": 1.040565013885498, "step": 3225 }, { "epoch": 0.25, "grad_norm": 5.274808883666992, "learning_rate": 8.586322181774547e-06, "logits/chosen": -1.3774149417877197, "logits/rejected": -0.7884630560874939, "logps/chosen": -0.9635206460952759, "logps/rejected": -12.23045825958252, "loss": 0.9828, "odds_ratio_loss": 0.1926787793636322, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09635205566883087, "rewards/margins": 1.1266937255859375, "rewards/rejected": -1.223045825958252, "sft_loss": 0.9635206460952759, "step": 3230 }, { "epoch": 0.25, "grad_norm": 11.046777725219727, "learning_rate": 8.582028861350086e-06, "logits/chosen": -1.2464903593063354, "logits/rejected": -1.1636230945587158, "logps/chosen": -0.8882688283920288, "logps/rejected": -1.5485460758209229, "loss": 0.926, "odds_ratio_loss": 0.3775942623615265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08882687985897064, "rewards/margins": 0.06602771580219269, "rewards/rejected": -0.15485459566116333, "sft_loss": 0.8882688283920288, "step": 3235 }, { "epoch": 0.25, "grad_norm": 14.48009204864502, "learning_rate": 8.577730108324067e-06, "logits/chosen": -1.3417062759399414, "logits/rejected": -0.6446546316146851, "logps/chosen": -1.0239213705062866, "logps/rejected": -2.4988975524902344, "loss": 1.0451, "odds_ratio_loss": 0.21227788925170898, "rewards/accuracies": 1.0, "rewards/chosen": -0.10239215195178986, "rewards/margins": 0.14749760925769806, "rewards/rejected": -0.24988976120948792, "sft_loss": 1.0239213705062866, "step": 3240 }, { "epoch": 0.25, "grad_norm": 14.136868476867676, "learning_rate": 8.57342592921609e-06, "logits/chosen": -1.2961242198944092, "logits/rejected": -0.7371039390563965, "logps/chosen": -1.0956108570098877, "logps/rejected": -1.7069292068481445, "loss": 1.1371, "odds_ratio_loss": 0.41481003165245056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10956110060214996, "rewards/margins": 0.061131834983825684, "rewards/rejected": -0.17069292068481445, "sft_loss": 1.0956108570098877, "step": 3245 }, { "epoch": 0.25, "grad_norm": 13.239011764526367, "learning_rate": 8.569116330553992e-06, "logits/chosen": -1.3907157182693481, "logits/rejected": -1.0792735815048218, "logps/chosen": -1.1127643585205078, "logps/rejected": -2.486975908279419, "loss": 1.1428, "odds_ratio_loss": 0.30066484212875366, "rewards/accuracies": 1.0, "rewards/chosen": -0.11127644777297974, "rewards/margins": 0.1374211311340332, "rewards/rejected": -0.24869759380817413, "sft_loss": 1.1127643585205078, "step": 3250 }, { "epoch": 0.25, "grad_norm": 61.88528823852539, "learning_rate": 8.564801318873826e-06, "logits/chosen": -1.3922154903411865, "logits/rejected": -1.0898916721343994, "logps/chosen": -1.3009440898895264, "logps/rejected": -2.3645083904266357, "loss": 1.3433, "odds_ratio_loss": 0.4238888621330261, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13009440898895264, "rewards/margins": 0.1063564270734787, "rewards/rejected": -0.23645083606243134, "sft_loss": 1.3009440898895264, "step": 3255 }, { "epoch": 0.25, "grad_norm": 15.953457832336426, "learning_rate": 8.560480900719855e-06, "logits/chosen": -1.2886173725128174, "logits/rejected": -0.7558988928794861, "logps/chosen": -1.145691990852356, "logps/rejected": -1.2577075958251953, "loss": 1.2173, "odds_ratio_loss": 0.7161797881126404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11456920206546783, "rewards/margins": 0.011201570741832256, "rewards/rejected": -0.12577076256275177, "sft_loss": 1.145691990852356, "step": 3260 }, { "epoch": 0.25, "grad_norm": 24.60605812072754, "learning_rate": 8.556155082644542e-06, "logits/chosen": -1.0742292404174805, "logits/rejected": -0.9177080988883972, "logps/chosen": -0.989599883556366, "logps/rejected": -8.721120834350586, "loss": 1.0172, "odds_ratio_loss": 0.2762053608894348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09895999729633331, "rewards/margins": 0.7731519937515259, "rewards/rejected": -0.8721119165420532, "sft_loss": 0.989599883556366, "step": 3265 }, { "epoch": 0.25, "grad_norm": 17.272823333740234, "learning_rate": 8.55182387120854e-06, "logits/chosen": -1.202138900756836, "logits/rejected": -1.0315873622894287, "logps/chosen": -0.9821723699569702, "logps/rejected": -1.4622547626495361, "loss": 1.0229, "odds_ratio_loss": 0.407528817653656, "rewards/accuracies": 1.0, "rewards/chosen": -0.09821723401546478, "rewards/margins": 0.04800824820995331, "rewards/rejected": -0.1462254822254181, "sft_loss": 0.9821723699569702, "step": 3270 }, { "epoch": 0.25, "grad_norm": 15.39669418334961, "learning_rate": 8.547487272980679e-06, "logits/chosen": -1.3310105800628662, "logits/rejected": -0.7271682620048523, "logps/chosen": -1.3114240169525146, "logps/rejected": -1.6460965871810913, "loss": 1.382, "odds_ratio_loss": 0.7060557007789612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13114240765571594, "rewards/margins": 0.03346724808216095, "rewards/rejected": -0.1646096557378769, "sft_loss": 1.3114240169525146, "step": 3275 }, { "epoch": 0.26, "grad_norm": 21.61895179748535, "learning_rate": 8.543145294537963e-06, "logits/chosen": -1.2018988132476807, "logits/rejected": -1.2045353651046753, "logps/chosen": -1.3924777507781982, "logps/rejected": -2.1691339015960693, "loss": 1.4376, "odds_ratio_loss": 0.4510256350040436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13924778997898102, "rewards/margins": 0.07766561955213547, "rewards/rejected": -0.21691341698169708, "sft_loss": 1.3924777507781982, "step": 3280 }, { "epoch": 0.26, "grad_norm": 72.43243408203125, "learning_rate": 8.538797942465551e-06, "logits/chosen": -1.2349824905395508, "logits/rejected": -0.9436966776847839, "logps/chosen": -1.0088491439819336, "logps/rejected": -6.553753852844238, "loss": 1.0257, "odds_ratio_loss": 0.16876181960105896, "rewards/accuracies": 1.0, "rewards/chosen": -0.10088489949703217, "rewards/margins": 0.5544905066490173, "rewards/rejected": -0.6553754210472107, "sft_loss": 1.0088491439819336, "step": 3285 }, { "epoch": 0.26, "grad_norm": 4.700754165649414, "learning_rate": 8.534445223356756e-06, "logits/chosen": -1.1828210353851318, "logits/rejected": -0.9301977157592773, "logps/chosen": -0.8841454386711121, "logps/rejected": -1.6752599477767944, "loss": 0.9198, "odds_ratio_loss": 0.3564664423465729, "rewards/accuracies": 1.0, "rewards/chosen": -0.08841454237699509, "rewards/margins": 0.07911147177219391, "rewards/rejected": -0.1675260066986084, "sft_loss": 0.8841454386711121, "step": 3290 }, { "epoch": 0.26, "grad_norm": 5.469188690185547, "learning_rate": 8.53008714381303e-06, "logits/chosen": -1.4232442378997803, "logits/rejected": -0.569993793964386, "logps/chosen": -0.8220561146736145, "logps/rejected": -2.83996844291687, "loss": 0.8424, "odds_ratio_loss": 0.20346903800964355, "rewards/accuracies": 1.0, "rewards/chosen": -0.08220561593770981, "rewards/margins": 0.20179125666618347, "rewards/rejected": -0.2839968800544739, "sft_loss": 0.8220561146736145, "step": 3295 }, { "epoch": 0.26, "grad_norm": 20.554473876953125, "learning_rate": 8.525723710443953e-06, "logits/chosen": -1.4156343936920166, "logits/rejected": -1.068035364151001, "logps/chosen": -1.096895456314087, "logps/rejected": -2.0811915397644043, "loss": 1.1371, "odds_ratio_loss": 0.40177327394485474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10968954861164093, "rewards/margins": 0.0984296202659607, "rewards/rejected": -0.20811918377876282, "sft_loss": 1.096895456314087, "step": 3300 }, { "epoch": 0.26, "grad_norm": 461.81805419921875, "learning_rate": 8.521354929867227e-06, "logits/chosen": -1.2747472524642944, "logits/rejected": -1.0539219379425049, "logps/chosen": -0.9387380480766296, "logps/rejected": -8.72998046875, "loss": 0.9679, "odds_ratio_loss": 0.2919756770133972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09387381374835968, "rewards/margins": 0.7791242599487305, "rewards/rejected": -0.872998058795929, "sft_loss": 0.9387380480766296, "step": 3305 }, { "epoch": 0.26, "grad_norm": 13.743693351745605, "learning_rate": 8.516980808708659e-06, "logits/chosen": -1.359903335571289, "logits/rejected": -1.051998496055603, "logps/chosen": -1.1269054412841797, "logps/rejected": -9.247424125671387, "loss": 1.1874, "odds_ratio_loss": 0.6050975918769836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11269054561853409, "rewards/margins": 0.8120518922805786, "rewards/rejected": -0.9247424006462097, "sft_loss": 1.1269054412841797, "step": 3310 }, { "epoch": 0.26, "grad_norm": 9.05522632598877, "learning_rate": 8.512601353602164e-06, "logits/chosen": -1.4644567966461182, "logits/rejected": -0.9861732721328735, "logps/chosen": -1.1417715549468994, "logps/rejected": -1.4305145740509033, "loss": 1.1992, "odds_ratio_loss": 0.574771523475647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1141771674156189, "rewards/margins": 0.028874289244413376, "rewards/rejected": -0.14305146038532257, "sft_loss": 1.1417715549468994, "step": 3315 }, { "epoch": 0.26, "grad_norm": 7.766719818115234, "learning_rate": 8.508216571189737e-06, "logits/chosen": -1.3620727062225342, "logits/rejected": -0.8981539607048035, "logps/chosen": -0.9097458124160767, "logps/rejected": -5.282686710357666, "loss": 0.9658, "odds_ratio_loss": 0.5603520274162292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0909745842218399, "rewards/margins": 0.437294065952301, "rewards/rejected": -0.5282686948776245, "sft_loss": 0.9097458124160767, "step": 3320 }, { "epoch": 0.26, "grad_norm": 45.20925521850586, "learning_rate": 8.50382646812146e-06, "logits/chosen": -1.3043216466903687, "logits/rejected": -1.3080909252166748, "logps/chosen": -0.7670945525169373, "logps/rejected": -1.146458625793457, "loss": 0.8246, "odds_ratio_loss": 0.5751861929893494, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07670946419239044, "rewards/margins": 0.03793640434741974, "rewards/rejected": -0.11464586108922958, "sft_loss": 0.7670945525169373, "step": 3325 }, { "epoch": 0.26, "grad_norm": 20.774234771728516, "learning_rate": 8.49943105105548e-06, "logits/chosen": -1.447458267211914, "logits/rejected": -0.8410998582839966, "logps/chosen": -0.9892905354499817, "logps/rejected": -2.2154524326324463, "loss": 1.0132, "odds_ratio_loss": 0.23863506317138672, "rewards/accuracies": 1.0, "rewards/chosen": -0.09892904758453369, "rewards/margins": 0.12261620908975601, "rewards/rejected": -0.2215452492237091, "sft_loss": 0.9892905354499817, "step": 3330 }, { "epoch": 0.26, "grad_norm": 7.589955806732178, "learning_rate": 8.495030326658007e-06, "logits/chosen": -1.45345139503479, "logits/rejected": -0.8291786313056946, "logps/chosen": -1.0652108192443848, "logps/rejected": -11.067428588867188, "loss": 1.1067, "odds_ratio_loss": 0.41494470834732056, "rewards/accuracies": 1.0, "rewards/chosen": -0.10652108490467072, "rewards/margins": 1.000221848487854, "rewards/rejected": -1.1067428588867188, "sft_loss": 1.0652108192443848, "step": 3335 }, { "epoch": 0.26, "grad_norm": 9.080081939697266, "learning_rate": 8.490624301603296e-06, "logits/chosen": -1.0828771591186523, "logits/rejected": -1.0897125005722046, "logps/chosen": -0.8774884343147278, "logps/rejected": -1.6583712100982666, "loss": 0.9108, "odds_ratio_loss": 0.33312711119651794, "rewards/accuracies": 1.0, "rewards/chosen": -0.08774884045124054, "rewards/margins": 0.07808827608823776, "rewards/rejected": -0.1658371239900589, "sft_loss": 0.8774884343147278, "step": 3340 }, { "epoch": 0.26, "grad_norm": 15.120588302612305, "learning_rate": 8.486212982573648e-06, "logits/chosen": -1.0031670331954956, "logits/rejected": -0.9409330487251282, "logps/chosen": -1.1631263494491577, "logps/rejected": -2.286313533782959, "loss": 1.2246, "odds_ratio_loss": 0.6149234175682068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11631264537572861, "rewards/margins": 0.11231871694326401, "rewards/rejected": -0.22863134741783142, "sft_loss": 1.1631263494491577, "step": 3345 }, { "epoch": 0.26, "grad_norm": 7.824958801269531, "learning_rate": 8.481796376259382e-06, "logits/chosen": -1.2014691829681396, "logits/rejected": -0.8414332270622253, "logps/chosen": -1.1760876178741455, "logps/rejected": -1.5616389513015747, "loss": 1.2254, "odds_ratio_loss": 0.4928853511810303, "rewards/accuracies": 1.0, "rewards/chosen": -0.11760877072811127, "rewards/margins": 0.03855512663722038, "rewards/rejected": -0.15616390109062195, "sft_loss": 1.1760876178741455, "step": 3350 }, { "epoch": 0.26, "grad_norm": 8.375786781311035, "learning_rate": 8.477374489358845e-06, "logits/chosen": -1.5305713415145874, "logits/rejected": -1.3281760215759277, "logps/chosen": -1.110764503479004, "logps/rejected": -10.505553245544434, "loss": 1.2078, "odds_ratio_loss": 0.9702065587043762, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1110764592885971, "rewards/margins": 0.9394787549972534, "rewards/rejected": -1.0505553483963013, "sft_loss": 1.110764503479004, "step": 3355 }, { "epoch": 0.26, "grad_norm": 18.455425262451172, "learning_rate": 8.472947328578392e-06, "logits/chosen": -1.4160370826721191, "logits/rejected": -0.7809430956840515, "logps/chosen": -1.1917507648468018, "logps/rejected": -1.7635633945465088, "loss": 1.2442, "odds_ratio_loss": 0.5245550870895386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11917507648468018, "rewards/margins": 0.05718127638101578, "rewards/rejected": -0.17635634541511536, "sft_loss": 1.1917507648468018, "step": 3360 }, { "epoch": 0.26, "grad_norm": 9.328863143920898, "learning_rate": 8.46851490063237e-06, "logits/chosen": -1.3257181644439697, "logits/rejected": -1.2680017948150635, "logps/chosen": -1.1743090152740479, "logps/rejected": -1.757370948791504, "loss": 1.2341, "odds_ratio_loss": 0.5981367826461792, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11743088811635971, "rewards/margins": 0.05830620601773262, "rewards/rejected": -0.17573709785938263, "sft_loss": 1.1743090152740479, "step": 3365 }, { "epoch": 0.26, "grad_norm": 19.936983108520508, "learning_rate": 8.464077212243125e-06, "logits/chosen": -1.33879816532135, "logits/rejected": -1.112377405166626, "logps/chosen": -1.3328278064727783, "logps/rejected": -4.014039039611816, "loss": 1.3689, "odds_ratio_loss": 0.36111459136009216, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13328281044960022, "rewards/margins": 0.2681211233139038, "rewards/rejected": -0.40140390396118164, "sft_loss": 1.3328278064727783, "step": 3370 }, { "epoch": 0.26, "grad_norm": 69.97361755371094, "learning_rate": 8.459634270140968e-06, "logits/chosen": -1.3100478649139404, "logits/rejected": -0.6492315530776978, "logps/chosen": -1.3336145877838135, "logps/rejected": -4.281968116760254, "loss": 1.355, "odds_ratio_loss": 0.21349510550498962, "rewards/accuracies": 1.0, "rewards/chosen": -0.13336145877838135, "rewards/margins": 0.2948353886604309, "rewards/rejected": -0.42819681763648987, "sft_loss": 1.3336145877838135, "step": 3375 }, { "epoch": 0.26, "grad_norm": 11.706873893737793, "learning_rate": 8.45518608106419e-06, "logits/chosen": -1.2311418056488037, "logits/rejected": -0.7156813740730286, "logps/chosen": -1.0485591888427734, "logps/rejected": -2.773469924926758, "loss": 1.0833, "odds_ratio_loss": 0.3475884795188904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10485591739416122, "rewards/margins": 0.17249107360839844, "rewards/rejected": -0.27734702825546265, "sft_loss": 1.0485591888427734, "step": 3380 }, { "epoch": 0.26, "grad_norm": 95.40128326416016, "learning_rate": 8.450732651759033e-06, "logits/chosen": -1.3298120498657227, "logits/rejected": -1.1620497703552246, "logps/chosen": -0.8674956560134888, "logps/rejected": -1.8080475330352783, "loss": 0.899, "odds_ratio_loss": 0.31545546650886536, "rewards/accuracies": 1.0, "rewards/chosen": -0.08674956858158112, "rewards/margins": 0.0940551608800888, "rewards/rejected": -0.1808047592639923, "sft_loss": 0.8674956560134888, "step": 3385 }, { "epoch": 0.26, "grad_norm": 20.017311096191406, "learning_rate": 8.446273988979686e-06, "logits/chosen": -1.4703214168548584, "logits/rejected": -1.171466588973999, "logps/chosen": -0.9976893663406372, "logps/rejected": -1.4772334098815918, "loss": 1.0456, "odds_ratio_loss": 0.4789748787879944, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09976892173290253, "rewards/margins": 0.047954410314559937, "rewards/rejected": -0.14772334694862366, "sft_loss": 0.9976893663406372, "step": 3390 }, { "epoch": 0.26, "grad_norm": 7.699336051940918, "learning_rate": 8.441810099488279e-06, "logits/chosen": -1.2824268341064453, "logits/rejected": -0.9811736941337585, "logps/chosen": -0.8096511960029602, "logps/rejected": -1.0340986251831055, "loss": 0.8644, "odds_ratio_loss": 0.5473746657371521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08096511662006378, "rewards/margins": 0.022444751113653183, "rewards/rejected": -0.10340987145900726, "sft_loss": 0.8096511960029602, "step": 3395 }, { "epoch": 0.26, "grad_norm": 13.585868835449219, "learning_rate": 8.437340990054868e-06, "logits/chosen": -1.4865134954452515, "logits/rejected": -1.2178223133087158, "logps/chosen": -0.6663314700126648, "logps/rejected": -1.67364501953125, "loss": 0.6956, "odds_ratio_loss": 0.292599618434906, "rewards/accuracies": 1.0, "rewards/chosen": -0.06663314998149872, "rewards/margins": 0.10073135793209076, "rewards/rejected": -0.16736450791358948, "sft_loss": 0.6663314700126648, "step": 3400 }, { "epoch": 0.26, "grad_norm": 9.500743865966797, "learning_rate": 8.432866667457423e-06, "logits/chosen": -1.4319857358932495, "logits/rejected": -0.9485819935798645, "logps/chosen": -0.9060953855514526, "logps/rejected": -7.988076686859131, "loss": 0.955, "odds_ratio_loss": 0.48911604285240173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09060955047607422, "rewards/margins": 0.7081981897354126, "rewards/rejected": -0.7988077402114868, "sft_loss": 0.9060953855514526, "step": 3405 }, { "epoch": 0.27, "grad_norm": 6.2479753494262695, "learning_rate": 8.428387138481825e-06, "logits/chosen": -1.4581882953643799, "logits/rejected": -0.7471655607223511, "logps/chosen": -1.04079270362854, "logps/rejected": -1.4583923816680908, "loss": 1.0885, "odds_ratio_loss": 0.4766160845756531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10407926887273788, "rewards/margins": 0.04175996407866478, "rewards/rejected": -0.14583922922611237, "sft_loss": 1.04079270362854, "step": 3410 }, { "epoch": 0.27, "grad_norm": 19.998077392578125, "learning_rate": 8.423902409921842e-06, "logits/chosen": -1.1454991102218628, "logits/rejected": -0.9668058156967163, "logps/chosen": -1.0157272815704346, "logps/rejected": -1.1806867122650146, "loss": 1.0831, "odds_ratio_loss": 0.6740451455116272, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10157272964715958, "rewards/margins": 0.01649593934416771, "rewards/rejected": -0.11806867271661758, "sft_loss": 1.0157272815704346, "step": 3415 }, { "epoch": 0.27, "grad_norm": 253.7983856201172, "learning_rate": 8.419412488579142e-06, "logits/chosen": -1.4023807048797607, "logits/rejected": -1.1779649257659912, "logps/chosen": -0.9201675653457642, "logps/rejected": -2.4253060817718506, "loss": 0.9598, "odds_ratio_loss": 0.39633411169052124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09201676398515701, "rewards/margins": 0.15051385760307312, "rewards/rejected": -0.24253061413764954, "sft_loss": 0.9201675653457642, "step": 3420 }, { "epoch": 0.27, "grad_norm": 12.650860786437988, "learning_rate": 8.414917381263256e-06, "logits/chosen": -1.4328114986419678, "logits/rejected": -0.7475396394729614, "logps/chosen": -1.0361485481262207, "logps/rejected": -2.0335729122161865, "loss": 1.0612, "odds_ratio_loss": 0.2504461109638214, "rewards/accuracies": 1.0, "rewards/chosen": -0.10361485183238983, "rewards/margins": 0.09974244982004166, "rewards/rejected": -0.20335730910301208, "sft_loss": 1.0361485481262207, "step": 3425 }, { "epoch": 0.27, "grad_norm": 12.29416561126709, "learning_rate": 8.410417094791587e-06, "logits/chosen": -1.2249650955200195, "logits/rejected": -1.1467955112457275, "logps/chosen": -1.2544214725494385, "logps/rejected": -1.503396987915039, "loss": 1.3107, "odds_ratio_loss": 0.5625152587890625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12544213235378265, "rewards/margins": 0.024897579103708267, "rewards/rejected": -0.15033970773220062, "sft_loss": 1.2544214725494385, "step": 3430 }, { "epoch": 0.27, "grad_norm": 9.909943580627441, "learning_rate": 8.405911635989391e-06, "logits/chosen": -1.3390843868255615, "logits/rejected": -0.8556186556816101, "logps/chosen": -1.392810583114624, "logps/rejected": -0.9943065643310547, "loss": 1.5033, "odds_ratio_loss": 1.1048606634140015, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13928106427192688, "rewards/margins": -0.03985040634870529, "rewards/rejected": -0.09943065792322159, "sft_loss": 1.392810583114624, "step": 3435 }, { "epoch": 0.27, "grad_norm": 13.671736717224121, "learning_rate": 8.40140101168977e-06, "logits/chosen": -1.1585992574691772, "logits/rejected": -0.9479349851608276, "logps/chosen": -0.7876306176185608, "logps/rejected": -2.8406918048858643, "loss": 0.8062, "odds_ratio_loss": 0.18568609654903412, "rewards/accuracies": 1.0, "rewards/chosen": -0.07876305282115936, "rewards/margins": 0.20530612766742706, "rewards/rejected": -0.28406915068626404, "sft_loss": 0.7876306176185608, "step": 3440 }, { "epoch": 0.27, "grad_norm": 8.141471862792969, "learning_rate": 8.396885228733651e-06, "logits/chosen": -1.4388540983200073, "logits/rejected": -0.9490545392036438, "logps/chosen": -0.7759628891944885, "logps/rejected": -6.000257968902588, "loss": 0.8164, "odds_ratio_loss": 0.40440258383750916, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07759629189968109, "rewards/margins": 0.5224294662475586, "rewards/rejected": -0.6000257730484009, "sft_loss": 0.7759628891944885, "step": 3445 }, { "epoch": 0.27, "grad_norm": 7.394799709320068, "learning_rate": 8.392364293969802e-06, "logits/chosen": -1.2691454887390137, "logits/rejected": -0.830244243144989, "logps/chosen": -1.0301793813705444, "logps/rejected": -1.5118746757507324, "loss": 1.0849, "odds_ratio_loss": 0.547632098197937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10301794111728668, "rewards/margins": 0.04816952347755432, "rewards/rejected": -0.151187464594841, "sft_loss": 1.0301793813705444, "step": 3450 }, { "epoch": 0.27, "grad_norm": 10.756146430969238, "learning_rate": 8.387838214254787e-06, "logits/chosen": -1.463841438293457, "logits/rejected": -0.8923279047012329, "logps/chosen": -1.1739076375961304, "logps/rejected": -3.4276280403137207, "loss": 1.232, "odds_ratio_loss": 0.5810686945915222, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11739076673984528, "rewards/margins": 0.2253720462322235, "rewards/rejected": -0.34276282787323, "sft_loss": 1.1739076375961304, "step": 3455 }, { "epoch": 0.27, "grad_norm": 7.481341361999512, "learning_rate": 8.383306996452984e-06, "logits/chosen": -1.2822167873382568, "logits/rejected": -1.111701250076294, "logps/chosen": -0.847335934638977, "logps/rejected": -1.5403449535369873, "loss": 0.8853, "odds_ratio_loss": 0.3792897164821625, "rewards/accuracies": 1.0, "rewards/chosen": -0.08473359048366547, "rewards/margins": 0.06930090487003326, "rewards/rejected": -0.15403451025485992, "sft_loss": 0.847335934638977, "step": 3460 }, { "epoch": 0.27, "grad_norm": 59.56509017944336, "learning_rate": 8.378770647436558e-06, "logits/chosen": -1.377863883972168, "logits/rejected": -1.1741714477539062, "logps/chosen": -1.4446465969085693, "logps/rejected": -3.021998882293701, "loss": 1.4683, "odds_ratio_loss": 0.2365054190158844, "rewards/accuracies": 1.0, "rewards/chosen": -0.1444646567106247, "rewards/margins": 0.15773524343967438, "rewards/rejected": -0.3021999001502991, "sft_loss": 1.4446465969085693, "step": 3465 }, { "epoch": 0.27, "grad_norm": 7.632749080657959, "learning_rate": 8.374229174085462e-06, "logits/chosen": -1.4123557806015015, "logits/rejected": -1.2731513977050781, "logps/chosen": -1.1025629043579102, "logps/rejected": -1.815313696861267, "loss": 1.1558, "odds_ratio_loss": 0.5323792695999146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11025629192590714, "rewards/margins": 0.07127507776021957, "rewards/rejected": -0.1815313696861267, "sft_loss": 1.1025629043579102, "step": 3470 }, { "epoch": 0.27, "grad_norm": 6.778805255889893, "learning_rate": 8.369682583287414e-06, "logits/chosen": -1.2414586544036865, "logits/rejected": -0.7293938398361206, "logps/chosen": -0.9718669652938843, "logps/rejected": -3.2794570922851562, "loss": 1.0039, "odds_ratio_loss": 0.32015174627304077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09718669950962067, "rewards/margins": 0.23075899481773376, "rewards/rejected": -0.3279457092285156, "sft_loss": 0.9718669652938843, "step": 3475 }, { "epoch": 0.27, "grad_norm": 46.404876708984375, "learning_rate": 8.365130881937897e-06, "logits/chosen": -1.34323251247406, "logits/rejected": -1.000976800918579, "logps/chosen": -1.2657513618469238, "logps/rejected": -1.1252360343933105, "loss": 1.3463, "odds_ratio_loss": 0.8055798411369324, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.12657514214515686, "rewards/margins": -0.014051536098122597, "rewards/rejected": -0.11252360045909882, "sft_loss": 1.2657513618469238, "step": 3480 }, { "epoch": 0.27, "grad_norm": 14.280263900756836, "learning_rate": 8.360574076940143e-06, "logits/chosen": -1.4023492336273193, "logits/rejected": -1.4027321338653564, "logps/chosen": -1.1064332723617554, "logps/rejected": -2.0462806224823, "loss": 1.1417, "odds_ratio_loss": 0.35309427976608276, "rewards/accuracies": 1.0, "rewards/chosen": -0.11064332723617554, "rewards/margins": 0.09398476034402847, "rewards/rejected": -0.2046280801296234, "sft_loss": 1.1064332723617554, "step": 3485 }, { "epoch": 0.27, "grad_norm": 14.520868301391602, "learning_rate": 8.356012175205127e-06, "logits/chosen": -1.339352011680603, "logits/rejected": -0.829668402671814, "logps/chosen": -1.151982069015503, "logps/rejected": -2.6990561485290527, "loss": 1.1743, "odds_ratio_loss": 0.22352364659309387, "rewards/accuracies": 1.0, "rewards/chosen": -0.11519820988178253, "rewards/margins": 0.1547074317932129, "rewards/rejected": -0.26990562677383423, "sft_loss": 1.151982069015503, "step": 3490 }, { "epoch": 0.27, "grad_norm": 6.273999214172363, "learning_rate": 8.351445183651552e-06, "logits/chosen": -1.5008800029754639, "logits/rejected": -1.093679666519165, "logps/chosen": -0.9768564105033875, "logps/rejected": -5.830966472625732, "loss": 0.9964, "odds_ratio_loss": 0.19528701901435852, "rewards/accuracies": 1.0, "rewards/chosen": -0.09768564254045486, "rewards/margins": 0.48541101813316345, "rewards/rejected": -0.5830966234207153, "sft_loss": 0.9768564105033875, "step": 3495 }, { "epoch": 0.27, "grad_norm": 11.410566329956055, "learning_rate": 8.34687310920584e-06, "logits/chosen": -1.493222951889038, "logits/rejected": -1.0385332107543945, "logps/chosen": -1.2644648551940918, "logps/rejected": -2.5637965202331543, "loss": 1.2901, "odds_ratio_loss": 0.2563869059085846, "rewards/accuracies": 1.0, "rewards/chosen": -0.12644650042057037, "rewards/margins": 0.129933163523674, "rewards/rejected": -0.2563796639442444, "sft_loss": 1.2644648551940918, "step": 3500 }, { "epoch": 0.27, "grad_norm": 7.278417110443115, "learning_rate": 8.34229595880212e-06, "logits/chosen": -1.4679405689239502, "logits/rejected": -0.689369797706604, "logps/chosen": -1.1459770202636719, "logps/rejected": -2.5427379608154297, "loss": 1.1695, "odds_ratio_loss": 0.23483486473560333, "rewards/accuracies": 1.0, "rewards/chosen": -0.11459771543741226, "rewards/margins": 0.13967609405517578, "rewards/rejected": -0.25427383184432983, "sft_loss": 1.1459770202636719, "step": 3505 }, { "epoch": 0.27, "grad_norm": 9.49854564666748, "learning_rate": 8.337713739382224e-06, "logits/chosen": -1.3915735483169556, "logits/rejected": -1.2456294298171997, "logps/chosen": -1.202357530593872, "logps/rejected": -5.146681785583496, "loss": 1.2341, "odds_ratio_loss": 0.3172472417354584, "rewards/accuracies": 1.0, "rewards/chosen": -0.12023575603961945, "rewards/margins": 0.3944324851036072, "rewards/rejected": -0.5146682858467102, "sft_loss": 1.202357530593872, "step": 3510 }, { "epoch": 0.27, "grad_norm": 7.210588455200195, "learning_rate": 8.333126457895673e-06, "logits/chosen": -1.4028444290161133, "logits/rejected": -0.9360781908035278, "logps/chosen": -1.0781782865524292, "logps/rejected": -2.235292911529541, "loss": 1.1052, "odds_ratio_loss": 0.2698212265968323, "rewards/accuracies": 1.0, "rewards/chosen": -0.10781782865524292, "rewards/margins": 0.11571145057678223, "rewards/rejected": -0.22352926433086395, "sft_loss": 1.0781782865524292, "step": 3515 }, { "epoch": 0.27, "grad_norm": 5.604741096496582, "learning_rate": 8.328534121299654e-06, "logits/chosen": -1.2505098581314087, "logits/rejected": -0.5317830443382263, "logps/chosen": -0.8969619870185852, "logps/rejected": -2.316410779953003, "loss": 0.9218, "odds_ratio_loss": 0.2478959560394287, "rewards/accuracies": 1.0, "rewards/chosen": -0.08969619125127792, "rewards/margins": 0.14194490015506744, "rewards/rejected": -0.23164109885692596, "sft_loss": 0.8969619870185852, "step": 3520 }, { "epoch": 0.27, "grad_norm": 57.90606689453125, "learning_rate": 8.323936736559038e-06, "logits/chosen": -1.0537947416305542, "logits/rejected": -1.2932230234146118, "logps/chosen": -2.3485894203186035, "logps/rejected": -2.0282821655273438, "loss": 2.4812, "odds_ratio_loss": 1.3260843753814697, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.23485895991325378, "rewards/margins": -0.03203073889017105, "rewards/rejected": -0.20282821357250214, "sft_loss": 2.3485894203186035, "step": 3525 }, { "epoch": 0.27, "grad_norm": 6.8659539222717285, "learning_rate": 8.319334310646335e-06, "logits/chosen": -1.393730878829956, "logits/rejected": -0.7838658690452576, "logps/chosen": -1.2052452564239502, "logps/rejected": -1.4900472164154053, "loss": 1.27, "odds_ratio_loss": 0.6480141878128052, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.12052451074123383, "rewards/margins": 0.02848021313548088, "rewards/rejected": -0.149004727602005, "sft_loss": 1.2052452564239502, "step": 3530 }, { "epoch": 0.27, "grad_norm": 5.410351276397705, "learning_rate": 8.31472685054171e-06, "logits/chosen": -1.2084213495254517, "logits/rejected": -1.0660079717636108, "logps/chosen": -1.127166986465454, "logps/rejected": -2.948442220687866, "loss": 1.1443, "odds_ratio_loss": 0.17111781239509583, "rewards/accuracies": 1.0, "rewards/chosen": -0.11271669715642929, "rewards/margins": 0.18212752044200897, "rewards/rejected": -0.29484421014785767, "sft_loss": 1.127166986465454, "step": 3535 }, { "epoch": 0.28, "grad_norm": 313.609130859375, "learning_rate": 8.310114363232961e-06, "logits/chosen": -1.1509945392608643, "logits/rejected": -1.0163276195526123, "logps/chosen": -1.3601996898651123, "logps/rejected": -5.512236595153809, "loss": 1.3712, "odds_ratio_loss": 0.10965617001056671, "rewards/accuracies": 1.0, "rewards/chosen": -0.1360199749469757, "rewards/margins": 0.415203720331192, "rewards/rejected": -0.551223635673523, "sft_loss": 1.3601996898651123, "step": 3540 }, { "epoch": 0.28, "grad_norm": 43.61051940917969, "learning_rate": 8.305496855715515e-06, "logits/chosen": -1.3698691129684448, "logits/rejected": -1.2788420915603638, "logps/chosen": -0.764731764793396, "logps/rejected": -3.356771469116211, "loss": 0.779, "odds_ratio_loss": 0.14261171221733093, "rewards/accuracies": 1.0, "rewards/chosen": -0.0764731839299202, "rewards/margins": 0.2592040002346039, "rewards/rejected": -0.3356771767139435, "sft_loss": 0.764731764793396, "step": 3545 }, { "epoch": 0.28, "grad_norm": 24.839813232421875, "learning_rate": 8.300874334992404e-06, "logits/chosen": -1.3410618305206299, "logits/rejected": -1.0158249139785767, "logps/chosen": -0.9953804016113281, "logps/rejected": -3.262022018432617, "loss": 1.0184, "odds_ratio_loss": 0.2305566519498825, "rewards/accuracies": 1.0, "rewards/chosen": -0.09953804314136505, "rewards/margins": 0.22666415572166443, "rewards/rejected": -0.3262022137641907, "sft_loss": 0.9953804016113281, "step": 3550 }, { "epoch": 0.28, "grad_norm": 26.78093719482422, "learning_rate": 8.296246808074268e-06, "logits/chosen": -1.4354108572006226, "logits/rejected": -1.1755014657974243, "logps/chosen": -0.6938873529434204, "logps/rejected": -6.733367919921875, "loss": 0.7041, "odds_ratio_loss": 0.10260869562625885, "rewards/accuracies": 1.0, "rewards/chosen": -0.0693887323141098, "rewards/margins": 0.6039480566978455, "rewards/rejected": -0.6733368039131165, "sft_loss": 0.6938873529434204, "step": 3555 }, { "epoch": 0.28, "grad_norm": 7.194347381591797, "learning_rate": 8.291614281979339e-06, "logits/chosen": -1.406665563583374, "logits/rejected": -1.10782790184021, "logps/chosen": -1.9012788534164429, "logps/rejected": -1.8412885665893555, "loss": 2.0278, "odds_ratio_loss": 1.265174150466919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.190127894282341, "rewards/margins": -0.005999034736305475, "rewards/rejected": -0.18412885069847107, "sft_loss": 1.9012788534164429, "step": 3560 }, { "epoch": 0.28, "grad_norm": 28.549922943115234, "learning_rate": 8.286976763733433e-06, "logits/chosen": -1.2915570735931396, "logits/rejected": -0.8109747171401978, "logps/chosen": -1.170531153678894, "logps/rejected": -2.9936161041259766, "loss": 1.2096, "odds_ratio_loss": 0.39067792892456055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11705312877893448, "rewards/margins": 0.18230850994586945, "rewards/rejected": -0.2993616461753845, "sft_loss": 1.170531153678894, "step": 3565 }, { "epoch": 0.28, "grad_norm": 28.323139190673828, "learning_rate": 8.282334260369934e-06, "logits/chosen": -1.463327407836914, "logits/rejected": -1.1677974462509155, "logps/chosen": -1.2482119798660278, "logps/rejected": -4.510406017303467, "loss": 1.285, "odds_ratio_loss": 0.3678421378135681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12482120841741562, "rewards/margins": 0.32621946930885315, "rewards/rejected": -0.451040655374527, "sft_loss": 1.2482119798660278, "step": 3570 }, { "epoch": 0.28, "grad_norm": 43.18751907348633, "learning_rate": 8.277686778929786e-06, "logits/chosen": -1.5234771966934204, "logits/rejected": -1.2108821868896484, "logps/chosen": -1.2944618463516235, "logps/rejected": -4.193190097808838, "loss": 1.3536, "odds_ratio_loss": 0.5915259122848511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12944617867469788, "rewards/margins": 0.28987282514572144, "rewards/rejected": -0.4193190634250641, "sft_loss": 1.2944618463516235, "step": 3575 }, { "epoch": 0.28, "grad_norm": 26.376367568969727, "learning_rate": 8.273034326461489e-06, "logits/chosen": -1.3317598104476929, "logits/rejected": -0.8073261976242065, "logps/chosen": -1.051276445388794, "logps/rejected": -1.787710428237915, "loss": 1.0856, "odds_ratio_loss": 0.34296557307243347, "rewards/accuracies": 1.0, "rewards/chosen": -0.10512763261795044, "rewards/margins": 0.07364340126514435, "rewards/rejected": -0.1787710338830948, "sft_loss": 1.051276445388794, "step": 3580 }, { "epoch": 0.28, "grad_norm": 23.10160255432129, "learning_rate": 8.268376910021075e-06, "logits/chosen": -1.2411911487579346, "logits/rejected": -1.3177855014801025, "logps/chosen": -0.8805097341537476, "logps/rejected": -1.6542412042617798, "loss": 0.9218, "odds_ratio_loss": 0.41270628571510315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.088050976395607, "rewards/margins": 0.07737313210964203, "rewards/rejected": -0.16542412340641022, "sft_loss": 0.8805097341537476, "step": 3585 }, { "epoch": 0.28, "grad_norm": 171.03265380859375, "learning_rate": 8.263714536672105e-06, "logits/chosen": -1.3680837154388428, "logits/rejected": -0.7903397679328918, "logps/chosen": -1.1044056415557861, "logps/rejected": -3.1359059810638428, "loss": 1.1217, "odds_ratio_loss": 0.17297251522541046, "rewards/accuracies": 1.0, "rewards/chosen": -0.11044056713581085, "rewards/margins": 0.20315006375312805, "rewards/rejected": -0.3135906159877777, "sft_loss": 1.1044056415557861, "step": 3590 }, { "epoch": 0.28, "grad_norm": 25.135984420776367, "learning_rate": 8.259047213485664e-06, "logits/chosen": -1.1868219375610352, "logits/rejected": -1.4525445699691772, "logps/chosen": -1.0119667053222656, "logps/rejected": -1.5798547267913818, "loss": 1.0506, "odds_ratio_loss": 0.38638943433761597, "rewards/accuracies": 1.0, "rewards/chosen": -0.10119666904211044, "rewards/margins": 0.056788813322782516, "rewards/rejected": -0.15798547863960266, "sft_loss": 1.0119667053222656, "step": 3595 }, { "epoch": 0.28, "grad_norm": 7.753753185272217, "learning_rate": 8.25437494754034e-06, "logits/chosen": -1.278234839439392, "logits/rejected": -1.153738021850586, "logps/chosen": -0.9590956568717957, "logps/rejected": -1.9769725799560547, "loss": 0.9844, "odds_ratio_loss": 0.25261688232421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.09590956568717957, "rewards/margins": 0.10178768634796143, "rewards/rejected": -0.197697252035141, "sft_loss": 0.9590956568717957, "step": 3600 }, { "epoch": 0.28, "grad_norm": 9.523035049438477, "learning_rate": 8.249697745922216e-06, "logits/chosen": -1.2500884532928467, "logits/rejected": -1.2859524488449097, "logps/chosen": -1.1764044761657715, "logps/rejected": -3.144897937774658, "loss": 1.2349, "odds_ratio_loss": 0.5853737592697144, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11764045059680939, "rewards/margins": 0.19684937596321106, "rewards/rejected": -0.31448981165885925, "sft_loss": 1.1764044761657715, "step": 3605 }, { "epoch": 0.28, "grad_norm": 7.400860786437988, "learning_rate": 8.245015615724862e-06, "logits/chosen": -1.287461280822754, "logits/rejected": -0.8889628648757935, "logps/chosen": -1.0676617622375488, "logps/rejected": -1.123071551322937, "loss": 1.1391, "odds_ratio_loss": 0.7140272855758667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10676617920398712, "rewards/margins": 0.00554098654538393, "rewards/rejected": -0.11230716854333878, "sft_loss": 1.0676617622375488, "step": 3610 }, { "epoch": 0.28, "grad_norm": 15.867348670959473, "learning_rate": 8.240328564049326e-06, "logits/chosen": -1.1854329109191895, "logits/rejected": -0.8505358695983887, "logps/chosen": -1.1343297958374023, "logps/rejected": -1.3808737993240356, "loss": 1.1879, "odds_ratio_loss": 0.5355452299118042, "rewards/accuracies": 1.0, "rewards/chosen": -0.11343298107385635, "rewards/margins": 0.024654392153024673, "rewards/rejected": -0.13808736205101013, "sft_loss": 1.1343297958374023, "step": 3615 }, { "epoch": 0.28, "grad_norm": 11.455201148986816, "learning_rate": 8.235636598004112e-06, "logits/chosen": -1.4110326766967773, "logits/rejected": -1.1761510372161865, "logps/chosen": -1.0385137796401978, "logps/rejected": -1.8720951080322266, "loss": 1.0757, "odds_ratio_loss": 0.37180137634277344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10385137796401978, "rewards/margins": 0.08335815370082855, "rewards/rejected": -0.18720951676368713, "sft_loss": 1.0385137796401978, "step": 3620 }, { "epoch": 0.28, "grad_norm": 5.2311811447143555, "learning_rate": 8.230939724705185e-06, "logits/chosen": -1.3805992603302002, "logits/rejected": -0.5798208713531494, "logps/chosen": -1.0771076679229736, "logps/rejected": -1.4178438186645508, "loss": 1.1278, "odds_ratio_loss": 0.5071910619735718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10771076381206512, "rewards/margins": 0.0340736024081707, "rewards/rejected": -0.14178438484668732, "sft_loss": 1.0771076679229736, "step": 3625 }, { "epoch": 0.28, "grad_norm": 14.495864868164062, "learning_rate": 8.226237951275951e-06, "logits/chosen": -1.4785919189453125, "logits/rejected": -0.8613845705986023, "logps/chosen": -0.7386495471000671, "logps/rejected": -1.0128322839736938, "loss": 0.7915, "odds_ratio_loss": 0.528670072555542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07386495918035507, "rewards/margins": 0.027418266981840134, "rewards/rejected": -0.1012832298874855, "sft_loss": 0.7386495471000671, "step": 3630 }, { "epoch": 0.28, "grad_norm": 6.944512844085693, "learning_rate": 8.221531284847242e-06, "logits/chosen": -1.2884480953216553, "logits/rejected": -0.8778683543205261, "logps/chosen": -0.7235269546508789, "logps/rejected": -1.7379268407821655, "loss": 0.7619, "odds_ratio_loss": 0.38415372371673584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07235269248485565, "rewards/margins": 0.10144001245498657, "rewards/rejected": -0.17379269003868103, "sft_loss": 0.7235269546508789, "step": 3635 }, { "epoch": 0.28, "grad_norm": 22.363006591796875, "learning_rate": 8.21681973255732e-06, "logits/chosen": -1.457991123199463, "logits/rejected": -0.9291761517524719, "logps/chosen": -1.1872960329055786, "logps/rejected": -1.5329262018203735, "loss": 1.2416, "odds_ratio_loss": 0.543427586555481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1187296062707901, "rewards/margins": 0.03456302359700203, "rewards/rejected": -0.15329262614250183, "sft_loss": 1.1872960329055786, "step": 3640 }, { "epoch": 0.28, "grad_norm": 10.626898765563965, "learning_rate": 8.212103301551851e-06, "logits/chosen": -1.3206452131271362, "logits/rejected": -0.890534520149231, "logps/chosen": -1.128233551979065, "logps/rejected": -1.826210379600525, "loss": 1.1681, "odds_ratio_loss": 0.3984006345272064, "rewards/accuracies": 1.0, "rewards/chosen": -0.11282335221767426, "rewards/margins": 0.06979767978191376, "rewards/rejected": -0.182621031999588, "sft_loss": 1.128233551979065, "step": 3645 }, { "epoch": 0.28, "grad_norm": 8.859610557556152, "learning_rate": 8.207381998983897e-06, "logits/chosen": -1.4563844203948975, "logits/rejected": -0.7494536638259888, "logps/chosen": -1.0328868627548218, "logps/rejected": -2.5515682697296143, "loss": 1.0563, "odds_ratio_loss": 0.23427622020244598, "rewards/accuracies": 1.0, "rewards/chosen": -0.1032886877655983, "rewards/margins": 0.15186813473701477, "rewards/rejected": -0.25515681505203247, "sft_loss": 1.0328868627548218, "step": 3650 }, { "epoch": 0.28, "grad_norm": 9.665281295776367, "learning_rate": 8.202655832013919e-06, "logits/chosen": -1.413213849067688, "logits/rejected": -1.1829755306243896, "logps/chosen": -0.9151542782783508, "logps/rejected": -5.067257404327393, "loss": 0.9737, "odds_ratio_loss": 0.5854582786560059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0915154367685318, "rewards/margins": 0.4152103364467621, "rewards/rejected": -0.5067256689071655, "sft_loss": 0.9151542782783508, "step": 3655 }, { "epoch": 0.28, "grad_norm": 19.42445945739746, "learning_rate": 8.197924807809747e-06, "logits/chosen": -1.3678677082061768, "logits/rejected": -1.3623011112213135, "logps/chosen": -1.2780225276947021, "logps/rejected": -1.6961390972137451, "loss": 1.3304, "odds_ratio_loss": 0.5240920186042786, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12780225276947021, "rewards/margins": 0.04181166738271713, "rewards/rejected": -0.16961391270160675, "sft_loss": 1.2780225276947021, "step": 3660 }, { "epoch": 0.29, "grad_norm": 34.14183044433594, "learning_rate": 8.193188933546579e-06, "logits/chosen": -1.334039330482483, "logits/rejected": -1.05949866771698, "logps/chosen": -0.9566957354545593, "logps/rejected": -4.501104831695557, "loss": 0.9723, "odds_ratio_loss": 0.15578912198543549, "rewards/accuracies": 1.0, "rewards/chosen": -0.09566958248615265, "rewards/margins": 0.35444092750549316, "rewards/rejected": -0.450110524892807, "sft_loss": 0.9566957354545593, "step": 3665 }, { "epoch": 0.29, "grad_norm": 30.650644302368164, "learning_rate": 8.188448216406971e-06, "logits/chosen": -1.3179060220718384, "logits/rejected": -1.1395162343978882, "logps/chosen": -1.471545934677124, "logps/rejected": -2.721754550933838, "loss": 1.53, "odds_ratio_loss": 0.5845457315444946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14715459942817688, "rewards/margins": 0.12502089142799377, "rewards/rejected": -0.27217546105384827, "sft_loss": 1.471545934677124, "step": 3670 }, { "epoch": 0.29, "grad_norm": 10.528597831726074, "learning_rate": 8.183702663580822e-06, "logits/chosen": -1.3393423557281494, "logits/rejected": -1.333606481552124, "logps/chosen": -1.123405933380127, "logps/rejected": -4.076834678649902, "loss": 1.1327, "odds_ratio_loss": 0.09317772090435028, "rewards/accuracies": 1.0, "rewards/chosen": -0.11234060674905777, "rewards/margins": 0.2953428626060486, "rewards/rejected": -0.40768346190452576, "sft_loss": 1.123405933380127, "step": 3675 }, { "epoch": 0.29, "grad_norm": 271.1390075683594, "learning_rate": 8.178952282265364e-06, "logits/chosen": -1.3709100484848022, "logits/rejected": -1.0029327869415283, "logps/chosen": -1.405590295791626, "logps/rejected": -2.507824420928955, "loss": 1.4632, "odds_ratio_loss": 0.5756229758262634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14055903255939484, "rewards/margins": 0.1102234274148941, "rewards/rejected": -0.25078245997428894, "sft_loss": 1.405590295791626, "step": 3680 }, { "epoch": 0.29, "grad_norm": 57.85871124267578, "learning_rate": 8.174197079665153e-06, "logits/chosen": -1.3550955057144165, "logits/rejected": -0.8531472086906433, "logps/chosen": -1.132922649383545, "logps/rejected": -3.7029755115509033, "loss": 1.1647, "odds_ratio_loss": 0.3179894983768463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11329226195812225, "rewards/margins": 0.2570053040981293, "rewards/rejected": -0.37029752135276794, "sft_loss": 1.132922649383545, "step": 3685 }, { "epoch": 0.29, "grad_norm": 8.095747947692871, "learning_rate": 8.169437062992061e-06, "logits/chosen": -1.401653528213501, "logits/rejected": -0.91447913646698, "logps/chosen": -0.8620834350585938, "logps/rejected": -2.502750873565674, "loss": 0.9076, "odds_ratio_loss": 0.45480185747146606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08620833605527878, "rewards/margins": 0.16406671702861786, "rewards/rejected": -0.2502750754356384, "sft_loss": 0.8620834350585938, "step": 3690 }, { "epoch": 0.29, "grad_norm": 5.202536106109619, "learning_rate": 8.164672239465254e-06, "logits/chosen": -1.190614938735962, "logits/rejected": -0.9084379076957703, "logps/chosen": -0.8757231831550598, "logps/rejected": -1.483604073524475, "loss": 0.9161, "odds_ratio_loss": 0.4041404128074646, "rewards/accuracies": 1.0, "rewards/chosen": -0.08757232129573822, "rewards/margins": 0.060788094997406006, "rewards/rejected": -0.14836041629314423, "sft_loss": 0.8757231831550598, "step": 3695 }, { "epoch": 0.29, "grad_norm": 25.508304595947266, "learning_rate": 8.159902616311195e-06, "logits/chosen": -1.2143833637237549, "logits/rejected": -1.231711745262146, "logps/chosen": -1.2228883504867554, "logps/rejected": -1.3752474784851074, "loss": 1.2946, "odds_ratio_loss": 0.7171187400817871, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.12228883802890778, "rewards/margins": 0.015235906466841698, "rewards/rejected": -0.13752475380897522, "sft_loss": 1.2228883504867554, "step": 3700 }, { "epoch": 0.29, "grad_norm": 13.769211769104004, "learning_rate": 8.155128200763623e-06, "logits/chosen": -1.255629539489746, "logits/rejected": -0.8488641977310181, "logps/chosen": -1.5895591974258423, "logps/rejected": -1.4509578943252563, "loss": 1.6858, "odds_ratio_loss": 0.9623721837997437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15895593166351318, "rewards/margins": -0.013860151171684265, "rewards/rejected": -0.14509578049182892, "sft_loss": 1.5895591974258423, "step": 3705 }, { "epoch": 0.29, "grad_norm": 7.28419303894043, "learning_rate": 8.15034900006354e-06, "logits/chosen": -1.1626012325286865, "logits/rejected": -0.8984651565551758, "logps/chosen": -1.3513168096542358, "logps/rejected": -1.5219471454620361, "loss": 1.4389, "odds_ratio_loss": 0.8761366009712219, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13513168692588806, "rewards/margins": 0.017063047736883163, "rewards/rejected": -0.15219472348690033, "sft_loss": 1.3513168096542358, "step": 3710 }, { "epoch": 0.29, "grad_norm": 7.7843708992004395, "learning_rate": 8.145565021459217e-06, "logits/chosen": -1.2671594619750977, "logits/rejected": -0.9397374987602234, "logps/chosen": -1.4201005697250366, "logps/rejected": -1.5774786472320557, "loss": 1.4891, "odds_ratio_loss": 0.6898049116134644, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14201006293296814, "rewards/margins": 0.015737801790237427, "rewards/rejected": -0.15774787962436676, "sft_loss": 1.4201005697250366, "step": 3715 }, { "epoch": 0.29, "grad_norm": 18.914749145507812, "learning_rate": 8.140776272206161e-06, "logits/chosen": -1.2724708318710327, "logits/rejected": -1.3933824300765991, "logps/chosen": -0.5651779174804688, "logps/rejected": -2.1759657859802246, "loss": 0.6014, "odds_ratio_loss": 0.3619091808795929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.056517791002988815, "rewards/margins": 0.1610788106918335, "rewards/rejected": -0.2175966054201126, "sft_loss": 0.5651779174804688, "step": 3720 }, { "epoch": 0.29, "grad_norm": 6.794033050537109, "learning_rate": 8.135982759567121e-06, "logits/chosen": -1.3524110317230225, "logits/rejected": -0.882132887840271, "logps/chosen": -0.7997108697891235, "logps/rejected": -1.584331750869751, "loss": 0.832, "odds_ratio_loss": 0.32256340980529785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07997108995914459, "rewards/margins": 0.07846207171678543, "rewards/rejected": -0.15843316912651062, "sft_loss": 0.7997108697891235, "step": 3725 }, { "epoch": 0.29, "grad_norm": 31.65180015563965, "learning_rate": 8.131184490812064e-06, "logits/chosen": -1.186091661453247, "logits/rejected": -0.8857123255729675, "logps/chosen": -1.0445191860198975, "logps/rejected": -4.812319278717041, "loss": 1.0873, "odds_ratio_loss": 0.4277670979499817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10445191711187363, "rewards/margins": 0.3767800033092499, "rewards/rejected": -0.4812319278717041, "sft_loss": 1.0445191860198975, "step": 3730 }, { "epoch": 0.29, "grad_norm": 7.225113868713379, "learning_rate": 8.126381473218179e-06, "logits/chosen": -1.2817249298095703, "logits/rejected": -1.080437421798706, "logps/chosen": -1.0402967929840088, "logps/rejected": -5.129024982452393, "loss": 1.0936, "odds_ratio_loss": 0.5332490801811218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10402967035770416, "rewards/margins": 0.4088728427886963, "rewards/rejected": -0.5129024982452393, "sft_loss": 1.0402967929840088, "step": 3735 }, { "epoch": 0.29, "grad_norm": 17.7869873046875, "learning_rate": 8.121573714069848e-06, "logits/chosen": -1.3492562770843506, "logits/rejected": -0.9287340044975281, "logps/chosen": -0.6982876658439636, "logps/rejected": -1.885575532913208, "loss": 0.7366, "odds_ratio_loss": 0.3826819360256195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06982876360416412, "rewards/margins": 0.11872879415750504, "rewards/rejected": -0.18855755031108856, "sft_loss": 0.6982876658439636, "step": 3740 }, { "epoch": 0.29, "grad_norm": 8.037618637084961, "learning_rate": 8.116761220658649e-06, "logits/chosen": -1.4126262664794922, "logits/rejected": -1.4442805051803589, "logps/chosen": -0.7851252555847168, "logps/rejected": -1.5594263076782227, "loss": 0.8182, "odds_ratio_loss": 0.33029070496559143, "rewards/accuracies": 1.0, "rewards/chosen": -0.0785125270485878, "rewards/margins": 0.0774301066994667, "rewards/rejected": -0.1559426337480545, "sft_loss": 0.7851252555847168, "step": 3745 }, { "epoch": 0.29, "grad_norm": 9.552249908447266, "learning_rate": 8.111944000283339e-06, "logits/chosen": -1.382643461227417, "logits/rejected": -1.1251006126403809, "logps/chosen": -0.8634234666824341, "logps/rejected": -2.6441121101379395, "loss": 0.8849, "odds_ratio_loss": 0.21498659253120422, "rewards/accuracies": 1.0, "rewards/chosen": -0.08634234964847565, "rewards/margins": 0.17806890606880188, "rewards/rejected": -0.26441124081611633, "sft_loss": 0.8634234666824341, "step": 3750 }, { "epoch": 0.29, "grad_norm": 30.744495391845703, "learning_rate": 8.107122060249846e-06, "logits/chosen": -1.0354880094528198, "logits/rejected": -0.7640705108642578, "logps/chosen": -1.2475351095199585, "logps/rejected": -1.5779125690460205, "loss": 1.304, "odds_ratio_loss": 0.5643216371536255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12475351989269257, "rewards/margins": 0.03303774073719978, "rewards/rejected": -0.15779125690460205, "sft_loss": 1.2475351095199585, "step": 3755 }, { "epoch": 0.29, "grad_norm": 17.847753524780273, "learning_rate": 8.102295407871252e-06, "logits/chosen": -1.40513014793396, "logits/rejected": -1.1785567998886108, "logps/chosen": -0.9754989743232727, "logps/rejected": -3.8326869010925293, "loss": 1.0581, "odds_ratio_loss": 0.825912594795227, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09754989296197891, "rewards/margins": 0.28571879863739014, "rewards/rejected": -0.38326868414878845, "sft_loss": 0.9754989743232727, "step": 3760 }, { "epoch": 0.29, "grad_norm": 14.9716215133667, "learning_rate": 8.097464050467788e-06, "logits/chosen": -1.2403470277786255, "logits/rejected": -0.671155571937561, "logps/chosen": -1.0241820812225342, "logps/rejected": -2.359799861907959, "loss": 1.0488, "odds_ratio_loss": 0.24615927040576935, "rewards/accuracies": 1.0, "rewards/chosen": -0.10241822153329849, "rewards/margins": 0.13356177508831024, "rewards/rejected": -0.23597998917102814, "sft_loss": 1.0241820812225342, "step": 3765 }, { "epoch": 0.29, "grad_norm": 7.668431758880615, "learning_rate": 8.092627995366824e-06, "logits/chosen": -1.3898518085479736, "logits/rejected": -0.8233474493026733, "logps/chosen": -0.8895101547241211, "logps/rejected": -1.942580223083496, "loss": 0.9232, "odds_ratio_loss": 0.3369949758052826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08895101398229599, "rewards/margins": 0.10530702769756317, "rewards/rejected": -0.19425803422927856, "sft_loss": 0.8895101547241211, "step": 3770 }, { "epoch": 0.29, "grad_norm": 34.52479553222656, "learning_rate": 8.08778724990285e-06, "logits/chosen": -1.2432185411453247, "logits/rejected": -1.1982190608978271, "logps/chosen": -1.1835172176361084, "logps/rejected": -1.8407405614852905, "loss": 1.2383, "odds_ratio_loss": 0.5474838018417358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11835173517465591, "rewards/margins": 0.06572232395410538, "rewards/rejected": -0.1840740442276001, "sft_loss": 1.1835172176361084, "step": 3775 }, { "epoch": 0.29, "grad_norm": 7.304516315460205, "learning_rate": 8.082941821417469e-06, "logits/chosen": -1.2247803211212158, "logits/rejected": -0.9323280453681946, "logps/chosen": -1.0696570873260498, "logps/rejected": -3.683121919631958, "loss": 1.0933, "odds_ratio_loss": 0.23642174899578094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10696570575237274, "rewards/margins": 0.2613464891910553, "rewards/rejected": -0.36831218004226685, "sft_loss": 1.0696570873260498, "step": 3780 }, { "epoch": 0.29, "grad_norm": 15.417840957641602, "learning_rate": 8.07809171725939e-06, "logits/chosen": -1.3001465797424316, "logits/rejected": -0.7907955646514893, "logps/chosen": -1.1750476360321045, "logps/rejected": -1.312461018562317, "loss": 1.2469, "odds_ratio_loss": 0.7185543179512024, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11750476062297821, "rewards/margins": 0.013741342350840569, "rewards/rejected": -0.13124610483646393, "sft_loss": 1.1750476360321045, "step": 3785 }, { "epoch": 0.29, "grad_norm": 8.207767486572266, "learning_rate": 8.073236944784415e-06, "logits/chosen": -1.3569071292877197, "logits/rejected": -1.307720422744751, "logps/chosen": -1.0300140380859375, "logps/rejected": -2.8602893352508545, "loss": 1.0538, "odds_ratio_loss": 0.23768818378448486, "rewards/accuracies": 1.0, "rewards/chosen": -0.10300140082836151, "rewards/margins": 0.18302753567695618, "rewards/rejected": -0.2860289514064789, "sft_loss": 1.0300140380859375, "step": 3790 }, { "epoch": 0.3, "grad_norm": 10.38357925415039, "learning_rate": 8.068377511355418e-06, "logits/chosen": -1.2362945079803467, "logits/rejected": -1.1578267812728882, "logps/chosen": -1.007973074913025, "logps/rejected": -1.5636718273162842, "loss": 1.054, "odds_ratio_loss": 0.46053171157836914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10079731792211533, "rewards/margins": 0.05556987598538399, "rewards/rejected": -0.15636718273162842, "sft_loss": 1.007973074913025, "step": 3795 }, { "epoch": 0.3, "grad_norm": 12.893205642700195, "learning_rate": 8.063513424342348e-06, "logits/chosen": -1.183700680732727, "logits/rejected": -1.1352441310882568, "logps/chosen": -1.0947539806365967, "logps/rejected": -3.030123472213745, "loss": 1.1066, "odds_ratio_loss": 0.11800430715084076, "rewards/accuracies": 1.0, "rewards/chosen": -0.10947538912296295, "rewards/margins": 0.19353696703910828, "rewards/rejected": -0.3030123710632324, "sft_loss": 1.0947539806365967, "step": 3800 }, { "epoch": 0.3, "grad_norm": 10.757180213928223, "learning_rate": 8.058644691122211e-06, "logits/chosen": -1.3270838260650635, "logits/rejected": -0.9902740716934204, "logps/chosen": -0.880314826965332, "logps/rejected": -2.9867103099823, "loss": 0.8953, "odds_ratio_loss": 0.14989587664604187, "rewards/accuracies": 1.0, "rewards/chosen": -0.08803148567676544, "rewards/margins": 0.21063955128192902, "rewards/rejected": -0.29867103695869446, "sft_loss": 0.880314826965332, "step": 3805 }, { "epoch": 0.3, "grad_norm": 8.997293472290039, "learning_rate": 8.053771319079061e-06, "logits/chosen": -1.4080528020858765, "logits/rejected": -0.9664722681045532, "logps/chosen": -1.167017936706543, "logps/rejected": -2.621018171310425, "loss": 1.1972, "odds_ratio_loss": 0.30143603682518005, "rewards/accuracies": 1.0, "rewards/chosen": -0.11670179665088654, "rewards/margins": 0.1454000174999237, "rewards/rejected": -0.26210182905197144, "sft_loss": 1.167017936706543, "step": 3810 }, { "epoch": 0.3, "grad_norm": 9.430560111999512, "learning_rate": 8.048893315603982e-06, "logits/chosen": -1.350351095199585, "logits/rejected": -0.7604211568832397, "logps/chosen": -1.1834475994110107, "logps/rejected": -5.396286964416504, "loss": 1.2188, "odds_ratio_loss": 0.35320568084716797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11834476888179779, "rewards/margins": 0.42128387093544006, "rewards/rejected": -0.5396286249160767, "sft_loss": 1.1834475994110107, "step": 3815 }, { "epoch": 0.3, "grad_norm": 13.44129467010498, "learning_rate": 8.044010688095089e-06, "logits/chosen": -1.2833950519561768, "logits/rejected": -0.6708613634109497, "logps/chosen": -1.1539959907531738, "logps/rejected": -2.1586103439331055, "loss": 1.196, "odds_ratio_loss": 0.4198933243751526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11539959907531738, "rewards/margins": 0.100461445748806, "rewards/rejected": -0.21586103737354279, "sft_loss": 1.1539959907531738, "step": 3820 }, { "epoch": 0.3, "grad_norm": 7.030882358551025, "learning_rate": 8.039123443957503e-06, "logits/chosen": -1.4119694232940674, "logits/rejected": -0.6684118509292603, "logps/chosen": -0.9586877822875977, "logps/rejected": -3.244650363922119, "loss": 0.9943, "odds_ratio_loss": 0.35587188601493835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.095868781208992, "rewards/margins": 0.2285962849855423, "rewards/rejected": -0.3244650661945343, "sft_loss": 0.9586877822875977, "step": 3825 }, { "epoch": 0.3, "grad_norm": 5.365910530090332, "learning_rate": 8.034231590603355e-06, "logits/chosen": -1.3894189596176147, "logits/rejected": -1.2239296436309814, "logps/chosen": -1.6029672622680664, "logps/rejected": -5.419643878936768, "loss": 1.6304, "odds_ratio_loss": 0.2738359570503235, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1602967083454132, "rewards/margins": 0.3816676735877991, "rewards/rejected": -0.5419644117355347, "sft_loss": 1.6029672622680664, "step": 3830 }, { "epoch": 0.3, "grad_norm": 27.208940505981445, "learning_rate": 8.029335135451756e-06, "logits/chosen": -1.4428378343582153, "logits/rejected": -1.114614725112915, "logps/chosen": -0.8311885595321655, "logps/rejected": -1.4355841875076294, "loss": 0.8732, "odds_ratio_loss": 0.4203735291957855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08311885595321655, "rewards/margins": 0.060439564287662506, "rewards/rejected": -0.14355841279029846, "sft_loss": 0.8311885595321655, "step": 3835 }, { "epoch": 0.3, "grad_norm": 12.32146167755127, "learning_rate": 8.024434085928806e-06, "logits/chosen": -1.5364830493927002, "logits/rejected": -1.250301718711853, "logps/chosen": -0.8794133067131042, "logps/rejected": -4.063249111175537, "loss": 0.9059, "odds_ratio_loss": 0.2653045356273651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08794133365154266, "rewards/margins": 0.3183836042881012, "rewards/rejected": -0.40632495284080505, "sft_loss": 0.8794133067131042, "step": 3840 }, { "epoch": 0.3, "grad_norm": 9.93622875213623, "learning_rate": 8.019528449467566e-06, "logits/chosen": -1.114686369895935, "logits/rejected": -1.0291764736175537, "logps/chosen": -0.8959754109382629, "logps/rejected": -1.8957700729370117, "loss": 0.9519, "odds_ratio_loss": 0.5592709183692932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08959753811359406, "rewards/margins": 0.0999794751405716, "rewards/rejected": -0.18957701325416565, "sft_loss": 0.8959754109382629, "step": 3845 }, { "epoch": 0.3, "grad_norm": 8.939530372619629, "learning_rate": 8.01461823350806e-06, "logits/chosen": -1.2906373739242554, "logits/rejected": -0.6907309293746948, "logps/chosen": -0.8869168162345886, "logps/rejected": -1.3133783340454102, "loss": 0.9384, "odds_ratio_loss": 0.515034556388855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08869167417287827, "rewards/margins": 0.04264615476131439, "rewards/rejected": -0.13133783638477325, "sft_loss": 0.8869168162345886, "step": 3850 }, { "epoch": 0.3, "grad_norm": 6.442677021026611, "learning_rate": 8.009703445497252e-06, "logits/chosen": -1.286516785621643, "logits/rejected": -1.0029170513153076, "logps/chosen": -1.1681463718414307, "logps/rejected": -2.1528007984161377, "loss": 1.1983, "odds_ratio_loss": 0.30186447501182556, "rewards/accuracies": 1.0, "rewards/chosen": -0.11681463569402695, "rewards/margins": 0.0984654426574707, "rewards/rejected": -0.21528008580207825, "sft_loss": 1.1681463718414307, "step": 3855 }, { "epoch": 0.3, "grad_norm": 6.5835981369018555, "learning_rate": 8.004784092889043e-06, "logits/chosen": -1.2807387113571167, "logits/rejected": -0.9215117692947388, "logps/chosen": -1.4025729894638062, "logps/rejected": -3.206958770751953, "loss": 1.4735, "odds_ratio_loss": 0.7089608907699585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14025728404521942, "rewards/margins": 0.18043860793113708, "rewards/rejected": -0.3206959068775177, "sft_loss": 1.4025729894638062, "step": 3860 }, { "epoch": 0.3, "grad_norm": 44.311912536621094, "learning_rate": 7.999860183144251e-06, "logits/chosen": -1.3083603382110596, "logits/rejected": -0.9558000564575195, "logps/chosen": -1.020721197128296, "logps/rejected": -3.8038506507873535, "loss": 1.0512, "odds_ratio_loss": 0.30526265501976013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10207213461399078, "rewards/margins": 0.27831295132637024, "rewards/rejected": -0.38038507103919983, "sft_loss": 1.020721197128296, "step": 3865 }, { "epoch": 0.3, "grad_norm": 16.797744750976562, "learning_rate": 7.994931723730617e-06, "logits/chosen": -1.3893539905548096, "logits/rejected": -0.7881430387496948, "logps/chosen": -1.0593688488006592, "logps/rejected": -1.619405746459961, "loss": 1.1074, "odds_ratio_loss": 0.47998982667922974, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10593689978122711, "rewards/margins": 0.056003689765930176, "rewards/rejected": -0.16194060444831848, "sft_loss": 1.0593688488006592, "step": 3870 }, { "epoch": 0.3, "grad_norm": 7.763211727142334, "learning_rate": 7.989998722122771e-06, "logits/chosen": -1.3000319004058838, "logits/rejected": -1.0927565097808838, "logps/chosen": -1.4487582445144653, "logps/rejected": -1.6110073328018188, "loss": 1.5211, "odds_ratio_loss": 0.7234418988227844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14487579464912415, "rewards/margins": 0.016224917024374008, "rewards/rejected": -0.16110071539878845, "sft_loss": 1.4487582445144653, "step": 3875 }, { "epoch": 0.3, "grad_norm": 11.750642776489258, "learning_rate": 7.98506118580224e-06, "logits/chosen": -1.2582849264144897, "logits/rejected": -0.896298885345459, "logps/chosen": -1.1716492176055908, "logps/rejected": -2.2447562217712402, "loss": 1.2204, "odds_ratio_loss": 0.4879697859287262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11716492474079132, "rewards/margins": 0.1073107123374939, "rewards/rejected": -0.2244756519794464, "sft_loss": 1.1716492176055908, "step": 3880 }, { "epoch": 0.3, "grad_norm": 9.925634384155273, "learning_rate": 7.98011912225742e-06, "logits/chosen": -1.3496580123901367, "logits/rejected": -1.125966191291809, "logps/chosen": -0.737195611000061, "logps/rejected": -2.681872844696045, "loss": 0.7503, "odds_ratio_loss": 0.1313866823911667, "rewards/accuracies": 1.0, "rewards/chosen": -0.0737195536494255, "rewards/margins": 0.194467693567276, "rewards/rejected": -0.2681872844696045, "sft_loss": 0.737195611000061, "step": 3885 }, { "epoch": 0.3, "grad_norm": 9.05740737915039, "learning_rate": 7.975172538983583e-06, "logits/chosen": -1.3671633005142212, "logits/rejected": -0.9348441362380981, "logps/chosen": -1.0503031015396118, "logps/rejected": -4.125294208526611, "loss": 1.0606, "odds_ratio_loss": 0.10310007631778717, "rewards/accuracies": 1.0, "rewards/chosen": -0.10503031313419342, "rewards/margins": 0.30749911069869995, "rewards/rejected": -0.4125294089317322, "sft_loss": 1.0503031015396118, "step": 3890 }, { "epoch": 0.3, "grad_norm": 9.612077713012695, "learning_rate": 7.970221443482847e-06, "logits/chosen": -1.3794662952423096, "logits/rejected": -0.8071687817573547, "logps/chosen": -1.513962745666504, "logps/rejected": -3.9997737407684326, "loss": 1.5482, "odds_ratio_loss": 0.3426254689693451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1513962745666504, "rewards/margins": 0.24858109652996063, "rewards/rejected": -0.3999773859977722, "sft_loss": 1.513962745666504, "step": 3895 }, { "epoch": 0.3, "grad_norm": 11.001276969909668, "learning_rate": 7.965265843264178e-06, "logits/chosen": -1.3838317394256592, "logits/rejected": -1.0965882539749146, "logps/chosen": -0.969623863697052, "logps/rejected": -3.6703083515167236, "loss": 1.003, "odds_ratio_loss": 0.33327335119247437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09696237742900848, "rewards/margins": 0.2700684666633606, "rewards/rejected": -0.3670308589935303, "sft_loss": 0.969623863697052, "step": 3900 }, { "epoch": 0.3, "grad_norm": 12.948336601257324, "learning_rate": 7.960305745843374e-06, "logits/chosen": -1.4322015047073364, "logits/rejected": -0.9059526324272156, "logps/chosen": -1.9296623468399048, "logps/rejected": -3.271130084991455, "loss": 1.9571, "odds_ratio_loss": 0.27451637387275696, "rewards/accuracies": 1.0, "rewards/chosen": -0.1929662525653839, "rewards/margins": 0.1341467797756195, "rewards/rejected": -0.32711300253868103, "sft_loss": 1.9296623468399048, "step": 3905 }, { "epoch": 0.3, "grad_norm": 25.758337020874023, "learning_rate": 7.955341158743048e-06, "logits/chosen": -1.312316656112671, "logits/rejected": -1.235752820968628, "logps/chosen": -0.7140806913375854, "logps/rejected": -3.0332131385803223, "loss": 0.7353, "odds_ratio_loss": 0.21259894967079163, "rewards/accuracies": 1.0, "rewards/chosen": -0.07140807062387466, "rewards/margins": 0.2319132536649704, "rewards/rejected": -0.30332133173942566, "sft_loss": 0.7140806913375854, "step": 3910 }, { "epoch": 0.3, "grad_norm": 17.689552307128906, "learning_rate": 7.950372089492634e-06, "logits/chosen": -1.3138293027877808, "logits/rejected": -0.8663791418075562, "logps/chosen": -0.8342885971069336, "logps/rejected": -1.7863073348999023, "loss": 0.8596, "odds_ratio_loss": 0.25299352407455444, "rewards/accuracies": 1.0, "rewards/chosen": -0.08342885971069336, "rewards/margins": 0.09520186483860016, "rewards/rejected": -0.1786307394504547, "sft_loss": 0.8342885971069336, "step": 3915 }, { "epoch": 0.3, "grad_norm": 6.6880574226379395, "learning_rate": 7.94539854562835e-06, "logits/chosen": -1.354337453842163, "logits/rejected": -0.8866574168205261, "logps/chosen": -1.0239028930664062, "logps/rejected": -2.7309670448303223, "loss": 1.0524, "odds_ratio_loss": 0.28486576676368713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10239030420780182, "rewards/margins": 0.17070642113685608, "rewards/rejected": -0.2730966806411743, "sft_loss": 1.0239028930664062, "step": 3920 }, { "epoch": 0.31, "grad_norm": 16.98858070373535, "learning_rate": 7.94042053469321e-06, "logits/chosen": -1.1966888904571533, "logits/rejected": -0.6225118637084961, "logps/chosen": -0.9805110692977905, "logps/rejected": -1.7540674209594727, "loss": 1.0401, "odds_ratio_loss": 0.5959800481796265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09805111587047577, "rewards/margins": 0.07735563069581985, "rewards/rejected": -0.17540673911571503, "sft_loss": 0.9805110692977905, "step": 3925 }, { "epoch": 0.31, "grad_norm": 12.183523178100586, "learning_rate": 7.935438064236998e-06, "logits/chosen": -1.2711061239242554, "logits/rejected": -0.860754132270813, "logps/chosen": -0.8835781812667847, "logps/rejected": -2.7187657356262207, "loss": 0.9118, "odds_ratio_loss": 0.2823113799095154, "rewards/accuracies": 1.0, "rewards/chosen": -0.0883578211069107, "rewards/margins": 0.18351872265338898, "rewards/rejected": -0.2718765437602997, "sft_loss": 0.8835781812667847, "step": 3930 }, { "epoch": 0.31, "grad_norm": 16.06085968017578, "learning_rate": 7.930451141816264e-06, "logits/chosen": -1.203362226486206, "logits/rejected": -0.922333836555481, "logps/chosen": -1.0303407907485962, "logps/rejected": -3.09401273727417, "loss": 1.0493, "odds_ratio_loss": 0.189732626080513, "rewards/accuracies": 1.0, "rewards/chosen": -0.10303407907485962, "rewards/margins": 0.20636720955371857, "rewards/rejected": -0.3094013035297394, "sft_loss": 1.0303407907485962, "step": 3935 }, { "epoch": 0.31, "grad_norm": 11.412202835083008, "learning_rate": 7.925459774994311e-06, "logits/chosen": -1.3621621131896973, "logits/rejected": -1.251784086227417, "logps/chosen": -1.0007188320159912, "logps/rejected": -2.657543897628784, "loss": 1.0286, "odds_ratio_loss": 0.27897682785987854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10007189214229584, "rewards/margins": 0.16568250954151154, "rewards/rejected": -0.2657544016838074, "sft_loss": 1.0007188320159912, "step": 3940 }, { "epoch": 0.31, "grad_norm": 19.832815170288086, "learning_rate": 7.920463971341175e-06, "logits/chosen": -1.4012175798416138, "logits/rejected": -0.8629199862480164, "logps/chosen": -0.9235948324203491, "logps/rejected": -5.943634986877441, "loss": 0.9452, "odds_ratio_loss": 0.2163792848587036, "rewards/accuracies": 1.0, "rewards/chosen": -0.09235947579145432, "rewards/margins": 0.5020040273666382, "rewards/rejected": -0.5943635106086731, "sft_loss": 0.9235948324203491, "step": 3945 }, { "epoch": 0.31, "grad_norm": 30.11895751953125, "learning_rate": 7.915463738433633e-06, "logits/chosen": -1.422339677810669, "logits/rejected": -1.1737186908721924, "logps/chosen": -1.0355141162872314, "logps/rejected": -2.3156991004943848, "loss": 1.0611, "odds_ratio_loss": 0.2554258108139038, "rewards/accuracies": 1.0, "rewards/chosen": -0.10355141013860703, "rewards/margins": 0.12801849842071533, "rewards/rejected": -0.23156991600990295, "sft_loss": 1.0355141162872314, "step": 3950 }, { "epoch": 0.31, "grad_norm": 10.107759475708008, "learning_rate": 7.910459083855169e-06, "logits/chosen": -1.4450719356536865, "logits/rejected": -0.7962635159492493, "logps/chosen": -1.148455023765564, "logps/rejected": -3.3044426441192627, "loss": 1.162, "odds_ratio_loss": 0.13495874404907227, "rewards/accuracies": 1.0, "rewards/chosen": -0.11484551429748535, "rewards/margins": 0.21559877693653107, "rewards/rejected": -0.3304442763328552, "sft_loss": 1.148455023765564, "step": 3955 }, { "epoch": 0.31, "grad_norm": 3.7334766387939453, "learning_rate": 7.905450015195977e-06, "logits/chosen": -1.6166236400604248, "logits/rejected": -0.9999248385429382, "logps/chosen": -1.002211570739746, "logps/rejected": -2.378195285797119, "loss": 1.0485, "odds_ratio_loss": 0.46309876441955566, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10022115707397461, "rewards/margins": 0.13759836554527283, "rewards/rejected": -0.23781950771808624, "sft_loss": 1.002211570739746, "step": 3960 }, { "epoch": 0.31, "grad_norm": 15.674083709716797, "learning_rate": 7.900436540052947e-06, "logits/chosen": -1.3820528984069824, "logits/rejected": -1.1322020292282104, "logps/chosen": -1.0542728900909424, "logps/rejected": -1.2473478317260742, "loss": 1.1235, "odds_ratio_loss": 0.6921383142471313, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10542728751897812, "rewards/margins": 0.019307482987642288, "rewards/rejected": -0.1247347742319107, "sft_loss": 1.0542728900909424, "step": 3965 }, { "epoch": 0.31, "grad_norm": 9.685905456542969, "learning_rate": 7.89541866602965e-06, "logits/chosen": -1.4918893575668335, "logits/rejected": -1.1254994869232178, "logps/chosen": -1.0190510749816895, "logps/rejected": -1.6521360874176025, "loss": 1.0692, "odds_ratio_loss": 0.5017727017402649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10190512239933014, "rewards/margins": 0.06330849975347519, "rewards/rejected": -0.16521361470222473, "sft_loss": 1.0190510749816895, "step": 3970 }, { "epoch": 0.31, "grad_norm": 23.007606506347656, "learning_rate": 7.89039640073633e-06, "logits/chosen": -1.2623697519302368, "logits/rejected": -0.8954647779464722, "logps/chosen": -1.166612148284912, "logps/rejected": -1.269649624824524, "loss": 1.2286, "odds_ratio_loss": 0.6198663711547852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11666121333837509, "rewards/margins": 0.010303747840225697, "rewards/rejected": -0.12696495652198792, "sft_loss": 1.166612148284912, "step": 3975 }, { "epoch": 0.31, "grad_norm": 11.377882957458496, "learning_rate": 7.88536975178989e-06, "logits/chosen": -1.3877617120742798, "logits/rejected": -0.8028934597969055, "logps/chosen": -1.045810580253601, "logps/rejected": -1.1776955127716064, "loss": 1.1115, "odds_ratio_loss": 0.6572698950767517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1045810729265213, "rewards/margins": 0.013188472017645836, "rewards/rejected": -0.11776953935623169, "sft_loss": 1.045810580253601, "step": 3980 }, { "epoch": 0.31, "grad_norm": 4.312228202819824, "learning_rate": 7.880338726813878e-06, "logits/chosen": -1.4793024063110352, "logits/rejected": -1.0138689279556274, "logps/chosen": -0.8670511245727539, "logps/rejected": -1.6700611114501953, "loss": 0.8964, "odds_ratio_loss": 0.29342907667160034, "rewards/accuracies": 1.0, "rewards/chosen": -0.08670511841773987, "rewards/margins": 0.08030100166797638, "rewards/rejected": -0.16700613498687744, "sft_loss": 0.8670511245727539, "step": 3985 }, { "epoch": 0.31, "grad_norm": 9.054874420166016, "learning_rate": 7.875303333438488e-06, "logits/chosen": -1.4101994037628174, "logits/rejected": -0.9546709060668945, "logps/chosen": -1.5213371515274048, "logps/rejected": -1.413010835647583, "loss": 1.6373, "odds_ratio_loss": 1.1592390537261963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15213371813297272, "rewards/margins": -0.010832617059350014, "rewards/rejected": -0.14130108058452606, "sft_loss": 1.5213371515274048, "step": 3990 }, { "epoch": 0.31, "grad_norm": 6.566228866577148, "learning_rate": 7.870263579300527e-06, "logits/chosen": -1.2961866855621338, "logits/rejected": -1.0387169122695923, "logps/chosen": -1.1798770427703857, "logps/rejected": -3.3426384925842285, "loss": 1.2468, "odds_ratio_loss": 0.6694552302360535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11798770725727081, "rewards/margins": 0.2162761390209198, "rewards/rejected": -0.3342638611793518, "sft_loss": 1.1798770427703857, "step": 3995 }, { "epoch": 0.31, "grad_norm": 10.971648216247559, "learning_rate": 7.865219472043429e-06, "logits/chosen": -1.2647547721862793, "logits/rejected": -0.7655025124549866, "logps/chosen": -1.0178542137145996, "logps/rejected": -3.2869560718536377, "loss": 1.0542, "odds_ratio_loss": 0.36323755979537964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10178543627262115, "rewards/margins": 0.22691015899181366, "rewards/rejected": -0.3286955952644348, "sft_loss": 1.0178542137145996, "step": 4000 }, { "epoch": 0.31, "grad_norm": 7.956406116485596, "learning_rate": 7.860171019317215e-06, "logits/chosen": -1.2619847059249878, "logits/rejected": -0.626349151134491, "logps/chosen": -1.2772443294525146, "logps/rejected": -2.4285922050476074, "loss": 1.3216, "odds_ratio_loss": 0.44381484389305115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12772443890571594, "rewards/margins": 0.11513479053974152, "rewards/rejected": -0.24285921454429626, "sft_loss": 1.2772443294525146, "step": 4005 }, { "epoch": 0.31, "grad_norm": 12.475284576416016, "learning_rate": 7.855118228778511e-06, "logits/chosen": -1.344545841217041, "logits/rejected": -0.8007782101631165, "logps/chosen": -1.6138957738876343, "logps/rejected": -3.235304355621338, "loss": 1.6714, "odds_ratio_loss": 0.5750583410263062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16138958930969238, "rewards/margins": 0.1621408313512802, "rewards/rejected": -0.3235304355621338, "sft_loss": 1.6138957738876343, "step": 4010 }, { "epoch": 0.31, "grad_norm": 187.03219604492188, "learning_rate": 7.850061108090514e-06, "logits/chosen": -1.1676523685455322, "logits/rejected": -1.0269989967346191, "logps/chosen": -1.3574920892715454, "logps/rejected": -2.018044948577881, "loss": 1.4053, "odds_ratio_loss": 0.4782230854034424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1357492059469223, "rewards/margins": 0.06605527549982071, "rewards/rejected": -0.2018044888973236, "sft_loss": 1.3574920892715454, "step": 4015 }, { "epoch": 0.31, "grad_norm": 11.270816802978516, "learning_rate": 7.844999664922987e-06, "logits/chosen": -1.146503210067749, "logits/rejected": -1.0891648530960083, "logps/chosen": -1.6510696411132812, "logps/rejected": -4.126519203186035, "loss": 1.669, "odds_ratio_loss": 0.17917956411838531, "rewards/accuracies": 1.0, "rewards/chosen": -0.16510698199272156, "rewards/margins": 0.24754495918750763, "rewards/rejected": -0.412651926279068, "sft_loss": 1.6510696411132812, "step": 4020 }, { "epoch": 0.31, "grad_norm": 130.6370086669922, "learning_rate": 7.839933906952252e-06, "logits/chosen": -1.077316164970398, "logits/rejected": -1.1368393898010254, "logps/chosen": -1.3282784223556519, "logps/rejected": -2.2032151222229004, "loss": 1.3634, "odds_ratio_loss": 0.3508017957210541, "rewards/accuracies": 1.0, "rewards/chosen": -0.13282786309719086, "rewards/margins": 0.08749367296695709, "rewards/rejected": -0.22032153606414795, "sft_loss": 1.3282784223556519, "step": 4025 }, { "epoch": 0.31, "grad_norm": 8.202735900878906, "learning_rate": 7.834863841861178e-06, "logits/chosen": -1.162980079650879, "logits/rejected": -0.596443772315979, "logps/chosen": -1.7192405462265015, "logps/rejected": -6.087828159332275, "loss": 1.7602, "odds_ratio_loss": 0.40991973876953125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17192408442497253, "rewards/margins": 0.43685880303382874, "rewards/rejected": -0.6087828874588013, "sft_loss": 1.7192405462265015, "step": 4030 }, { "epoch": 0.31, "grad_norm": 46.36093521118164, "learning_rate": 7.829789477339157e-06, "logits/chosen": -1.1748098134994507, "logits/rejected": -1.0097134113311768, "logps/chosen": -1.247901439666748, "logps/rejected": -2.3685479164123535, "loss": 1.2843, "odds_ratio_loss": 0.36368483304977417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12479015439748764, "rewards/margins": 0.11206464469432831, "rewards/rejected": -0.23685479164123535, "sft_loss": 1.247901439666748, "step": 4035 }, { "epoch": 0.31, "grad_norm": 7.1221842765808105, "learning_rate": 7.824710821082111e-06, "logits/chosen": -1.271332025527954, "logits/rejected": -0.8087499737739563, "logps/chosen": -0.9981080293655396, "logps/rejected": -1.9443790912628174, "loss": 1.0292, "odds_ratio_loss": 0.3106873631477356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09981080144643784, "rewards/margins": 0.09462710469961166, "rewards/rejected": -0.1944379061460495, "sft_loss": 0.9981080293655396, "step": 4040 }, { "epoch": 0.31, "grad_norm": 8.026951789855957, "learning_rate": 7.819627880792465e-06, "logits/chosen": -1.1581884622573853, "logits/rejected": -0.9363598823547363, "logps/chosen": -1.0807740688323975, "logps/rejected": -2.251509428024292, "loss": 1.109, "odds_ratio_loss": 0.28213122487068176, "rewards/accuracies": 1.0, "rewards/chosen": -0.10807742178440094, "rewards/margins": 0.11707352101802826, "rewards/rejected": -0.2251509726047516, "sft_loss": 1.0807740688323975, "step": 4045 }, { "epoch": 0.32, "grad_norm": 8.195876121520996, "learning_rate": 7.814540664179143e-06, "logits/chosen": -1.2763893604278564, "logits/rejected": -0.44702619314193726, "logps/chosen": -0.9961471557617188, "logps/rejected": -3.151249885559082, "loss": 1.0107, "odds_ratio_loss": 0.14514710009098053, "rewards/accuracies": 1.0, "rewards/chosen": -0.099614717066288, "rewards/margins": 0.2155102789402008, "rewards/rejected": -0.3151249885559082, "sft_loss": 0.9961471557617188, "step": 4050 }, { "epoch": 0.32, "grad_norm": 23.922773361206055, "learning_rate": 7.809449178957558e-06, "logits/chosen": -1.2120903730392456, "logits/rejected": -1.00906240940094, "logps/chosen": -1.162179946899414, "logps/rejected": -1.754237174987793, "loss": 1.2073, "odds_ratio_loss": 0.4514268934726715, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11621799319982529, "rewards/margins": 0.059205733239650726, "rewards/rejected": -0.17542371153831482, "sft_loss": 1.162179946899414, "step": 4055 }, { "epoch": 0.32, "grad_norm": 7.789605617523193, "learning_rate": 7.80435343284959e-06, "logits/chosen": -1.2017956972122192, "logits/rejected": -0.8519023060798645, "logps/chosen": -0.9169862866401672, "logps/rejected": -2.8535914421081543, "loss": 0.9404, "odds_ratio_loss": 0.23436644673347473, "rewards/accuracies": 1.0, "rewards/chosen": -0.09169862419366837, "rewards/margins": 0.19366052746772766, "rewards/rejected": -0.28535914421081543, "sft_loss": 0.9169862866401672, "step": 4060 }, { "epoch": 0.32, "grad_norm": 25.745149612426758, "learning_rate": 7.799253433583585e-06, "logits/chosen": -1.3630164861679077, "logits/rejected": -1.2323427200317383, "logps/chosen": -0.45543041825294495, "logps/rejected": -2.0177087783813477, "loss": 0.5283, "odds_ratio_loss": 0.7285929918289185, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.045543037354946136, "rewards/margins": 0.15622785687446594, "rewards/rejected": -0.2017708718776703, "sft_loss": 0.45543041825294495, "step": 4065 }, { "epoch": 0.32, "grad_norm": 14.55126953125, "learning_rate": 7.794149188894344e-06, "logits/chosen": -1.3345615863800049, "logits/rejected": -1.1428083181381226, "logps/chosen": -1.3834812641143799, "logps/rejected": -2.4057507514953613, "loss": 1.4419, "odds_ratio_loss": 0.5839170217514038, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13834811747074127, "rewards/margins": 0.10222695022821426, "rewards/rejected": -0.24057507514953613, "sft_loss": 1.3834812641143799, "step": 4070 }, { "epoch": 0.32, "grad_norm": 7.617037773132324, "learning_rate": 7.789040706523097e-06, "logits/chosen": -1.1593537330627441, "logits/rejected": -0.9324871301651001, "logps/chosen": -1.2534054517745972, "logps/rejected": -1.9832494258880615, "loss": 1.3098, "odds_ratio_loss": 0.5639361143112183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1253405511379242, "rewards/margins": 0.07298441231250763, "rewards/rejected": -0.19832494854927063, "sft_loss": 1.2534054517745972, "step": 4075 }, { "epoch": 0.32, "grad_norm": 17.690900802612305, "learning_rate": 7.78392799421751e-06, "logits/chosen": -1.249637484550476, "logits/rejected": -1.1855614185333252, "logps/chosen": -0.6458396315574646, "logps/rejected": -2.4688522815704346, "loss": 0.6594, "odds_ratio_loss": 0.13548722863197327, "rewards/accuracies": 1.0, "rewards/chosen": -0.06458397209644318, "rewards/margins": 0.18230126798152924, "rewards/rejected": -0.24688522517681122, "sft_loss": 0.6458396315574646, "step": 4080 }, { "epoch": 0.32, "grad_norm": 5.836312770843506, "learning_rate": 7.778811059731656e-06, "logits/chosen": -1.353683590888977, "logits/rejected": -0.6263580322265625, "logps/chosen": -1.063247561454773, "logps/rejected": -1.5036994218826294, "loss": 1.1098, "odds_ratio_loss": 0.46563464403152466, "rewards/accuracies": 1.0, "rewards/chosen": -0.10632475465536118, "rewards/margins": 0.044045187532901764, "rewards/rejected": -0.15036995708942413, "sft_loss": 1.063247561454773, "step": 4085 }, { "epoch": 0.32, "grad_norm": 5.767371654510498, "learning_rate": 7.773689910826019e-06, "logits/chosen": -1.4103326797485352, "logits/rejected": -1.1369997262954712, "logps/chosen": -0.8699052929878235, "logps/rejected": -1.9617269039154053, "loss": 0.9044, "odds_ratio_loss": 0.34492164850234985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08699052780866623, "rewards/margins": 0.10918216407299042, "rewards/rejected": -0.19617268443107605, "sft_loss": 0.8699052929878235, "step": 4090 }, { "epoch": 0.32, "grad_norm": 15.891542434692383, "learning_rate": 7.768564555267473e-06, "logits/chosen": -1.466572880744934, "logits/rejected": -1.3226369619369507, "logps/chosen": -1.081580400466919, "logps/rejected": -2.9113516807556152, "loss": 1.0987, "odds_ratio_loss": 0.1707053780555725, "rewards/accuracies": 1.0, "rewards/chosen": -0.10815805196762085, "rewards/margins": 0.18297713994979858, "rewards/rejected": -0.29113519191741943, "sft_loss": 1.081580400466919, "step": 4095 }, { "epoch": 0.32, "grad_norm": 121.89453125, "learning_rate": 7.763435000829267e-06, "logits/chosen": -1.290968656539917, "logits/rejected": -1.1049576997756958, "logps/chosen": -1.3791439533233643, "logps/rejected": -3.2474513053894043, "loss": 1.4425, "odds_ratio_loss": 0.6333954930305481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13791438937187195, "rewards/margins": 0.18683074414730072, "rewards/rejected": -0.3247451186180115, "sft_loss": 1.3791439533233643, "step": 4100 }, { "epoch": 0.32, "grad_norm": 28.180767059326172, "learning_rate": 7.758301255291022e-06, "logits/chosen": -1.1826337575912476, "logits/rejected": -0.9726226925849915, "logps/chosen": -1.1570237874984741, "logps/rejected": -1.7642463445663452, "loss": 1.1979, "odds_ratio_loss": 0.409106969833374, "rewards/accuracies": 1.0, "rewards/chosen": -0.11570239067077637, "rewards/margins": 0.06072225421667099, "rewards/rejected": -0.17642465233802795, "sft_loss": 1.1570237874984741, "step": 4105 }, { "epoch": 0.32, "grad_norm": 9.990339279174805, "learning_rate": 7.753163326438716e-06, "logits/chosen": -1.3788506984710693, "logits/rejected": -0.790817379951477, "logps/chosen": -0.9605264663696289, "logps/rejected": -4.762506484985352, "loss": 0.9911, "odds_ratio_loss": 0.30601152777671814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09605265408754349, "rewards/margins": 0.38019803166389465, "rewards/rejected": -0.47625064849853516, "sft_loss": 0.9605264663696289, "step": 4110 }, { "epoch": 0.32, "grad_norm": 12.46078109741211, "learning_rate": 7.74802122206467e-06, "logits/chosen": -1.1677567958831787, "logits/rejected": -0.9421844482421875, "logps/chosen": -0.9032756686210632, "logps/rejected": -5.672630310058594, "loss": 0.9356, "odds_ratio_loss": 0.32349324226379395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09032757580280304, "rewards/margins": 0.476935476064682, "rewards/rejected": -0.5672630071640015, "sft_loss": 0.9032756686210632, "step": 4115 }, { "epoch": 0.32, "grad_norm": 16.93182373046875, "learning_rate": 7.74287494996754e-06, "logits/chosen": -1.1149744987487793, "logits/rejected": -1.004250407218933, "logps/chosen": -0.9445215463638306, "logps/rejected": -2.6246485710144043, "loss": 0.9833, "odds_ratio_loss": 0.38825660943984985, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0944521576166153, "rewards/margins": 0.16801270842552185, "rewards/rejected": -0.26246488094329834, "sft_loss": 0.9445215463638306, "step": 4120 }, { "epoch": 0.32, "grad_norm": 8.397687911987305, "learning_rate": 7.737724517952298e-06, "logits/chosen": -1.1781234741210938, "logits/rejected": -0.7847142219543457, "logps/chosen": -0.8613178133964539, "logps/rejected": -2.36332368850708, "loss": 0.8903, "odds_ratio_loss": 0.2901086211204529, "rewards/accuracies": 1.0, "rewards/chosen": -0.08613178133964539, "rewards/margins": 0.15020060539245605, "rewards/rejected": -0.23633238673210144, "sft_loss": 0.8613178133964539, "step": 4125 }, { "epoch": 0.32, "grad_norm": 34.25830078125, "learning_rate": 7.732569933830229e-06, "logits/chosen": -1.1180028915405273, "logits/rejected": -0.9175459146499634, "logps/chosen": -1.0336300134658813, "logps/rejected": -2.0328879356384277, "loss": 1.0612, "odds_ratio_loss": 0.27564844489097595, "rewards/accuracies": 1.0, "rewards/chosen": -0.10336299985647202, "rewards/margins": 0.09992580860853195, "rewards/rejected": -0.20328882336616516, "sft_loss": 1.0336300134658813, "step": 4130 }, { "epoch": 0.32, "grad_norm": 23.185644149780273, "learning_rate": 7.727411205418917e-06, "logits/chosen": -1.1140944957733154, "logits/rejected": -0.915080189704895, "logps/chosen": -1.1209285259246826, "logps/rejected": -1.5422135591506958, "loss": 1.1705, "odds_ratio_loss": 0.4952412545681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11209283769130707, "rewards/margins": 0.042128510773181915, "rewards/rejected": -0.15422135591506958, "sft_loss": 1.1209285259246826, "step": 4135 }, { "epoch": 0.32, "grad_norm": 14.448271751403809, "learning_rate": 7.722248340542224e-06, "logits/chosen": -1.1452420949935913, "logits/rejected": -1.0613524913787842, "logps/chosen": -1.0193283557891846, "logps/rejected": -3.6547303199768066, "loss": 1.0453, "odds_ratio_loss": 0.2597232460975647, "rewards/accuracies": 1.0, "rewards/chosen": -0.1019328385591507, "rewards/margins": 0.2635401487350464, "rewards/rejected": -0.3654729723930359, "sft_loss": 1.0193283557891846, "step": 4140 }, { "epoch": 0.32, "grad_norm": 13.385184288024902, "learning_rate": 7.717081347030295e-06, "logits/chosen": -1.1069594621658325, "logits/rejected": -0.6620140075683594, "logps/chosen": -1.2712277173995972, "logps/rejected": -2.2989327907562256, "loss": 1.3054, "odds_ratio_loss": 0.3419440686702728, "rewards/accuracies": 1.0, "rewards/chosen": -0.12712277472019196, "rewards/margins": 0.10277052223682404, "rewards/rejected": -0.229893296957016, "sft_loss": 1.2712277173995972, "step": 4145 }, { "epoch": 0.32, "grad_norm": 9.037135124206543, "learning_rate": 7.711910232719526e-06, "logits/chosen": -1.2756738662719727, "logits/rejected": -1.017519474029541, "logps/chosen": -0.8686092495918274, "logps/rejected": -3.832780122756958, "loss": 0.8839, "odds_ratio_loss": 0.15329131484031677, "rewards/accuracies": 1.0, "rewards/chosen": -0.08686093240976334, "rewards/margins": 0.29641711711883545, "rewards/rejected": -0.3832780420780182, "sft_loss": 0.8686092495918274, "step": 4150 }, { "epoch": 0.32, "grad_norm": 17.691438674926758, "learning_rate": 7.706735005452574e-06, "logits/chosen": -1.1250547170639038, "logits/rejected": -1.2741339206695557, "logps/chosen": -1.1006823778152466, "logps/rejected": -1.6535568237304688, "loss": 1.147, "odds_ratio_loss": 0.4627433717250824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11006822437047958, "rewards/margins": 0.05528745800256729, "rewards/rejected": -0.16535568237304688, "sft_loss": 1.1006823778152466, "step": 4155 }, { "epoch": 0.32, "grad_norm": 10.580764770507812, "learning_rate": 7.701555673078324e-06, "logits/chosen": -1.3162877559661865, "logits/rejected": -0.5796257257461548, "logps/chosen": -1.0522043704986572, "logps/rejected": -5.691858291625977, "loss": 1.0987, "odds_ratio_loss": 0.46448415517807007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10522043704986572, "rewards/margins": 0.46396535634994507, "rewards/rejected": -0.5691858530044556, "sft_loss": 1.0522043704986572, "step": 4160 }, { "epoch": 0.32, "grad_norm": 29.99909019470215, "learning_rate": 7.696372243451894e-06, "logits/chosen": -1.4073673486709595, "logits/rejected": -1.3161767721176147, "logps/chosen": -0.8386504054069519, "logps/rejected": -1.499929666519165, "loss": 0.905, "odds_ratio_loss": 0.663795530796051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08386505395174026, "rewards/margins": 0.06612792611122131, "rewards/rejected": -0.14999297261238098, "sft_loss": 0.8386504054069519, "step": 4165 }, { "epoch": 0.32, "grad_norm": 6.166081428527832, "learning_rate": 7.691184724434613e-06, "logits/chosen": -1.2340881824493408, "logits/rejected": -0.5184012651443481, "logps/chosen": -0.8715030550956726, "logps/rejected": -1.367983102798462, "loss": 0.9184, "odds_ratio_loss": 0.46853357553482056, "rewards/accuracies": 1.0, "rewards/chosen": -0.08715032041072845, "rewards/margins": 0.049647994339466095, "rewards/rejected": -0.13679829239845276, "sft_loss": 0.8715030550956726, "step": 4170 }, { "epoch": 0.32, "grad_norm": 10.00060749053955, "learning_rate": 7.685993123894008e-06, "logits/chosen": -1.2512257099151611, "logits/rejected": -0.9624239206314087, "logps/chosen": -1.1072269678115845, "logps/rejected": -2.8060803413391113, "loss": 1.1424, "odds_ratio_loss": 0.3513997197151184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11072269827127457, "rewards/margins": 0.16988535225391388, "rewards/rejected": -0.28060805797576904, "sft_loss": 1.1072269678115845, "step": 4175 }, { "epoch": 0.33, "grad_norm": 7.813191890716553, "learning_rate": 7.680797449703808e-06, "logits/chosen": -1.23801589012146, "logits/rejected": -1.0573147535324097, "logps/chosen": -1.1597460508346558, "logps/rejected": -2.114023208618164, "loss": 1.1948, "odds_ratio_loss": 0.3502116799354553, "rewards/accuracies": 1.0, "rewards/chosen": -0.11597461998462677, "rewards/margins": 0.09542771428823471, "rewards/rejected": -0.2114022970199585, "sft_loss": 1.1597460508346558, "step": 4180 }, { "epoch": 0.33, "grad_norm": 65.03484344482422, "learning_rate": 7.675597709743906e-06, "logits/chosen": -1.2186355590820312, "logits/rejected": -1.2111581563949585, "logps/chosen": -0.8498256802558899, "logps/rejected": -4.716329097747803, "loss": 0.8842, "odds_ratio_loss": 0.343315064907074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08498255908489227, "rewards/margins": 0.38665035367012024, "rewards/rejected": -0.4716328978538513, "sft_loss": 0.8498256802558899, "step": 4185 }, { "epoch": 0.33, "grad_norm": 5.66574239730835, "learning_rate": 7.67039391190037e-06, "logits/chosen": -1.1516528129577637, "logits/rejected": -1.02261483669281, "logps/chosen": -1.4871667623519897, "logps/rejected": -4.307422161102295, "loss": 1.5184, "odds_ratio_loss": 0.31264442205429077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14871668815612793, "rewards/margins": 0.2820255160331726, "rewards/rejected": -0.43074217438697815, "sft_loss": 1.4871667623519897, "step": 4190 }, { "epoch": 0.33, "grad_norm": 7.576369762420654, "learning_rate": 7.665186064065419e-06, "logits/chosen": -1.2199019193649292, "logits/rejected": -0.5204739570617676, "logps/chosen": -1.1422882080078125, "logps/rejected": -3.815093994140625, "loss": 1.1608, "odds_ratio_loss": 0.18513749539852142, "rewards/accuracies": 1.0, "rewards/chosen": -0.11422882974147797, "rewards/margins": 0.26728057861328125, "rewards/rejected": -0.381509393453598, "sft_loss": 1.1422882080078125, "step": 4195 }, { "epoch": 0.33, "grad_norm": 51.00697326660156, "learning_rate": 7.659974174137418e-06, "logits/chosen": -1.257304310798645, "logits/rejected": -1.0916509628295898, "logps/chosen": -1.3007371425628662, "logps/rejected": -2.1112220287323, "loss": 1.3443, "odds_ratio_loss": 0.4351831376552582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13007371127605438, "rewards/margins": 0.08104848861694336, "rewards/rejected": -0.21112219989299774, "sft_loss": 1.3007371425628662, "step": 4200 }, { "epoch": 0.33, "grad_norm": 19.428726196289062, "learning_rate": 7.654758250020858e-06, "logits/chosen": -1.285024642944336, "logits/rejected": -1.0297925472259521, "logps/chosen": -0.8580648303031921, "logps/rejected": -3.976184129714966, "loss": 0.8923, "odds_ratio_loss": 0.3418591618537903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08580649644136429, "rewards/margins": 0.3118119239807129, "rewards/rejected": -0.3976184129714966, "sft_loss": 0.8580648303031921, "step": 4205 }, { "epoch": 0.33, "grad_norm": 15.734051704406738, "learning_rate": 7.64953829962635e-06, "logits/chosen": -1.305315375328064, "logits/rejected": -0.7829909324645996, "logps/chosen": -1.1611686944961548, "logps/rejected": -7.541050910949707, "loss": 1.1728, "odds_ratio_loss": 0.11647912114858627, "rewards/accuracies": 1.0, "rewards/chosen": -0.11611686646938324, "rewards/margins": 0.6379882097244263, "rewards/rejected": -0.7541050314903259, "sft_loss": 1.1611686944961548, "step": 4210 }, { "epoch": 0.33, "grad_norm": 9.167802810668945, "learning_rate": 7.644314330870614e-06, "logits/chosen": -1.268035650253296, "logits/rejected": -1.1111148595809937, "logps/chosen": -1.1187152862548828, "logps/rejected": -5.738256454467773, "loss": 1.1216, "odds_ratio_loss": 0.028980012983083725, "rewards/accuracies": 1.0, "rewards/chosen": -0.11187154054641724, "rewards/margins": 0.4619540572166443, "rewards/rejected": -0.5738255977630615, "sft_loss": 1.1187152862548828, "step": 4215 }, { "epoch": 0.33, "grad_norm": 18.73733139038086, "learning_rate": 7.63908635167646e-06, "logits/chosen": -1.3024123907089233, "logits/rejected": -0.8621677160263062, "logps/chosen": -1.0445382595062256, "logps/rejected": -2.0720951557159424, "loss": 1.0909, "odds_ratio_loss": 0.4636809229850769, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10445381700992584, "rewards/margins": 0.10275568813085556, "rewards/rejected": -0.207209512591362, "sft_loss": 1.0445382595062256, "step": 4220 }, { "epoch": 0.33, "grad_norm": 7.220019817352295, "learning_rate": 7.633854369972779e-06, "logits/chosen": -1.1436474323272705, "logits/rejected": -1.0087378025054932, "logps/chosen": -0.9033777117729187, "logps/rejected": -2.1096222400665283, "loss": 0.9352, "odds_ratio_loss": 0.31843090057373047, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09033779054880142, "rewards/margins": 0.12062442302703857, "rewards/rejected": -0.2109622210264206, "sft_loss": 0.9033777117729187, "step": 4225 }, { "epoch": 0.33, "grad_norm": 7.440292835235596, "learning_rate": 7.628618393694543e-06, "logits/chosen": -1.2576261758804321, "logits/rejected": -1.3421485424041748, "logps/chosen": -0.9210315942764282, "logps/rejected": -1.9578853845596313, "loss": 0.9629, "odds_ratio_loss": 0.4189070761203766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09210315346717834, "rewards/margins": 0.1036853939294815, "rewards/rejected": -0.19578854739665985, "sft_loss": 0.9210315942764282, "step": 4230 }, { "epoch": 0.33, "grad_norm": 10.724273681640625, "learning_rate": 7.623378430782768e-06, "logits/chosen": -1.3568766117095947, "logits/rejected": -1.1589086055755615, "logps/chosen": -0.8155969381332397, "logps/rejected": -3.8145556449890137, "loss": 0.8491, "odds_ratio_loss": 0.3349160850048065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0815596953034401, "rewards/margins": 0.2998958230018616, "rewards/rejected": -0.38145551085472107, "sft_loss": 0.8155969381332397, "step": 4235 }, { "epoch": 0.33, "grad_norm": 10.178549766540527, "learning_rate": 7.618134489184527e-06, "logits/chosen": -1.283182978630066, "logits/rejected": -1.1112940311431885, "logps/chosen": -1.0650850534439087, "logps/rejected": -1.8539899587631226, "loss": 1.0975, "odds_ratio_loss": 0.32411864399909973, "rewards/accuracies": 1.0, "rewards/chosen": -0.10650850832462311, "rewards/margins": 0.07889048755168915, "rewards/rejected": -0.18539901077747345, "sft_loss": 1.0650850534439087, "step": 4240 }, { "epoch": 0.33, "grad_norm": 7.90317440032959, "learning_rate": 7.612886576852921e-06, "logits/chosen": -1.2977828979492188, "logits/rejected": -1.011577844619751, "logps/chosen": -1.0474843978881836, "logps/rejected": -1.0922882556915283, "loss": 1.1261, "odds_ratio_loss": 0.7860459089279175, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.1047484427690506, "rewards/margins": 0.004480388946831226, "rewards/rejected": -0.10922882705926895, "sft_loss": 1.0474843978881836, "step": 4245 }, { "epoch": 0.33, "grad_norm": 14.927838325500488, "learning_rate": 7.607634701747076e-06, "logits/chosen": -1.3175591230392456, "logits/rejected": -1.2058073282241821, "logps/chosen": -1.086005687713623, "logps/rejected": -6.419415473937988, "loss": 1.0937, "odds_ratio_loss": 0.07735596597194672, "rewards/accuracies": 1.0, "rewards/chosen": -0.10860057175159454, "rewards/margins": 0.5333409309387207, "rewards/rejected": -0.6419415473937988, "sft_loss": 1.086005687713623, "step": 4250 }, { "epoch": 0.33, "grad_norm": 9.913247108459473, "learning_rate": 7.602378871832126e-06, "logits/chosen": -1.297736406326294, "logits/rejected": -0.5890460014343262, "logps/chosen": -0.9432314038276672, "logps/rejected": -4.475759029388428, "loss": 0.9813, "odds_ratio_loss": 0.3807455599308014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09432314336299896, "rewards/margins": 0.3532527983188629, "rewards/rejected": -0.44757595658302307, "sft_loss": 0.9432314038276672, "step": 4255 }, { "epoch": 0.33, "grad_norm": 5.066327095031738, "learning_rate": 7.597119095079209e-06, "logits/chosen": -1.2584011554718018, "logits/rejected": -0.6905049681663513, "logps/chosen": -1.0048248767852783, "logps/rejected": -1.8865995407104492, "loss": 1.0372, "odds_ratio_loss": 0.323311984539032, "rewards/accuracies": 1.0, "rewards/chosen": -0.1004825010895729, "rewards/margins": 0.08817745000123978, "rewards/rejected": -0.1886599361896515, "sft_loss": 1.0048248767852783, "step": 4260 }, { "epoch": 0.33, "grad_norm": 35.4716796875, "learning_rate": 7.5918553794654405e-06, "logits/chosen": -1.2576793432235718, "logits/rejected": -1.0605818033218384, "logps/chosen": -1.1159580945968628, "logps/rejected": -1.926154375076294, "loss": 1.153, "odds_ratio_loss": 0.37044578790664673, "rewards/accuracies": 1.0, "rewards/chosen": -0.11159580945968628, "rewards/margins": 0.08101961761713028, "rewards/rejected": -0.19261543452739716, "sft_loss": 1.1159580945968628, "step": 4265 }, { "epoch": 0.33, "grad_norm": 6.506315231323242, "learning_rate": 7.586587732973914e-06, "logits/chosen": -1.404921293258667, "logits/rejected": -0.6291941404342651, "logps/chosen": -0.815272331237793, "logps/rejected": -2.1161012649536133, "loss": 0.8365, "odds_ratio_loss": 0.21181544661521912, "rewards/accuracies": 1.0, "rewards/chosen": -0.0815272331237793, "rewards/margins": 0.13008292019367218, "rewards/rejected": -0.21161015331745148, "sft_loss": 0.815272331237793, "step": 4270 }, { "epoch": 0.33, "grad_norm": 5.738877296447754, "learning_rate": 7.581316163593684e-06, "logits/chosen": -1.2702836990356445, "logits/rejected": -0.9186004400253296, "logps/chosen": -0.8667852282524109, "logps/rejected": -2.548339605331421, "loss": 0.8981, "odds_ratio_loss": 0.3134905695915222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08667852729558945, "rewards/margins": 0.1681554615497589, "rewards/rejected": -0.25483399629592896, "sft_loss": 0.8667852282524109, "step": 4275 }, { "epoch": 0.33, "grad_norm": 64.36282348632812, "learning_rate": 7.576040679319755e-06, "logits/chosen": -1.251603126525879, "logits/rejected": -0.6915744543075562, "logps/chosen": -0.9854947328567505, "logps/rejected": -5.712364673614502, "loss": 0.9989, "odds_ratio_loss": 0.1339355707168579, "rewards/accuracies": 1.0, "rewards/chosen": -0.09854947775602341, "rewards/margins": 0.4726869463920593, "rewards/rejected": -0.5712364912033081, "sft_loss": 0.9854947328567505, "step": 4280 }, { "epoch": 0.33, "grad_norm": 145.11282348632812, "learning_rate": 7.570761288153069e-06, "logits/chosen": -1.1111905574798584, "logits/rejected": -0.8975087404251099, "logps/chosen": -0.9915755987167358, "logps/rejected": -1.3429906368255615, "loss": 1.0428, "odds_ratio_loss": 0.5118028521537781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09915756434202194, "rewards/margins": 0.035141509026288986, "rewards/rejected": -0.13429906964302063, "sft_loss": 0.9915755987167358, "step": 4285 }, { "epoch": 0.33, "grad_norm": 6.5045084953308105, "learning_rate": 7.565477998100494e-06, "logits/chosen": -1.4710153341293335, "logits/rejected": -0.905910849571228, "logps/chosen": -0.8476712107658386, "logps/rejected": -2.0717978477478027, "loss": 0.873, "odds_ratio_loss": 0.2529553472995758, "rewards/accuracies": 1.0, "rewards/chosen": -0.08476711809635162, "rewards/margins": 0.12241265922784805, "rewards/rejected": -0.20717978477478027, "sft_loss": 0.8476712107658386, "step": 4290 }, { "epoch": 0.33, "grad_norm": 8.989347457885742, "learning_rate": 7.560190817174808e-06, "logits/chosen": -1.3830236196517944, "logits/rejected": -0.9063690900802612, "logps/chosen": -0.9198230504989624, "logps/rejected": -2.5667905807495117, "loss": 0.9511, "odds_ratio_loss": 0.31292420625686646, "rewards/accuracies": 1.0, "rewards/chosen": -0.09198231250047684, "rewards/margins": 0.16469675302505493, "rewards/rejected": -0.25667905807495117, "sft_loss": 0.9198230504989624, "step": 4295 }, { "epoch": 0.33, "grad_norm": 4.432363510131836, "learning_rate": 7.554899753394696e-06, "logits/chosen": -1.2827117443084717, "logits/rejected": -0.44630032777786255, "logps/chosen": -0.8856255412101746, "logps/rejected": -3.802992582321167, "loss": 0.9092, "odds_ratio_loss": 0.23532943427562714, "rewards/accuracies": 1.0, "rewards/chosen": -0.08856256306171417, "rewards/margins": 0.2917366921901703, "rewards/rejected": -0.38029927015304565, "sft_loss": 0.8856255412101746, "step": 4300 }, { "epoch": 0.33, "grad_norm": 8.772881507873535, "learning_rate": 7.549604814784721e-06, "logits/chosen": -1.3126475811004639, "logits/rejected": -0.811437726020813, "logps/chosen": -0.6851466298103333, "logps/rejected": -1.9268734455108643, "loss": 0.7091, "odds_ratio_loss": 0.2396661341190338, "rewards/accuracies": 1.0, "rewards/chosen": -0.06851466745138168, "rewards/margins": 0.12417266517877579, "rewards/rejected": -0.19268734753131866, "sft_loss": 0.6851466298103333, "step": 4305 }, { "epoch": 0.34, "grad_norm": 6.346287250518799, "learning_rate": 7.544306009375335e-06, "logits/chosen": -1.3210334777832031, "logits/rejected": -0.9792496562004089, "logps/chosen": -0.7893930077552795, "logps/rejected": -1.422093152999878, "loss": 0.8408, "odds_ratio_loss": 0.5139185190200806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07893930375576019, "rewards/margins": 0.06327001750469208, "rewards/rejected": -0.14220932126045227, "sft_loss": 0.7893930077552795, "step": 4310 }, { "epoch": 0.34, "grad_norm": 37.05595779418945, "learning_rate": 7.53900334520285e-06, "logits/chosen": -1.3776777982711792, "logits/rejected": -1.3684583902359009, "logps/chosen": -1.2751832008361816, "logps/rejected": -2.9548025131225586, "loss": 1.2969, "odds_ratio_loss": 0.2176557332277298, "rewards/accuracies": 1.0, "rewards/chosen": -0.12751832604408264, "rewards/margins": 0.16796192526817322, "rewards/rejected": -0.29548028111457825, "sft_loss": 1.2751832008361816, "step": 4315 }, { "epoch": 0.34, "grad_norm": 20.400171279907227, "learning_rate": 7.533696830309427e-06, "logits/chosen": -1.42886221408844, "logits/rejected": -1.1865837574005127, "logps/chosen": -0.9903708696365356, "logps/rejected": -1.2984185218811035, "loss": 1.0777, "odds_ratio_loss": 0.87353515625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09903708845376968, "rewards/margins": 0.030804771929979324, "rewards/rejected": -0.1298418492078781, "sft_loss": 0.9903708696365356, "step": 4320 }, { "epoch": 0.34, "grad_norm": 16.031402587890625, "learning_rate": 7.52838647274307e-06, "logits/chosen": -1.289991855621338, "logits/rejected": -0.9572579264640808, "logps/chosen": -1.0513736009597778, "logps/rejected": -1.9264914989471436, "loss": 1.0815, "odds_ratio_loss": 0.30108073353767395, "rewards/accuracies": 1.0, "rewards/chosen": -0.10513736307621002, "rewards/margins": 0.08751179277896881, "rewards/rejected": -0.19264915585517883, "sft_loss": 1.0513736009597778, "step": 4325 }, { "epoch": 0.34, "grad_norm": 8.47230339050293, "learning_rate": 7.5230722805576105e-06, "logits/chosen": -1.1652015447616577, "logits/rejected": -1.0036388635635376, "logps/chosen": -1.0877656936645508, "logps/rejected": -1.7497972249984741, "loss": 1.1352, "odds_ratio_loss": 0.4747348725795746, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10877655446529388, "rewards/margins": 0.06620316207408905, "rewards/rejected": -0.17497971653938293, "sft_loss": 1.0877656936645508, "step": 4330 }, { "epoch": 0.34, "grad_norm": 28.46778678894043, "learning_rate": 7.517754261812695e-06, "logits/chosen": -1.2815945148468018, "logits/rejected": -0.7410744428634644, "logps/chosen": -1.0584745407104492, "logps/rejected": -2.4538943767547607, "loss": 1.1028, "odds_ratio_loss": 0.44320353865623474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10584745556116104, "rewards/margins": 0.13954199850559235, "rewards/rejected": -0.24538946151733398, "sft_loss": 1.0584745407104492, "step": 4335 }, { "epoch": 0.34, "grad_norm": 7.073278427124023, "learning_rate": 7.512432424573777e-06, "logits/chosen": -1.2740113735198975, "logits/rejected": -0.702880859375, "logps/chosen": -1.0853495597839355, "logps/rejected": -2.0686047077178955, "loss": 1.1248, "odds_ratio_loss": 0.3948959708213806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10853495448827744, "rewards/margins": 0.09832551330327988, "rewards/rejected": -0.2068604677915573, "sft_loss": 1.0853495597839355, "step": 4340 }, { "epoch": 0.34, "grad_norm": 24.61713218688965, "learning_rate": 7.507106776912094e-06, "logits/chosen": -1.336874008178711, "logits/rejected": -0.7619214057922363, "logps/chosen": -0.7855367064476013, "logps/rejected": -2.529465913772583, "loss": 0.8186, "odds_ratio_loss": 0.3307104706764221, "rewards/accuracies": 1.0, "rewards/chosen": -0.07855366915464401, "rewards/margins": 0.1743929386138916, "rewards/rejected": -0.2529466152191162, "sft_loss": 0.7855367064476013, "step": 4345 }, { "epoch": 0.34, "grad_norm": 8.270886421203613, "learning_rate": 7.501777326904671e-06, "logits/chosen": -1.267987847328186, "logits/rejected": -0.8105506896972656, "logps/chosen": -1.2178547382354736, "logps/rejected": -2.15023136138916, "loss": 1.2791, "odds_ratio_loss": 0.6123980283737183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1217854842543602, "rewards/margins": 0.09323765337467194, "rewards/rejected": -0.21502313017845154, "sft_loss": 1.2178547382354736, "step": 4350 }, { "epoch": 0.34, "grad_norm": 15.25228214263916, "learning_rate": 7.4964440826342925e-06, "logits/chosen": -1.4595444202423096, "logits/rejected": -1.180437445640564, "logps/chosen": -1.1525750160217285, "logps/rejected": -2.413696765899658, "loss": 1.1908, "odds_ratio_loss": 0.382191002368927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11525748670101166, "rewards/margins": 0.1261121779680252, "rewards/rejected": -0.24136967957019806, "sft_loss": 1.1525750160217285, "step": 4355 }, { "epoch": 0.34, "grad_norm": 37.01783752441406, "learning_rate": 7.4911070521895015e-06, "logits/chosen": -1.1667481660842896, "logits/rejected": -1.1027100086212158, "logps/chosen": -1.0879366397857666, "logps/rejected": -1.8874828815460205, "loss": 1.1492, "odds_ratio_loss": 0.6123815774917603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10879365354776382, "rewards/margins": 0.0799546092748642, "rewards/rejected": -0.18874827027320862, "sft_loss": 1.0879366397857666, "step": 4360 }, { "epoch": 0.34, "grad_norm": 6.166680335998535, "learning_rate": 7.485766243664583e-06, "logits/chosen": -1.2937138080596924, "logits/rejected": -0.8818578720092773, "logps/chosen": -0.8433329463005066, "logps/rejected": -10.050994873046875, "loss": 0.8687, "odds_ratio_loss": 0.2538653016090393, "rewards/accuracies": 1.0, "rewards/chosen": -0.08433329313993454, "rewards/margins": 0.9207661747932434, "rewards/rejected": -1.0050995349884033, "sft_loss": 0.8433329463005066, "step": 4365 }, { "epoch": 0.34, "grad_norm": 93.40538024902344, "learning_rate": 7.480421665159551e-06, "logits/chosen": -1.0598971843719482, "logits/rejected": -1.1688032150268555, "logps/chosen": -1.1783250570297241, "logps/rejected": -2.901451349258423, "loss": 1.2068, "odds_ratio_loss": 0.2851187586784363, "rewards/accuracies": 1.0, "rewards/chosen": -0.11783250421285629, "rewards/margins": 0.1723126471042633, "rewards/rejected": -0.2901450991630554, "sft_loss": 1.1783250570297241, "step": 4370 }, { "epoch": 0.34, "grad_norm": 4.833317756652832, "learning_rate": 7.475073324780138e-06, "logits/chosen": -1.4235963821411133, "logits/rejected": -0.9377709627151489, "logps/chosen": -1.1065336465835571, "logps/rejected": -2.8160653114318848, "loss": 1.1591, "odds_ratio_loss": 0.5254218578338623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1106533631682396, "rewards/margins": 0.17095312476158142, "rewards/rejected": -0.2816064953804016, "sft_loss": 1.1065336465835571, "step": 4375 }, { "epoch": 0.34, "grad_norm": 16.7370662689209, "learning_rate": 7.4697212306377785e-06, "logits/chosen": -1.1766154766082764, "logits/rejected": -1.238294243812561, "logps/chosen": -0.9507826566696167, "logps/rejected": -1.9046952724456787, "loss": 0.9904, "odds_ratio_loss": 0.396290123462677, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09507826715707779, "rewards/margins": 0.09539126604795456, "rewards/rejected": -0.19046953320503235, "sft_loss": 0.9507826566696167, "step": 4380 }, { "epoch": 0.34, "grad_norm": 10.103072166442871, "learning_rate": 7.464365390849606e-06, "logits/chosen": -1.1430937051773071, "logits/rejected": -0.7908920049667358, "logps/chosen": -1.3288061618804932, "logps/rejected": -6.744845390319824, "loss": 1.3698, "odds_ratio_loss": 0.40968722105026245, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13288059830665588, "rewards/margins": 0.5416039228439331, "rewards/rejected": -0.6744846105575562, "sft_loss": 1.3288061618804932, "step": 4385 }, { "epoch": 0.34, "grad_norm": 9.449183464050293, "learning_rate": 7.45900581353843e-06, "logits/chosen": -1.0516716241836548, "logits/rejected": -1.0172231197357178, "logps/chosen": -1.2207443714141846, "logps/rejected": -1.3754041194915771, "loss": 1.3082, "odds_ratio_loss": 0.8747666478157043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1220744401216507, "rewards/margins": 0.015465967357158661, "rewards/rejected": -0.13754041492938995, "sft_loss": 1.2207443714141846, "step": 4390 }, { "epoch": 0.34, "grad_norm": 110.34417724609375, "learning_rate": 7.45364250683273e-06, "logits/chosen": -1.2138774394989014, "logits/rejected": -1.1199032068252563, "logps/chosen": -1.302486538887024, "logps/rejected": -2.1626267433166504, "loss": 1.3895, "odds_ratio_loss": 0.8700674772262573, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.13024863600730896, "rewards/margins": 0.086014024913311, "rewards/rejected": -0.21626269817352295, "sft_loss": 1.302486538887024, "step": 4395 }, { "epoch": 0.34, "grad_norm": 5.204502582550049, "learning_rate": 7.448275478866642e-06, "logits/chosen": -1.2459137439727783, "logits/rejected": -0.7799798846244812, "logps/chosen": -1.1113945245742798, "logps/rejected": -2.155890703201294, "loss": 1.1465, "odds_ratio_loss": 0.35096412897109985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1111394390463829, "rewards/margins": 0.10444964468479156, "rewards/rejected": -0.21558907628059387, "sft_loss": 1.1113945245742798, "step": 4400 }, { "epoch": 0.34, "grad_norm": 17.26323127746582, "learning_rate": 7.4429047377799455e-06, "logits/chosen": -1.2751134634017944, "logits/rejected": -0.9901043772697449, "logps/chosen": -1.141135573387146, "logps/rejected": -1.833753228187561, "loss": 1.1797, "odds_ratio_loss": 0.3855925500392914, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11411355435848236, "rewards/margins": 0.06926177442073822, "rewards/rejected": -0.18337532877922058, "sft_loss": 1.141135573387146, "step": 4405 }, { "epoch": 0.34, "grad_norm": 8.824706077575684, "learning_rate": 7.437530291718051e-06, "logits/chosen": -1.2793327569961548, "logits/rejected": -0.8037842512130737, "logps/chosen": -0.9298388361930847, "logps/rejected": -4.0271196365356445, "loss": 0.9434, "odds_ratio_loss": 0.13532523810863495, "rewards/accuracies": 1.0, "rewards/chosen": -0.09298388659954071, "rewards/margins": 0.30972808599472046, "rewards/rejected": -0.40271201729774475, "sft_loss": 0.9298388361930847, "step": 4410 }, { "epoch": 0.34, "grad_norm": 6.073675632476807, "learning_rate": 7.432152148831988e-06, "logits/chosen": -1.2977871894836426, "logits/rejected": -0.4883858561515808, "logps/chosen": -0.9581543207168579, "logps/rejected": -2.2285006046295166, "loss": 0.9896, "odds_ratio_loss": 0.31483057141304016, "rewards/accuracies": 1.0, "rewards/chosen": -0.09581543505191803, "rewards/margins": 0.12703463435173035, "rewards/rejected": -0.22285005450248718, "sft_loss": 0.9581543207168579, "step": 4415 }, { "epoch": 0.34, "grad_norm": 31.437864303588867, "learning_rate": 7.426770317278392e-06, "logits/chosen": -1.2266685962677002, "logits/rejected": -0.8512941598892212, "logps/chosen": -1.4204190969467163, "logps/rejected": -3.2539901733398438, "loss": 1.4582, "odds_ratio_loss": 0.37796148657798767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14204192161560059, "rewards/margins": 0.1833570897579193, "rewards/rejected": -0.3253989815711975, "sft_loss": 1.4204190969467163, "step": 4420 }, { "epoch": 0.34, "grad_norm": 4.3960676193237305, "learning_rate": 7.4213848052194955e-06, "logits/chosen": -0.9935970306396484, "logits/rejected": -0.7201655507087708, "logps/chosen": -0.8070360422134399, "logps/rejected": -2.0757639408111572, "loss": 0.8231, "odds_ratio_loss": 0.16081495583057404, "rewards/accuracies": 1.0, "rewards/chosen": -0.08070359379053116, "rewards/margins": 0.12687279284000397, "rewards/rejected": -0.20757639408111572, "sft_loss": 0.8070360422134399, "step": 4425 }, { "epoch": 0.34, "grad_norm": 9.1425199508667, "learning_rate": 7.415995620823113e-06, "logits/chosen": -1.2669662237167358, "logits/rejected": -0.7052954435348511, "logps/chosen": -0.99391108751297, "logps/rejected": -4.0261054039001465, "loss": 1.0137, "odds_ratio_loss": 0.1983099728822708, "rewards/accuracies": 1.0, "rewards/chosen": -0.09939111024141312, "rewards/margins": 0.3032194972038269, "rewards/rejected": -0.40261054039001465, "sft_loss": 0.99391108751297, "step": 4430 }, { "epoch": 0.35, "grad_norm": 5.359577178955078, "learning_rate": 7.410602772262623e-06, "logits/chosen": -1.266448974609375, "logits/rejected": -0.5301991701126099, "logps/chosen": -1.100752830505371, "logps/rejected": -2.557126045227051, "loss": 1.137, "odds_ratio_loss": 0.3625568747520447, "rewards/accuracies": 1.0, "rewards/chosen": -0.11007529497146606, "rewards/margins": 0.14563728868961334, "rewards/rejected": -0.2557125687599182, "sft_loss": 1.100752830505371, "step": 4435 }, { "epoch": 0.35, "grad_norm": 6.489447593688965, "learning_rate": 7.4052062677169675e-06, "logits/chosen": -1.280884027481079, "logits/rejected": -0.6861027479171753, "logps/chosen": -0.9087278246879578, "logps/rejected": -4.286924362182617, "loss": 0.9492, "odds_ratio_loss": 0.4042681157588959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09087278693914413, "rewards/margins": 0.3378196358680725, "rewards/rejected": -0.42869243025779724, "sft_loss": 0.9087278246879578, "step": 4440 }, { "epoch": 0.35, "grad_norm": 5.1577863693237305, "learning_rate": 7.399806115370629e-06, "logits/chosen": -1.3704392910003662, "logits/rejected": -0.6230510473251343, "logps/chosen": -0.894112765789032, "logps/rejected": -2.8345396518707275, "loss": 0.9103, "odds_ratio_loss": 0.16220757365226746, "rewards/accuracies": 1.0, "rewards/chosen": -0.08941127359867096, "rewards/margins": 0.19404269754886627, "rewards/rejected": -0.2834540009498596, "sft_loss": 0.894112765789032, "step": 4445 }, { "epoch": 0.35, "grad_norm": 10.279149055480957, "learning_rate": 7.394402323413626e-06, "logits/chosen": -1.4479832649230957, "logits/rejected": -0.8625136613845825, "logps/chosen": -1.0798912048339844, "logps/rejected": -1.4926217794418335, "loss": 1.1357, "odds_ratio_loss": 0.5580379366874695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1079891175031662, "rewards/margins": 0.041273050010204315, "rewards/rejected": -0.14926216006278992, "sft_loss": 1.0798912048339844, "step": 4450 }, { "epoch": 0.35, "grad_norm": 101.64732360839844, "learning_rate": 7.388994900041495e-06, "logits/chosen": -1.2341253757476807, "logits/rejected": -1.146302580833435, "logps/chosen": -0.8642631769180298, "logps/rejected": -3.996840238571167, "loss": 0.8875, "odds_ratio_loss": 0.23205271363258362, "rewards/accuracies": 1.0, "rewards/chosen": -0.08642631024122238, "rewards/margins": 0.31325775384902954, "rewards/rejected": -0.39968404173851013, "sft_loss": 0.8642631769180298, "step": 4455 }, { "epoch": 0.35, "grad_norm": 16.124719619750977, "learning_rate": 7.383583853455278e-06, "logits/chosen": -1.4076511859893799, "logits/rejected": -1.1392637491226196, "logps/chosen": -0.960048496723175, "logps/rejected": -3.5500710010528564, "loss": 0.9826, "odds_ratio_loss": 0.22579637169837952, "rewards/accuracies": 1.0, "rewards/chosen": -0.09600485861301422, "rewards/margins": 0.2590022683143616, "rewards/rejected": -0.3550071120262146, "sft_loss": 0.960048496723175, "step": 4460 }, { "epoch": 0.35, "grad_norm": 10.780272483825684, "learning_rate": 7.378169191861517e-06, "logits/chosen": -1.3396289348602295, "logits/rejected": -0.7209702134132385, "logps/chosen": -1.2006075382232666, "logps/rejected": -2.211906909942627, "loss": 1.2583, "odds_ratio_loss": 0.5772491693496704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1200607642531395, "rewards/margins": 0.101129911839962, "rewards/rejected": -0.2211906909942627, "sft_loss": 1.2006075382232666, "step": 4465 }, { "epoch": 0.35, "grad_norm": 43.170448303222656, "learning_rate": 7.372750923472232e-06, "logits/chosen": -1.3405921459197998, "logits/rejected": -0.9898471832275391, "logps/chosen": -1.0272200107574463, "logps/rejected": -3.1108803749084473, "loss": 1.0559, "odds_ratio_loss": 0.2867039740085602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10272200405597687, "rewards/margins": 0.20836606621742249, "rewards/rejected": -0.31108805537223816, "sft_loss": 1.0272200107574463, "step": 4470 }, { "epoch": 0.35, "grad_norm": 6.963412761688232, "learning_rate": 7.367329056504915e-06, "logits/chosen": -1.2340214252471924, "logits/rejected": -0.9777275323867798, "logps/chosen": -1.541316270828247, "logps/rejected": -1.94304621219635, "loss": 1.618, "odds_ratio_loss": 0.7671359777450562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15413162112236023, "rewards/margins": 0.04017297178506851, "rewards/rejected": -0.19430460035800934, "sft_loss": 1.541316270828247, "step": 4475 }, { "epoch": 0.35, "grad_norm": 5.4368896484375, "learning_rate": 7.361903599182516e-06, "logits/chosen": -1.361820936203003, "logits/rejected": -0.7715870141983032, "logps/chosen": -0.9218884706497192, "logps/rejected": -3.4433655738830566, "loss": 0.9409, "odds_ratio_loss": 0.1898377537727356, "rewards/accuracies": 1.0, "rewards/chosen": -0.09218885004520416, "rewards/margins": 0.2521476745605469, "rewards/rejected": -0.34433650970458984, "sft_loss": 0.9218884706497192, "step": 4480 }, { "epoch": 0.35, "grad_norm": 171.33653259277344, "learning_rate": 7.35647455973343e-06, "logits/chosen": -1.2617411613464355, "logits/rejected": -1.2783212661743164, "logps/chosen": -1.4098418951034546, "logps/rejected": -1.3931834697723389, "loss": 1.4959, "odds_ratio_loss": 0.8600964546203613, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1409842073917389, "rewards/margins": -0.0016658693784847856, "rewards/rejected": -0.1393183171749115, "sft_loss": 1.4098418951034546, "step": 4485 }, { "epoch": 0.35, "grad_norm": 13.803630828857422, "learning_rate": 7.351041946391485e-06, "logits/chosen": -1.4524598121643066, "logits/rejected": -0.7367189526557922, "logps/chosen": -0.9599382281303406, "logps/rejected": -1.7124487161636353, "loss": 1.0182, "odds_ratio_loss": 0.5826715230941772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09599383175373077, "rewards/margins": 0.07525105774402618, "rewards/rejected": -0.17124485969543457, "sft_loss": 0.9599382281303406, "step": 4490 }, { "epoch": 0.35, "grad_norm": 6.795773029327393, "learning_rate": 7.345605767395929e-06, "logits/chosen": -1.3077876567840576, "logits/rejected": -0.9576922655105591, "logps/chosen": -0.985217273235321, "logps/rejected": -1.5327274799346924, "loss": 1.0325, "odds_ratio_loss": 0.4725615084171295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09852172434329987, "rewards/margins": 0.05475100874900818, "rewards/rejected": -0.15327273309230804, "sft_loss": 0.985217273235321, "step": 4495 }, { "epoch": 0.35, "grad_norm": 6.262613773345947, "learning_rate": 7.340166030991416e-06, "logits/chosen": -1.1966087818145752, "logits/rejected": -0.7058561444282532, "logps/chosen": -1.084021806716919, "logps/rejected": -1.4973636865615845, "loss": 1.135, "odds_ratio_loss": 0.5095903873443604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10840219259262085, "rewards/margins": 0.041334182024002075, "rewards/rejected": -0.14973635971546173, "sft_loss": 1.084021806716919, "step": 4500 }, { "epoch": 0.35, "grad_norm": 8.660277366638184, "learning_rate": 7.334722745427998e-06, "logits/chosen": -1.308048963546753, "logits/rejected": -0.8562232255935669, "logps/chosen": -1.1090872287750244, "logps/rejected": -2.0640666484832764, "loss": 1.1438, "odds_ratio_loss": 0.3471030592918396, "rewards/accuracies": 1.0, "rewards/chosen": -0.11090872436761856, "rewards/margins": 0.0954979658126831, "rewards/rejected": -0.20640668272972107, "sft_loss": 1.1090872287750244, "step": 4505 }, { "epoch": 0.35, "grad_norm": 5.754500865936279, "learning_rate": 7.3292759189611075e-06, "logits/chosen": -1.0905884504318237, "logits/rejected": -0.6774468421936035, "logps/chosen": -0.8998796343803406, "logps/rejected": -3.464883804321289, "loss": 0.9201, "odds_ratio_loss": 0.2017499953508377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08998797088861465, "rewards/margins": 0.2565004229545593, "rewards/rejected": -0.3464883863925934, "sft_loss": 0.8998796343803406, "step": 4510 }, { "epoch": 0.35, "grad_norm": 5.588654041290283, "learning_rate": 7.3238255598515495e-06, "logits/chosen": -1.3673207759857178, "logits/rejected": -0.8214467763900757, "logps/chosen": -1.0710903406143188, "logps/rejected": -3.6640121936798096, "loss": 1.0837, "odds_ratio_loss": 0.12623175978660583, "rewards/accuracies": 1.0, "rewards/chosen": -0.10710903257131577, "rewards/margins": 0.25929221510887146, "rewards/rejected": -0.3664012551307678, "sft_loss": 1.0710903406143188, "step": 4515 }, { "epoch": 0.35, "grad_norm": 4.794607162475586, "learning_rate": 7.318371676365487e-06, "logits/chosen": -1.3031909465789795, "logits/rejected": -0.343318372964859, "logps/chosen": -0.9971429705619812, "logps/rejected": -9.828311920166016, "loss": 1.0109, "odds_ratio_loss": 0.13710226118564606, "rewards/accuracies": 1.0, "rewards/chosen": -0.09971430152654648, "rewards/margins": 0.8831169009208679, "rewards/rejected": -0.9828311800956726, "sft_loss": 0.9971429705619812, "step": 4520 }, { "epoch": 0.35, "grad_norm": 16.599647521972656, "learning_rate": 7.3129142767744266e-06, "logits/chosen": -1.424988031387329, "logits/rejected": -0.8827608227729797, "logps/chosen": -0.8135870099067688, "logps/rejected": -2.6405673027038574, "loss": 0.8426, "odds_ratio_loss": 0.29016706347465515, "rewards/accuracies": 1.0, "rewards/chosen": -0.08135870844125748, "rewards/margins": 0.18269802629947662, "rewards/rejected": -0.2640567421913147, "sft_loss": 0.8135870099067688, "step": 4525 }, { "epoch": 0.35, "grad_norm": 27.2139949798584, "learning_rate": 7.307453369355204e-06, "logits/chosen": -1.3583816289901733, "logits/rejected": -1.0602794885635376, "logps/chosen": -1.0894434452056885, "logps/rejected": -4.239919662475586, "loss": 1.1087, "odds_ratio_loss": 0.19260287284851074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10894433408975601, "rewards/margins": 0.31504765152931213, "rewards/rejected": -0.42399197816848755, "sft_loss": 1.0894434452056885, "step": 4530 }, { "epoch": 0.35, "grad_norm": 6.667520999908447, "learning_rate": 7.301988962389982e-06, "logits/chosen": -1.3098249435424805, "logits/rejected": -1.0471160411834717, "logps/chosen": -0.9988832473754883, "logps/rejected": -3.9677734375, "loss": 1.0147, "odds_ratio_loss": 0.1579330563545227, "rewards/accuracies": 1.0, "rewards/chosen": -0.09988833218812943, "rewards/margins": 0.296889066696167, "rewards/rejected": -0.3967773914337158, "sft_loss": 0.9988832473754883, "step": 4535 }, { "epoch": 0.35, "grad_norm": 4.398935317993164, "learning_rate": 7.2965210641662265e-06, "logits/chosen": -1.5169909000396729, "logits/rejected": -0.7749336361885071, "logps/chosen": -1.6878244876861572, "logps/rejected": -3.0113415718078613, "loss": 1.7534, "odds_ratio_loss": 0.6560280919075012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16878245770931244, "rewards/margins": 0.13235166668891907, "rewards/rejected": -0.3011341392993927, "sft_loss": 1.6878244876861572, "step": 4540 }, { "epoch": 0.35, "grad_norm": 7.342326641082764, "learning_rate": 7.2910496829767e-06, "logits/chosen": -1.2548385858535767, "logits/rejected": -1.3211010694503784, "logps/chosen": -0.7158026099205017, "logps/rejected": -2.429844617843628, "loss": 0.7398, "odds_ratio_loss": 0.240126371383667, "rewards/accuracies": 1.0, "rewards/chosen": -0.07158026844263077, "rewards/margins": 0.1714041829109192, "rewards/rejected": -0.24298445880413055, "sft_loss": 0.7158026099205017, "step": 4545 }, { "epoch": 0.35, "grad_norm": 7.4929728507995605, "learning_rate": 7.285574827119446e-06, "logits/chosen": -1.2970168590545654, "logits/rejected": -1.3728965520858765, "logps/chosen": -1.0939594507217407, "logps/rejected": -1.9339323043823242, "loss": 1.1435, "odds_ratio_loss": 0.4956664443016052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10939594358205795, "rewards/margins": 0.08399729430675507, "rewards/rejected": -0.19339323043823242, "sft_loss": 1.0939594507217407, "step": 4550 }, { "epoch": 0.35, "grad_norm": 7.341314792633057, "learning_rate": 7.280096504897778e-06, "logits/chosen": -1.408686876296997, "logits/rejected": -0.9580324292182922, "logps/chosen": -0.9844639897346497, "logps/rejected": -2.457996129989624, "loss": 1.008, "odds_ratio_loss": 0.23529568314552307, "rewards/accuracies": 1.0, "rewards/chosen": -0.09844639152288437, "rewards/margins": 0.14735323190689087, "rewards/rejected": -0.24579963088035583, "sft_loss": 0.9844639897346497, "step": 4555 }, { "epoch": 0.35, "grad_norm": 7.616289138793945, "learning_rate": 7.274614724620269e-06, "logits/chosen": -1.401800513267517, "logits/rejected": -0.6990433931350708, "logps/chosen": -1.2157708406448364, "logps/rejected": -2.5214405059814453, "loss": 1.2819, "odds_ratio_loss": 0.6614880561828613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12157706916332245, "rewards/margins": 0.13056698441505432, "rewards/rejected": -0.2521440386772156, "sft_loss": 1.2157708406448364, "step": 4560 }, { "epoch": 0.36, "grad_norm": 15.146883964538574, "learning_rate": 7.269129494600733e-06, "logits/chosen": -1.0967886447906494, "logits/rejected": -1.1191096305847168, "logps/chosen": -1.009189248085022, "logps/rejected": -1.9709656238555908, "loss": 1.0466, "odds_ratio_loss": 0.3742437958717346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10091892629861832, "rewards/margins": 0.09617763012647629, "rewards/rejected": -0.1970965564250946, "sft_loss": 1.009189248085022, "step": 4565 }, { "epoch": 0.36, "grad_norm": 34.097381591796875, "learning_rate": 7.2636408231582204e-06, "logits/chosen": -1.341217279434204, "logits/rejected": -0.9640189409255981, "logps/chosen": -0.8832473754882812, "logps/rejected": -2.830765962600708, "loss": 0.914, "odds_ratio_loss": 0.3079011142253876, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08832474052906036, "rewards/margins": 0.19475185871124268, "rewards/rejected": -0.28307658433914185, "sft_loss": 0.8832473754882812, "step": 4570 }, { "epoch": 0.36, "grad_norm": 6.27688455581665, "learning_rate": 7.258148718616994e-06, "logits/chosen": -1.4276528358459473, "logits/rejected": -0.7665327787399292, "logps/chosen": -0.7643810510635376, "logps/rejected": -8.876727104187012, "loss": 0.7689, "odds_ratio_loss": 0.045427560806274414, "rewards/accuracies": 1.0, "rewards/chosen": -0.07643811404705048, "rewards/margins": 0.8112346529960632, "rewards/rejected": -0.8876727819442749, "sft_loss": 0.7643810510635376, "step": 4575 }, { "epoch": 0.36, "grad_norm": 46.2465705871582, "learning_rate": 7.2526531893065314e-06, "logits/chosen": -1.293874979019165, "logits/rejected": -1.0485689640045166, "logps/chosen": -0.9439376592636108, "logps/rejected": -2.3564789295196533, "loss": 0.9699, "odds_ratio_loss": 0.25933530926704407, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0943937674164772, "rewards/margins": 0.14125414192676544, "rewards/rejected": -0.23564788699150085, "sft_loss": 0.9439376592636108, "step": 4580 }, { "epoch": 0.36, "grad_norm": 6.722151279449463, "learning_rate": 7.2471542435615e-06, "logits/chosen": -1.140699028968811, "logits/rejected": -0.7710426449775696, "logps/chosen": -0.9066115617752075, "logps/rejected": -2.0762696266174316, "loss": 0.9346, "odds_ratio_loss": 0.2797589898109436, "rewards/accuracies": 1.0, "rewards/chosen": -0.09066115319728851, "rewards/margins": 0.11696581542491913, "rewards/rejected": -0.20762696862220764, "sft_loss": 0.9066115617752075, "step": 4585 }, { "epoch": 0.36, "grad_norm": 84.71833038330078, "learning_rate": 7.241651889721746e-06, "logits/chosen": -1.4722684621810913, "logits/rejected": -1.06831955909729, "logps/chosen": -0.9850140810012817, "logps/rejected": -5.990573883056641, "loss": 1.0269, "odds_ratio_loss": 0.41919898986816406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09850140661001205, "rewards/margins": 0.5005560517311096, "rewards/rejected": -0.5990574359893799, "sft_loss": 0.9850140810012817, "step": 4590 }, { "epoch": 0.36, "grad_norm": 19.882492065429688, "learning_rate": 7.236146136132292e-06, "logits/chosen": -1.313164472579956, "logits/rejected": -0.8736522793769836, "logps/chosen": -1.2232404947280884, "logps/rejected": -2.109252452850342, "loss": 1.2705, "odds_ratio_loss": 0.4722273349761963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12232404947280884, "rewards/margins": 0.08860119432210922, "rewards/rejected": -0.21092525124549866, "sft_loss": 1.2232404947280884, "step": 4595 }, { "epoch": 0.36, "grad_norm": 8.371238708496094, "learning_rate": 7.230636991143309e-06, "logits/chosen": -1.3490852117538452, "logits/rejected": -0.9498292207717896, "logps/chosen": -1.0545085668563843, "logps/rejected": -1.9465465545654297, "loss": 1.0866, "odds_ratio_loss": 0.32094138860702515, "rewards/accuracies": 1.0, "rewards/chosen": -0.10545085370540619, "rewards/margins": 0.08920378983020782, "rewards/rejected": -0.194654643535614, "sft_loss": 1.0545085668563843, "step": 4600 }, { "epoch": 0.36, "grad_norm": 15.105850219726562, "learning_rate": 7.225124463110118e-06, "logits/chosen": -1.3464066982269287, "logits/rejected": -1.1369407176971436, "logps/chosen": -1.1444499492645264, "logps/rejected": -1.7896219491958618, "loss": 1.2127, "odds_ratio_loss": 0.6820577383041382, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11444501578807831, "rewards/margins": 0.06451719999313354, "rewards/rejected": -0.17896220088005066, "sft_loss": 1.1444499492645264, "step": 4605 }, { "epoch": 0.36, "grad_norm": 19.077436447143555, "learning_rate": 7.219608560393166e-06, "logits/chosen": -1.2334355115890503, "logits/rejected": -1.1886484622955322, "logps/chosen": -0.7450493574142456, "logps/rejected": -1.6827375888824463, "loss": 0.7747, "odds_ratio_loss": 0.2968628704547882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07450493425130844, "rewards/margins": 0.09376882016658783, "rewards/rejected": -0.16827376186847687, "sft_loss": 0.7450493574142456, "step": 4610 }, { "epoch": 0.36, "grad_norm": 279.6209716796875, "learning_rate": 7.2140892913580174e-06, "logits/chosen": -1.3830896615982056, "logits/rejected": -1.0050561428070068, "logps/chosen": -1.1624782085418701, "logps/rejected": -1.718340277671814, "loss": 1.2271, "odds_ratio_loss": 0.6462377905845642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11624781787395477, "rewards/margins": 0.05558621138334274, "rewards/rejected": -0.1718340367078781, "sft_loss": 1.1624782085418701, "step": 4615 }, { "epoch": 0.36, "grad_norm": 7.198555946350098, "learning_rate": 7.2085666643753475e-06, "logits/chosen": -1.2989271879196167, "logits/rejected": -1.0027689933776855, "logps/chosen": -0.7517678141593933, "logps/rejected": -1.4274622201919556, "loss": 0.7941, "odds_ratio_loss": 0.4230150580406189, "rewards/accuracies": 1.0, "rewards/chosen": -0.07517679035663605, "rewards/margins": 0.06756944954395294, "rewards/rejected": -0.142746239900589, "sft_loss": 0.7517678141593933, "step": 4620 }, { "epoch": 0.36, "grad_norm": 21.27382469177246, "learning_rate": 7.20304068782092e-06, "logits/chosen": -1.3403156995773315, "logits/rejected": -1.0457615852355957, "logps/chosen": -1.1798444986343384, "logps/rejected": -2.0619351863861084, "loss": 1.2119, "odds_ratio_loss": 0.320948988199234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11798445880413055, "rewards/margins": 0.08820907026529312, "rewards/rejected": -0.20619352161884308, "sft_loss": 1.1798444986343384, "step": 4625 }, { "epoch": 0.36, "grad_norm": 103.77518463134766, "learning_rate": 7.197511370075581e-06, "logits/chosen": -1.200635552406311, "logits/rejected": -0.9515268206596375, "logps/chosen": -1.196729302406311, "logps/rejected": -2.4454774856567383, "loss": 1.2169, "odds_ratio_loss": 0.20212802290916443, "rewards/accuracies": 1.0, "rewards/chosen": -0.11967293173074722, "rewards/margins": 0.12487481534481049, "rewards/rejected": -0.2445477545261383, "sft_loss": 1.196729302406311, "step": 4630 }, { "epoch": 0.36, "grad_norm": 6.4753313064575195, "learning_rate": 7.191978719525243e-06, "logits/chosen": -1.4594614505767822, "logits/rejected": -1.1107757091522217, "logps/chosen": -1.047928810119629, "logps/rejected": -2.2635796070098877, "loss": 1.0693, "odds_ratio_loss": 0.21393127739429474, "rewards/accuracies": 1.0, "rewards/chosen": -0.10479287803173065, "rewards/margins": 0.12156505882740021, "rewards/rejected": -0.22635793685913086, "sft_loss": 1.047928810119629, "step": 4635 }, { "epoch": 0.36, "grad_norm": 13.064247131347656, "learning_rate": 7.186442744560873e-06, "logits/chosen": -1.4128029346466064, "logits/rejected": -1.398961067199707, "logps/chosen": -0.8758260607719421, "logps/rejected": -3.4516899585723877, "loss": 0.9049, "odds_ratio_loss": 0.2910856306552887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08758260309696198, "rewards/margins": 0.25758641958236694, "rewards/rejected": -0.3451690077781677, "sft_loss": 0.8758260607719421, "step": 4640 }, { "epoch": 0.36, "grad_norm": 7.154684066772461, "learning_rate": 7.1809034535784785e-06, "logits/chosen": -1.2984259128570557, "logits/rejected": -1.042232871055603, "logps/chosen": -0.9784517288208008, "logps/rejected": -1.5918056964874268, "loss": 1.0242, "odds_ratio_loss": 0.4576658308506012, "rewards/accuracies": 1.0, "rewards/chosen": -0.0978451743721962, "rewards/margins": 0.06133540719747543, "rewards/rejected": -0.15918058156967163, "sft_loss": 0.9784517288208008, "step": 4645 }, { "epoch": 0.36, "grad_norm": 19.701332092285156, "learning_rate": 7.1753608549790985e-06, "logits/chosen": -1.2546695470809937, "logits/rejected": -1.4402790069580078, "logps/chosen": -0.7282959222793579, "logps/rejected": -1.5314674377441406, "loss": 0.7842, "odds_ratio_loss": 0.5590053796768188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07282960414886475, "rewards/margins": 0.08031713962554932, "rewards/rejected": -0.15314674377441406, "sft_loss": 0.7282959222793579, "step": 4650 }, { "epoch": 0.36, "grad_norm": 5.952921390533447, "learning_rate": 7.169814957168786e-06, "logits/chosen": -1.4671968221664429, "logits/rejected": -0.9394590258598328, "logps/chosen": -1.2331550121307373, "logps/rejected": -1.6452100276947021, "loss": 1.2829, "odds_ratio_loss": 0.49736180901527405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12331549823284149, "rewards/margins": 0.041205499321222305, "rewards/rejected": -0.1645210087299347, "sft_loss": 1.2331550121307373, "step": 4655 }, { "epoch": 0.36, "grad_norm": 4.4238104820251465, "learning_rate": 7.164265768558603e-06, "logits/chosen": -1.4732682704925537, "logits/rejected": -1.223921537399292, "logps/chosen": -0.7300776243209839, "logps/rejected": -0.9684303402900696, "loss": 0.7961, "odds_ratio_loss": 0.6600145697593689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07300776988267899, "rewards/margins": 0.02383527345955372, "rewards/rejected": -0.09684304147958755, "sft_loss": 0.7300776243209839, "step": 4660 }, { "epoch": 0.36, "grad_norm": 13.20688533782959, "learning_rate": 7.158713297564595e-06, "logits/chosen": -1.493577480316162, "logits/rejected": -0.8166548013687134, "logps/chosen": -0.8863444328308105, "logps/rejected": -6.545098304748535, "loss": 0.902, "odds_ratio_loss": 0.15621954202651978, "rewards/accuracies": 1.0, "rewards/chosen": -0.0886344462633133, "rewards/margins": 0.5658753514289856, "rewards/rejected": -0.6545097827911377, "sft_loss": 0.8863444328308105, "step": 4665 }, { "epoch": 0.36, "grad_norm": 4.963711261749268, "learning_rate": 7.153157552607789e-06, "logits/chosen": -1.436767578125, "logits/rejected": -0.8550936579704285, "logps/chosen": -1.089935064315796, "logps/rejected": -2.3934664726257324, "loss": 1.1425, "odds_ratio_loss": 0.5253725647926331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10899350792169571, "rewards/margins": 0.1303531378507614, "rewards/rejected": -0.23934665322303772, "sft_loss": 1.089935064315796, "step": 4670 }, { "epoch": 0.36, "grad_norm": 8.91596794128418, "learning_rate": 7.14759854211418e-06, "logits/chosen": -1.413935899734497, "logits/rejected": -0.7103008031845093, "logps/chosen": -1.0255316495895386, "logps/rejected": -2.3229520320892334, "loss": 1.0548, "odds_ratio_loss": 0.29279404878616333, "rewards/accuracies": 1.0, "rewards/chosen": -0.10255316644906998, "rewards/margins": 0.12974204123020172, "rewards/rejected": -0.2322952300310135, "sft_loss": 1.0255316495895386, "step": 4675 }, { "epoch": 0.36, "grad_norm": 28.46976661682129, "learning_rate": 7.142036274514712e-06, "logits/chosen": -1.3273677825927734, "logits/rejected": -1.1330890655517578, "logps/chosen": -0.9897178411483765, "logps/rejected": -2.691734790802002, "loss": 1.0195, "odds_ratio_loss": 0.2983167767524719, "rewards/accuracies": 1.0, "rewards/chosen": -0.09897179901599884, "rewards/margins": 0.17020167410373688, "rewards/rejected": -0.26917344331741333, "sft_loss": 0.9897178411483765, "step": 4680 }, { "epoch": 0.36, "grad_norm": 33.62302017211914, "learning_rate": 7.1364707582452705e-06, "logits/chosen": -1.3759291172027588, "logits/rejected": -1.1289112567901611, "logps/chosen": -0.8859437704086304, "logps/rejected": -4.888989448547363, "loss": 0.9001, "odds_ratio_loss": 0.141677588224411, "rewards/accuracies": 1.0, "rewards/chosen": -0.08859437704086304, "rewards/margins": 0.40030455589294434, "rewards/rejected": -0.4888989329338074, "sft_loss": 0.8859437704086304, "step": 4685 }, { "epoch": 0.36, "grad_norm": 12.877079010009766, "learning_rate": 7.130902001746667e-06, "logits/chosen": -1.2970373630523682, "logits/rejected": -1.0835976600646973, "logps/chosen": -1.139439344406128, "logps/rejected": -4.003388404846191, "loss": 1.1481, "odds_ratio_loss": 0.08658437430858612, "rewards/accuracies": 1.0, "rewards/chosen": -0.11394394934177399, "rewards/margins": 0.28639495372772217, "rewards/rejected": -0.40033888816833496, "sft_loss": 1.139439344406128, "step": 4690 }, { "epoch": 0.37, "grad_norm": 7.1802873611450195, "learning_rate": 7.125330013464629e-06, "logits/chosen": -1.3899294137954712, "logits/rejected": -0.9751816987991333, "logps/chosen": -1.1031863689422607, "logps/rejected": -2.300921678543091, "loss": 1.1432, "odds_ratio_loss": 0.3997074365615845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11031864583492279, "rewards/margins": 0.11977354437112808, "rewards/rejected": -0.23009219765663147, "sft_loss": 1.1031863689422607, "step": 4695 }, { "epoch": 0.37, "grad_norm": 17.34177017211914, "learning_rate": 7.119754801849782e-06, "logits/chosen": -1.5370855331420898, "logits/rejected": -1.0702970027923584, "logps/chosen": -0.8118973970413208, "logps/rejected": -1.677382230758667, "loss": 0.8715, "odds_ratio_loss": 0.5956419706344604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08118973672389984, "rewards/margins": 0.08654849231243134, "rewards/rejected": -0.16773822903633118, "sft_loss": 0.8118973970413208, "step": 4700 }, { "epoch": 0.37, "grad_norm": 15.636608123779297, "learning_rate": 7.1141763753576435e-06, "logits/chosen": -1.1833438873291016, "logits/rejected": -0.9127056002616882, "logps/chosen": -1.0889067649841309, "logps/rejected": -1.704662561416626, "loss": 1.1468, "odds_ratio_loss": 0.5786373019218445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10889067500829697, "rewards/margins": 0.06157558411359787, "rewards/rejected": -0.17046627402305603, "sft_loss": 1.0889067649841309, "step": 4705 }, { "epoch": 0.37, "grad_norm": 5.713825702667236, "learning_rate": 7.1085947424486045e-06, "logits/chosen": -1.3846780061721802, "logits/rejected": -0.7796342372894287, "logps/chosen": -0.8368937373161316, "logps/rejected": -1.4111164808273315, "loss": 0.8761, "odds_ratio_loss": 0.3923702836036682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0836893767118454, "rewards/margins": 0.05742228031158447, "rewards/rejected": -0.14111164212226868, "sft_loss": 0.8368937373161316, "step": 4710 }, { "epoch": 0.37, "grad_norm": 6.6504364013671875, "learning_rate": 7.103009911587923e-06, "logits/chosen": -1.2653753757476807, "logits/rejected": -1.285905122756958, "logps/chosen": -0.8329108357429504, "logps/rejected": -1.9709625244140625, "loss": 0.8781, "odds_ratio_loss": 0.45160213112831116, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08329108357429504, "rewards/margins": 0.11380515992641449, "rewards/rejected": -0.19709625840187073, "sft_loss": 0.8329108357429504, "step": 4715 }, { "epoch": 0.37, "grad_norm": 18.44487953186035, "learning_rate": 7.097421891245701e-06, "logits/chosen": -1.3122217655181885, "logits/rejected": -1.0286943912506104, "logps/chosen": -1.413613200187683, "logps/rejected": -7.518275260925293, "loss": 1.4387, "odds_ratio_loss": 0.2513591945171356, "rewards/accuracies": 1.0, "rewards/chosen": -0.14136134088039398, "rewards/margins": 0.6104661822319031, "rewards/rejected": -0.7518275380134583, "sft_loss": 1.413613200187683, "step": 4720 }, { "epoch": 0.37, "grad_norm": 11.627037048339844, "learning_rate": 7.091830689896883e-06, "logits/chosen": -1.3183649778366089, "logits/rejected": -0.8607378005981445, "logps/chosen": -1.0398062467575073, "logps/rejected": -2.5570998191833496, "loss": 1.0621, "odds_ratio_loss": 0.22267043590545654, "rewards/accuracies": 1.0, "rewards/chosen": -0.10398062318563461, "rewards/margins": 0.15172937512397766, "rewards/rejected": -0.2557099759578705, "sft_loss": 1.0398062467575073, "step": 4725 }, { "epoch": 0.37, "grad_norm": 20.778667449951172, "learning_rate": 7.086236316021232e-06, "logits/chosen": -0.8270589709281921, "logits/rejected": -1.1936814785003662, "logps/chosen": -0.9919061660766602, "logps/rejected": -1.9597247838974, "loss": 1.0642, "odds_ratio_loss": 0.7229019403457642, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0991906225681305, "rewards/margins": 0.09678187221288681, "rewards/rejected": -0.1959724873304367, "sft_loss": 0.9919061660766602, "step": 4730 }, { "epoch": 0.37, "grad_norm": 12.020709037780762, "learning_rate": 7.080638778103331e-06, "logits/chosen": -1.2841196060180664, "logits/rejected": -0.7834513783454895, "logps/chosen": -0.962006688117981, "logps/rejected": -4.6492919921875, "loss": 1.0038, "odds_ratio_loss": 0.41799622774124146, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09620066732168198, "rewards/margins": 0.3687285780906677, "rewards/rejected": -0.4649292528629303, "sft_loss": 0.962006688117981, "step": 4735 }, { "epoch": 0.37, "grad_norm": 15.375065803527832, "learning_rate": 7.075038084632554e-06, "logits/chosen": -1.399659514427185, "logits/rejected": -1.212721586227417, "logps/chosen": -0.8884924054145813, "logps/rejected": -1.693756103515625, "loss": 0.9439, "odds_ratio_loss": 0.5542975664138794, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08884923905134201, "rewards/margins": 0.08052637428045273, "rewards/rejected": -0.16937562823295593, "sft_loss": 0.8884924054145813, "step": 4740 }, { "epoch": 0.37, "grad_norm": 6.555948734283447, "learning_rate": 7.069434244103064e-06, "logits/chosen": -1.293717384338379, "logits/rejected": -1.2064396142959595, "logps/chosen": -0.8203238248825073, "logps/rejected": -3.335233211517334, "loss": 0.8504, "odds_ratio_loss": 0.3007420301437378, "rewards/accuracies": 1.0, "rewards/chosen": -0.08203238993883133, "rewards/margins": 0.2514909505844116, "rewards/rejected": -0.33352333307266235, "sft_loss": 0.8203238248825073, "step": 4745 }, { "epoch": 0.37, "grad_norm": 10.171930313110352, "learning_rate": 7.063827265013798e-06, "logits/chosen": -1.341202974319458, "logits/rejected": -1.1988797187805176, "logps/chosen": -0.875900149345398, "logps/rejected": -2.420520067214966, "loss": 0.9075, "odds_ratio_loss": 0.3163735270500183, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08759002387523651, "rewards/margins": 0.15446197986602783, "rewards/rejected": -0.24205198884010315, "sft_loss": 0.875900149345398, "step": 4750 }, { "epoch": 0.37, "grad_norm": 7.426060199737549, "learning_rate": 7.058217155868452e-06, "logits/chosen": -1.4535651206970215, "logits/rejected": -0.9479306936264038, "logps/chosen": -1.0767772197723389, "logps/rejected": -3.3065083026885986, "loss": 1.152, "odds_ratio_loss": 0.7525271773338318, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10767773538827896, "rewards/margins": 0.2229730784893036, "rewards/rejected": -0.33065083622932434, "sft_loss": 1.0767772197723389, "step": 4755 }, { "epoch": 0.37, "grad_norm": 5.857900142669678, "learning_rate": 7.052603925175466e-06, "logits/chosen": -1.2869961261749268, "logits/rejected": -0.8082895278930664, "logps/chosen": -0.7936242818832397, "logps/rejected": -1.4054462909698486, "loss": 0.8341, "odds_ratio_loss": 0.405081182718277, "rewards/accuracies": 1.0, "rewards/chosen": -0.0793624296784401, "rewards/margins": 0.06118218973278999, "rewards/rejected": -0.14054462313652039, "sft_loss": 0.7936242818832397, "step": 4760 }, { "epoch": 0.37, "grad_norm": 9.901500701904297, "learning_rate": 7.04698758144802e-06, "logits/chosen": -1.3234273195266724, "logits/rejected": -1.113125205039978, "logps/chosen": -1.0561497211456299, "logps/rejected": -2.484020948410034, "loss": 1.0788, "odds_ratio_loss": 0.226668119430542, "rewards/accuracies": 1.0, "rewards/chosen": -0.10561498254537582, "rewards/margins": 0.1427871435880661, "rewards/rejected": -0.24840211868286133, "sft_loss": 1.0561497211456299, "step": 4765 }, { "epoch": 0.37, "grad_norm": 56.50979995727539, "learning_rate": 7.04136813320401e-06, "logits/chosen": -1.2841142416000366, "logits/rejected": -0.7560856938362122, "logps/chosen": -1.0950348377227783, "logps/rejected": -7.483965873718262, "loss": 1.118, "odds_ratio_loss": 0.2300182282924652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10950347036123276, "rewards/margins": 0.638893187046051, "rewards/rejected": -0.748396635055542, "sft_loss": 1.0950348377227783, "step": 4770 }, { "epoch": 0.37, "grad_norm": 6.978331565856934, "learning_rate": 7.0357455889660445e-06, "logits/chosen": -1.3573987483978271, "logits/rejected": -1.0475003719329834, "logps/chosen": -0.8214191198348999, "logps/rejected": -2.1504311561584473, "loss": 0.8564, "odds_ratio_loss": 0.35030627250671387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08214191347360611, "rewards/margins": 0.13290122151374817, "rewards/rejected": -0.21504314243793488, "sft_loss": 0.8214191198348999, "step": 4775 }, { "epoch": 0.37, "grad_norm": 148.0516815185547, "learning_rate": 7.030119957261425e-06, "logits/chosen": -1.2780089378356934, "logits/rejected": -0.9989121556282043, "logps/chosen": -0.9051804542541504, "logps/rejected": -4.656085014343262, "loss": 0.915, "odds_ratio_loss": 0.09867776930332184, "rewards/accuracies": 1.0, "rewards/chosen": -0.0905180498957634, "rewards/margins": 0.37509050965309143, "rewards/rejected": -0.46560850739479065, "sft_loss": 0.9051804542541504, "step": 4780 }, { "epoch": 0.37, "grad_norm": 6.404755592346191, "learning_rate": 7.024491246622135e-06, "logits/chosen": -1.3146905899047852, "logits/rejected": -0.586790144443512, "logps/chosen": -0.9470073580741882, "logps/rejected": -2.3955118656158447, "loss": 0.9698, "odds_ratio_loss": 0.227756068110466, "rewards/accuracies": 1.0, "rewards/chosen": -0.09470073133707047, "rewards/margins": 0.1448504477739334, "rewards/rejected": -0.23955118656158447, "sft_loss": 0.9470073580741882, "step": 4785 }, { "epoch": 0.37, "grad_norm": 5.560925483703613, "learning_rate": 7.018859465584832e-06, "logits/chosen": -1.2906675338745117, "logits/rejected": -0.7796791195869446, "logps/chosen": -1.1187167167663574, "logps/rejected": -3.7551982402801514, "loss": 1.1369, "odds_ratio_loss": 0.1814715564250946, "rewards/accuracies": 1.0, "rewards/chosen": -0.11187167465686798, "rewards/margins": 0.2636481523513794, "rewards/rejected": -0.37551984190940857, "sft_loss": 1.1187167167663574, "step": 4790 }, { "epoch": 0.37, "grad_norm": 4.41677713394165, "learning_rate": 7.013224622690823e-06, "logits/chosen": -1.2547338008880615, "logits/rejected": -0.9723116159439087, "logps/chosen": -0.8910681009292603, "logps/rejected": -1.8260596990585327, "loss": 0.9175, "odds_ratio_loss": 0.26385509967803955, "rewards/accuracies": 1.0, "rewards/chosen": -0.08910682052373886, "rewards/margins": 0.09349914640188217, "rewards/rejected": -0.18260596692562103, "sft_loss": 0.8910681009292603, "step": 4795 }, { "epoch": 0.37, "grad_norm": 14.212723731994629, "learning_rate": 7.007586726486066e-06, "logits/chosen": -1.4153473377227783, "logits/rejected": -0.8781019449234009, "logps/chosen": -0.974286675453186, "logps/rejected": -3.054361343383789, "loss": 1.0106, "odds_ratio_loss": 0.3631802797317505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09742867201566696, "rewards/margins": 0.20800745487213135, "rewards/rejected": -0.3054361343383789, "sft_loss": 0.974286675453186, "step": 4800 }, { "epoch": 0.37, "grad_norm": 5.878570079803467, "learning_rate": 7.001945785521145e-06, "logits/chosen": -1.2006781101226807, "logits/rejected": -1.1574538946151733, "logps/chosen": -0.8742920756340027, "logps/rejected": -5.729536056518555, "loss": 0.9238, "odds_ratio_loss": 0.4947918951511383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08742920309305191, "rewards/margins": 0.48552441596984863, "rewards/rejected": -0.5729536414146423, "sft_loss": 0.8742920756340027, "step": 4805 }, { "epoch": 0.37, "grad_norm": 5.804833889007568, "learning_rate": 6.996301808351264e-06, "logits/chosen": -1.2291743755340576, "logits/rejected": -0.7988919019699097, "logps/chosen": -0.962105393409729, "logps/rejected": -2.536154270172119, "loss": 0.9884, "odds_ratio_loss": 0.26280778646469116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0962105393409729, "rewards/margins": 0.15740486979484558, "rewards/rejected": -0.2536154091358185, "sft_loss": 0.962105393409729, "step": 4810 }, { "epoch": 0.37, "grad_norm": 14.966432571411133, "learning_rate": 6.99065480353623e-06, "logits/chosen": -1.2544745206832886, "logits/rejected": -1.0695239305496216, "logps/chosen": -0.9016950726509094, "logps/rejected": -4.076902866363525, "loss": 0.9405, "odds_ratio_loss": 0.3877166211605072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0901695042848587, "rewards/margins": 0.31752076745033264, "rewards/rejected": -0.40769022703170776, "sft_loss": 0.9016950726509094, "step": 4815 }, { "epoch": 0.37, "grad_norm": 8.97038745880127, "learning_rate": 6.985004779640442e-06, "logits/chosen": -1.3023059368133545, "logits/rejected": -0.5928922891616821, "logps/chosen": -0.9082862138748169, "logps/rejected": -4.716104984283447, "loss": 0.9207, "odds_ratio_loss": 0.12394730001688004, "rewards/accuracies": 1.0, "rewards/chosen": -0.09082861244678497, "rewards/margins": 0.3807818591594696, "rewards/rejected": -0.47161048650741577, "sft_loss": 0.9082862138748169, "step": 4820 }, { "epoch": 0.38, "grad_norm": 7.581333160400391, "learning_rate": 6.979351745232879e-06, "logits/chosen": -1.2428420782089233, "logits/rejected": -0.9282558560371399, "logps/chosen": -1.1406314373016357, "logps/rejected": -5.644349575042725, "loss": 1.1639, "odds_ratio_loss": 0.23303601145744324, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11406314373016357, "rewards/margins": 0.4503718912601471, "rewards/rejected": -0.5644350051879883, "sft_loss": 1.1406314373016357, "step": 4825 }, { "epoch": 0.38, "grad_norm": 291.5585021972656, "learning_rate": 6.973695708887088e-06, "logits/chosen": -1.479901909828186, "logits/rejected": -1.1979395151138306, "logps/chosen": -1.0565932989120483, "logps/rejected": -6.788051605224609, "loss": 1.0746, "odds_ratio_loss": 0.1802714765071869, "rewards/accuracies": 1.0, "rewards/chosen": -0.10565934330224991, "rewards/margins": 0.5731458067893982, "rewards/rejected": -0.6788051724433899, "sft_loss": 1.0565932989120483, "step": 4830 }, { "epoch": 0.38, "grad_norm": 209.37506103515625, "learning_rate": 6.968036679181164e-06, "logits/chosen": -1.5039844512939453, "logits/rejected": -1.1092535257339478, "logps/chosen": -1.1655082702636719, "logps/rejected": -2.8518729209899902, "loss": 1.1909, "odds_ratio_loss": 0.2540439963340759, "rewards/accuracies": 1.0, "rewards/chosen": -0.11655082553625107, "rewards/margins": 0.1686364710330963, "rewards/rejected": -0.285187304019928, "sft_loss": 1.1655082702636719, "step": 4835 }, { "epoch": 0.38, "grad_norm": 95.15951538085938, "learning_rate": 6.962374664697744e-06, "logits/chosen": -1.3616310358047485, "logits/rejected": -0.9117149114608765, "logps/chosen": -1.0936627388000488, "logps/rejected": -2.5025813579559326, "loss": 1.1546, "odds_ratio_loss": 0.6098529696464539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.109366275370121, "rewards/margins": 0.1408918797969818, "rewards/rejected": -0.2502581477165222, "sft_loss": 1.0936627388000488, "step": 4840 }, { "epoch": 0.38, "grad_norm": 8.825847625732422, "learning_rate": 6.956709674023991e-06, "logits/chosen": -1.2413387298583984, "logits/rejected": -0.9800017476081848, "logps/chosen": -1.249328851699829, "logps/rejected": -1.4448187351226807, "loss": 1.3213, "odds_ratio_loss": 0.7200738191604614, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.12493288516998291, "rewards/margins": 0.019548993557691574, "rewards/rejected": -0.1444818675518036, "sft_loss": 1.249328851699829, "step": 4845 }, { "epoch": 0.38, "grad_norm": 86.56720733642578, "learning_rate": 6.951041715751585e-06, "logits/chosen": -1.232062578201294, "logits/rejected": -1.0836049318313599, "logps/chosen": -1.107206106185913, "logps/rejected": -4.509480953216553, "loss": 1.1338, "odds_ratio_loss": 0.2663348317146301, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11072063446044922, "rewards/margins": 0.34022751450538635, "rewards/rejected": -0.45094814896583557, "sft_loss": 1.107206106185913, "step": 4850 }, { "epoch": 0.38, "grad_norm": 5.162984848022461, "learning_rate": 6.945370798476704e-06, "logits/chosen": -1.1838353872299194, "logits/rejected": -0.9465975761413574, "logps/chosen": -1.0141656398773193, "logps/rejected": -4.33822774887085, "loss": 1.0332, "odds_ratio_loss": 0.1907617598772049, "rewards/accuracies": 1.0, "rewards/chosen": -0.10141657292842865, "rewards/margins": 0.3324061930179596, "rewards/rejected": -0.43382278084754944, "sft_loss": 1.0141656398773193, "step": 4855 }, { "epoch": 0.38, "grad_norm": 7.11558198928833, "learning_rate": 6.939696930800012e-06, "logits/chosen": -1.2861100435256958, "logits/rejected": -1.096118688583374, "logps/chosen": -1.4758632183074951, "logps/rejected": -4.466700077056885, "loss": 1.4866, "odds_ratio_loss": 0.10705895721912384, "rewards/accuracies": 1.0, "rewards/chosen": -0.14758633077144623, "rewards/margins": 0.2990837097167969, "rewards/rejected": -0.4466700553894043, "sft_loss": 1.4758632183074951, "step": 4860 }, { "epoch": 0.38, "grad_norm": 6.247501850128174, "learning_rate": 6.934020121326651e-06, "logits/chosen": -1.4358515739440918, "logits/rejected": -1.1405222415924072, "logps/chosen": -1.2349154949188232, "logps/rejected": -5.667292594909668, "loss": 1.2594, "odds_ratio_loss": 0.24501347541809082, "rewards/accuracies": 1.0, "rewards/chosen": -0.1234915480017662, "rewards/margins": 0.4432377219200134, "rewards/rejected": -0.5667292475700378, "sft_loss": 1.2349154949188232, "step": 4865 }, { "epoch": 0.38, "grad_norm": 6.55000114440918, "learning_rate": 6.928340378666225e-06, "logits/chosen": -1.4754161834716797, "logits/rejected": -0.9751527905464172, "logps/chosen": -1.3910658359527588, "logps/rejected": -6.125715732574463, "loss": 1.4068, "odds_ratio_loss": 0.15740497410297394, "rewards/accuracies": 1.0, "rewards/chosen": -0.13910658657550812, "rewards/margins": 0.4734649658203125, "rewards/rejected": -0.6125715970993042, "sft_loss": 1.3910658359527588, "step": 4870 }, { "epoch": 0.38, "grad_norm": 47.819461822509766, "learning_rate": 6.922657711432781e-06, "logits/chosen": -1.452848196029663, "logits/rejected": -1.0147812366485596, "logps/chosen": -1.0927129983901978, "logps/rejected": -2.6984667778015137, "loss": 1.1157, "odds_ratio_loss": 0.2295651137828827, "rewards/accuracies": 1.0, "rewards/chosen": -0.10927130281925201, "rewards/margins": 0.16057537496089935, "rewards/rejected": -0.26984667778015137, "sft_loss": 1.0927129983901978, "step": 4875 }, { "epoch": 0.38, "grad_norm": 10.338000297546387, "learning_rate": 6.9169721282448075e-06, "logits/chosen": -1.2707912921905518, "logits/rejected": -0.8893852233886719, "logps/chosen": -1.0967166423797607, "logps/rejected": -1.765512466430664, "loss": 1.1472, "odds_ratio_loss": 0.5050911903381348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10967166721820831, "rewards/margins": 0.06687958538532257, "rewards/rejected": -0.17655125260353088, "sft_loss": 1.0967166423797607, "step": 4880 }, { "epoch": 0.38, "grad_norm": 12.873258590698242, "learning_rate": 6.9112836377252136e-06, "logits/chosen": -1.247232437133789, "logits/rejected": -1.4507074356079102, "logps/chosen": -0.669430136680603, "logps/rejected": -6.086930274963379, "loss": 0.7079, "odds_ratio_loss": 0.38430362939834595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06694300472736359, "rewards/margins": 0.5417500734329224, "rewards/rejected": -0.60869300365448, "sft_loss": 0.669430136680603, "step": 4885 }, { "epoch": 0.38, "grad_norm": 54.19504165649414, "learning_rate": 6.905592248501318e-06, "logits/chosen": -1.216870665550232, "logits/rejected": -0.9719651937484741, "logps/chosen": -1.1346818208694458, "logps/rejected": -3.1256916522979736, "loss": 1.2082, "odds_ratio_loss": 0.7348722219467163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11346818506717682, "rewards/margins": 0.19910097122192383, "rewards/rejected": -0.31256914138793945, "sft_loss": 1.1346818208694458, "step": 4890 }, { "epoch": 0.38, "grad_norm": 5.423599720001221, "learning_rate": 6.899897969204834e-06, "logits/chosen": -1.3522473573684692, "logits/rejected": -1.0995229482650757, "logps/chosen": -0.8513563871383667, "logps/rejected": -3.2106711864471436, "loss": 0.8677, "odds_ratio_loss": 0.16294452548027039, "rewards/accuracies": 1.0, "rewards/chosen": -0.08513564616441727, "rewards/margins": 0.23593148589134216, "rewards/rejected": -0.32106712460517883, "sft_loss": 0.8513563871383667, "step": 4895 }, { "epoch": 0.38, "grad_norm": 5.703450679779053, "learning_rate": 6.894200808471858e-06, "logits/chosen": -1.3011819124221802, "logits/rejected": -0.487908273935318, "logps/chosen": -0.9999151229858398, "logps/rejected": -2.380300283432007, "loss": 1.0476, "odds_ratio_loss": 0.4769115447998047, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09999152272939682, "rewards/margins": 0.1380385160446167, "rewards/rejected": -0.23803003132343292, "sft_loss": 0.9999151229858398, "step": 4900 }, { "epoch": 0.38, "grad_norm": 27.872135162353516, "learning_rate": 6.88850077494286e-06, "logits/chosen": -1.3154263496398926, "logits/rejected": -0.9825452566146851, "logps/chosen": -1.085605263710022, "logps/rejected": -3.069535255432129, "loss": 1.1015, "odds_ratio_loss": 0.1588580310344696, "rewards/accuracies": 1.0, "rewards/chosen": -0.10856052488088608, "rewards/margins": 0.19839301705360413, "rewards/rejected": -0.3069535195827484, "sft_loss": 1.085605263710022, "step": 4905 }, { "epoch": 0.38, "grad_norm": 62.25426483154297, "learning_rate": 6.882797877262663e-06, "logits/chosen": -1.3177497386932373, "logits/rejected": -1.1975306272506714, "logps/chosen": -1.0411581993103027, "logps/rejected": -1.4796054363250732, "loss": 1.0933, "odds_ratio_loss": 0.5210880637168884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10411582142114639, "rewards/margins": 0.043844711035490036, "rewards/rejected": -0.14796052873134613, "sft_loss": 1.0411581993103027, "step": 4910 }, { "epoch": 0.38, "grad_norm": 8.837223052978516, "learning_rate": 6.877092124080435e-06, "logits/chosen": -1.432607650756836, "logits/rejected": -1.0251507759094238, "logps/chosen": -1.138588547706604, "logps/rejected": -7.440249443054199, "loss": 1.2108, "odds_ratio_loss": 0.7222912311553955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11385886371135712, "rewards/margins": 0.6301661729812622, "rewards/rejected": -0.744024932384491, "sft_loss": 1.138588547706604, "step": 4915 }, { "epoch": 0.38, "grad_norm": 10.964761734008789, "learning_rate": 6.8713835240496776e-06, "logits/chosen": -1.2825660705566406, "logits/rejected": -0.9963976144790649, "logps/chosen": -1.039621353149414, "logps/rejected": -4.55959415435791, "loss": 1.0755, "odds_ratio_loss": 0.3583175241947174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10396213829517365, "rewards/margins": 0.3519973158836365, "rewards/rejected": -0.45595940947532654, "sft_loss": 1.039621353149414, "step": 4920 }, { "epoch": 0.38, "grad_norm": 69.33670806884766, "learning_rate": 6.865672085828205e-06, "logits/chosen": -1.3993297815322876, "logits/rejected": -1.4595104455947876, "logps/chosen": -0.8613026738166809, "logps/rejected": -3.6366093158721924, "loss": 0.9189, "odds_ratio_loss": 0.5756146907806396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08613026142120361, "rewards/margins": 0.2775306701660156, "rewards/rejected": -0.36366090178489685, "sft_loss": 0.8613026738166809, "step": 4925 }, { "epoch": 0.38, "grad_norm": 13.625251770019531, "learning_rate": 6.859957818078139e-06, "logits/chosen": -1.4369404315948486, "logits/rejected": -0.8882350921630859, "logps/chosen": -1.2669563293457031, "logps/rejected": -8.768199920654297, "loss": 1.3147, "odds_ratio_loss": 0.47707176208496094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1266956329345703, "rewards/margins": 0.7501242756843567, "rewards/rejected": -0.8768199682235718, "sft_loss": 1.2669563293457031, "step": 4930 }, { "epoch": 0.38, "grad_norm": 6.9091267585754395, "learning_rate": 6.854240729465892e-06, "logits/chosen": -1.4900411367416382, "logits/rejected": -1.119840383529663, "logps/chosen": -1.02559494972229, "logps/rejected": -6.696244239807129, "loss": 1.0567, "odds_ratio_loss": 0.31142452359199524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10255949199199677, "rewards/margins": 0.5670649409294128, "rewards/rejected": -0.6696244478225708, "sft_loss": 1.02559494972229, "step": 4935 }, { "epoch": 0.38, "grad_norm": 7.601396083831787, "learning_rate": 6.848520828662155e-06, "logits/chosen": -1.3692182302474976, "logits/rejected": -1.2401628494262695, "logps/chosen": -1.0997951030731201, "logps/rejected": -7.7627716064453125, "loss": 1.1207, "odds_ratio_loss": 0.2095102071762085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10997951030731201, "rewards/margins": 0.6662976145744324, "rewards/rejected": -0.7762770652770996, "sft_loss": 1.0997951030731201, "step": 4940 }, { "epoch": 0.38, "grad_norm": 8.860038757324219, "learning_rate": 6.8427981243418866e-06, "logits/chosen": -1.3120964765548706, "logits/rejected": -1.2637927532196045, "logps/chosen": -0.7979799509048462, "logps/rejected": -8.659887313842773, "loss": 0.7987, "odds_ratio_loss": 0.006944218184798956, "rewards/accuracies": 1.0, "rewards/chosen": -0.07979799807071686, "rewards/margins": 0.7861906886100769, "rewards/rejected": -0.8659887313842773, "sft_loss": 0.7979799509048462, "step": 4945 }, { "epoch": 0.39, "grad_norm": 18.52011489868164, "learning_rate": 6.83707262518429e-06, "logits/chosen": -1.3004642724990845, "logits/rejected": -0.9895919561386108, "logps/chosen": -0.9648422002792358, "logps/rejected": -2.168957471847534, "loss": 1.0539, "odds_ratio_loss": 0.8904326558113098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0964842289686203, "rewards/margins": 0.12041151523590088, "rewards/rejected": -0.21689574420452118, "sft_loss": 0.9648422002792358, "step": 4950 }, { "epoch": 0.39, "grad_norm": 3.977808952331543, "learning_rate": 6.831344339872813e-06, "logits/chosen": -1.1475770473480225, "logits/rejected": -0.7214670181274414, "logps/chosen": -0.7406553030014038, "logps/rejected": -13.014932632446289, "loss": 0.7411, "odds_ratio_loss": 0.004104848951101303, "rewards/accuracies": 1.0, "rewards/chosen": -0.07406553626060486, "rewards/margins": 1.2274277210235596, "rewards/rejected": -1.3014931678771973, "sft_loss": 0.7406553030014038, "step": 4955 }, { "epoch": 0.39, "grad_norm": 9.07308578491211, "learning_rate": 6.825613277095129e-06, "logits/chosen": -1.2775825262069702, "logits/rejected": -1.0573843717575073, "logps/chosen": -0.9088460803031921, "logps/rejected": -2.937030792236328, "loss": 0.9436, "odds_ratio_loss": 0.3471711277961731, "rewards/accuracies": 1.0, "rewards/chosen": -0.09088461101055145, "rewards/margins": 0.20281848311424255, "rewards/rejected": -0.2937030792236328, "sft_loss": 0.9088460803031921, "step": 4960 }, { "epoch": 0.39, "grad_norm": 7.698314189910889, "learning_rate": 6.8198794455431205e-06, "logits/chosen": -1.3673810958862305, "logits/rejected": -0.930639386177063, "logps/chosen": -0.8960191011428833, "logps/rejected": -10.090847969055176, "loss": 0.9147, "odds_ratio_loss": 0.1867930293083191, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08960190415382385, "rewards/margins": 0.9194828867912292, "rewards/rejected": -1.009084701538086, "sft_loss": 0.8960191011428833, "step": 4965 }, { "epoch": 0.39, "grad_norm": 12.36298656463623, "learning_rate": 6.814142853912873e-06, "logits/chosen": -1.4559195041656494, "logits/rejected": -1.2453267574310303, "logps/chosen": -1.1622923612594604, "logps/rejected": -2.9156875610351562, "loss": 1.2222, "odds_ratio_loss": 0.5988325476646423, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11622923612594604, "rewards/margins": 0.17533953487873077, "rewards/rejected": -0.2915687561035156, "sft_loss": 1.1622923612594604, "step": 4970 }, { "epoch": 0.39, "grad_norm": 9.976313591003418, "learning_rate": 6.808403510904653e-06, "logits/chosen": -1.503354787826538, "logits/rejected": -1.2309849262237549, "logps/chosen": -0.7218309640884399, "logps/rejected": -3.757450580596924, "loss": 0.7453, "odds_ratio_loss": 0.23489134013652802, "rewards/accuracies": 1.0, "rewards/chosen": -0.07218309491872787, "rewards/margins": 0.3035619854927063, "rewards/rejected": -0.3757449984550476, "sft_loss": 0.7218309640884399, "step": 4975 }, { "epoch": 0.39, "grad_norm": 591.4327392578125, "learning_rate": 6.802661425222907e-06, "logits/chosen": -1.31455659866333, "logits/rejected": -0.5907676815986633, "logps/chosen": -1.8325262069702148, "logps/rejected": -3.8407177925109863, "loss": 1.8731, "odds_ratio_loss": 0.40590929985046387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18325261771678925, "rewards/margins": 0.20081916451454163, "rewards/rejected": -0.3840717673301697, "sft_loss": 1.8325262069702148, "step": 4980 }, { "epoch": 0.39, "grad_norm": 5.556074619293213, "learning_rate": 6.796916605576235e-06, "logits/chosen": -1.4891436100006104, "logits/rejected": -1.0084350109100342, "logps/chosen": -0.8972042798995972, "logps/rejected": -3.8865675926208496, "loss": 0.9218, "odds_ratio_loss": 0.2455102503299713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08972042798995972, "rewards/margins": 0.29893630743026733, "rewards/rejected": -0.38865676522254944, "sft_loss": 0.8972042798995972, "step": 4985 }, { "epoch": 0.39, "grad_norm": 11.468494415283203, "learning_rate": 6.7911690606773836e-06, "logits/chosen": -1.3948386907577515, "logits/rejected": -1.2727683782577515, "logps/chosen": -0.6499780416488647, "logps/rejected": -6.216355800628662, "loss": 0.6515, "odds_ratio_loss": 0.015562218613922596, "rewards/accuracies": 1.0, "rewards/chosen": -0.06499779969453812, "rewards/margins": 0.5566378831863403, "rewards/rejected": -0.6216356158256531, "sft_loss": 0.6499780416488647, "step": 4990 }, { "epoch": 0.39, "grad_norm": 61.68426513671875, "learning_rate": 6.785418799243238e-06, "logits/chosen": -1.1920238733291626, "logits/rejected": -1.3612303733825684, "logps/chosen": -0.7148585319519043, "logps/rejected": -4.0645341873168945, "loss": 0.7558, "odds_ratio_loss": 0.40968450903892517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07148585468530655, "rewards/margins": 0.33496764302253723, "rewards/rejected": -0.4064534604549408, "sft_loss": 0.7148585319519043, "step": 4995 }, { "epoch": 0.39, "grad_norm": 32.57492446899414, "learning_rate": 6.7796658299947946e-06, "logits/chosen": -1.2153874635696411, "logits/rejected": -1.1897119283676147, "logps/chosen": -0.749336838722229, "logps/rejected": -2.0858726501464844, "loss": 0.7786, "odds_ratio_loss": 0.2922513484954834, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07493368536233902, "rewards/margins": 0.13365359604358673, "rewards/rejected": -0.20858728885650635, "sft_loss": 0.749336838722229, "step": 5000 }, { "epoch": 0.39, "grad_norm": 24.377872467041016, "learning_rate": 6.7739101616571675e-06, "logits/chosen": -1.4823286533355713, "logits/rejected": -1.0391753911972046, "logps/chosen": -0.9777601957321167, "logps/rejected": -2.9456517696380615, "loss": 0.9917, "odds_ratio_loss": 0.1392892450094223, "rewards/accuracies": 1.0, "rewards/chosen": -0.09777601808309555, "rewards/margins": 0.19678914546966553, "rewards/rejected": -0.2945651412010193, "sft_loss": 0.9777601957321167, "step": 5005 }, { "epoch": 0.39, "grad_norm": 44.49354553222656, "learning_rate": 6.768151802959556e-06, "logits/chosen": -1.5044395923614502, "logits/rejected": -1.249171495437622, "logps/chosen": -0.8982955813407898, "logps/rejected": -2.9409663677215576, "loss": 0.9328, "odds_ratio_loss": 0.34488964080810547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08982956409454346, "rewards/margins": 0.20426709949970245, "rewards/rejected": -0.2940966486930847, "sft_loss": 0.8982955813407898, "step": 5010 }, { "epoch": 0.39, "grad_norm": 7.909432411193848, "learning_rate": 6.76239076263524e-06, "logits/chosen": -1.410831093788147, "logits/rejected": -0.765641450881958, "logps/chosen": -1.078161358833313, "logps/rejected": -6.150436878204346, "loss": 1.0887, "odds_ratio_loss": 0.1049346923828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.10781614482402802, "rewards/margins": 0.5072275400161743, "rewards/rejected": -0.6150436997413635, "sft_loss": 1.078161358833313, "step": 5015 }, { "epoch": 0.39, "grad_norm": 27.489490509033203, "learning_rate": 6.756627049421572e-06, "logits/chosen": -1.393526315689087, "logits/rejected": -1.0082769393920898, "logps/chosen": -1.1783392429351807, "logps/rejected": -6.130882263183594, "loss": 1.196, "odds_ratio_loss": 0.1762438714504242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11783391237258911, "rewards/margins": 0.49525442719459534, "rewards/rejected": -0.6130883097648621, "sft_loss": 1.1783392429351807, "step": 5020 }, { "epoch": 0.39, "grad_norm": 8.808501243591309, "learning_rate": 6.7508606720599535e-06, "logits/chosen": -1.321639895439148, "logits/rejected": -0.853980541229248, "logps/chosen": -0.7712258100509644, "logps/rejected": -4.877452373504639, "loss": 0.7797, "odds_ratio_loss": 0.08445506542921066, "rewards/accuracies": 1.0, "rewards/chosen": -0.07712258398532867, "rewards/margins": 0.4106226861476898, "rewards/rejected": -0.4877452254295349, "sft_loss": 0.7712258100509644, "step": 5025 }, { "epoch": 0.39, "grad_norm": 6.824776649475098, "learning_rate": 6.745091639295827e-06, "logits/chosen": -1.277091383934021, "logits/rejected": -0.9593345522880554, "logps/chosen": -0.764047384262085, "logps/rejected": -2.850830554962158, "loss": 0.7783, "odds_ratio_loss": 0.1427466869354248, "rewards/accuracies": 1.0, "rewards/chosen": -0.07640473544597626, "rewards/margins": 0.20867832005023956, "rewards/rejected": -0.28508302569389343, "sft_loss": 0.764047384262085, "step": 5030 }, { "epoch": 0.39, "grad_norm": 4.839137554168701, "learning_rate": 6.7393199598786655e-06, "logits/chosen": -1.359311819076538, "logits/rejected": -0.8653494119644165, "logps/chosen": -1.2023526430130005, "logps/rejected": -4.120873928070068, "loss": 1.2166, "odds_ratio_loss": 0.14224644005298615, "rewards/accuracies": 1.0, "rewards/chosen": -0.12023527920246124, "rewards/margins": 0.2918521761894226, "rewards/rejected": -0.41208744049072266, "sft_loss": 1.2023526430130005, "step": 5035 }, { "epoch": 0.39, "grad_norm": 5.2521586418151855, "learning_rate": 6.7335456425619515e-06, "logits/chosen": -1.3867168426513672, "logits/rejected": -0.7938799262046814, "logps/chosen": -0.961976170539856, "logps/rejected": -5.642143726348877, "loss": 0.9809, "odds_ratio_loss": 0.189554363489151, "rewards/accuracies": 1.0, "rewards/chosen": -0.09619762003421783, "rewards/margins": 0.4680168032646179, "rewards/rejected": -0.5642144083976746, "sft_loss": 0.961976170539856, "step": 5040 }, { "epoch": 0.39, "grad_norm": 13.384309768676758, "learning_rate": 6.72776869610317e-06, "logits/chosen": -1.4045675992965698, "logits/rejected": -0.8836909532546997, "logps/chosen": -0.9754483103752136, "logps/rejected": -3.191079616546631, "loss": 1.0381, "odds_ratio_loss": 0.6266669631004333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0975448340177536, "rewards/margins": 0.22156314551830292, "rewards/rejected": -0.3191079795360565, "sft_loss": 0.9754483103752136, "step": 5045 }, { "epoch": 0.39, "grad_norm": 25.794885635375977, "learning_rate": 6.721989129263797e-06, "logits/chosen": -1.2337299585342407, "logits/rejected": -0.8713550567626953, "logps/chosen": -1.0439655780792236, "logps/rejected": -7.396246910095215, "loss": 1.055, "odds_ratio_loss": 0.11006517708301544, "rewards/accuracies": 1.0, "rewards/chosen": -0.10439654439687729, "rewards/margins": 0.635228157043457, "rewards/rejected": -0.7396246790885925, "sft_loss": 1.0439655780792236, "step": 5050 }, { "epoch": 0.39, "grad_norm": 20.35993766784668, "learning_rate": 6.716206950809274e-06, "logits/chosen": -1.396535873413086, "logits/rejected": -0.8851677775382996, "logps/chosen": -1.0421764850616455, "logps/rejected": -3.7658889293670654, "loss": 1.0814, "odds_ratio_loss": 0.3919692039489746, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10421766340732574, "rewards/margins": 0.27237120270729065, "rewards/rejected": -0.3765888512134552, "sft_loss": 1.0421764850616455, "step": 5055 }, { "epoch": 0.39, "grad_norm": 41.810333251953125, "learning_rate": 6.710422169509015e-06, "logits/chosen": -1.5096509456634521, "logits/rejected": -1.0556962490081787, "logps/chosen": -1.108420968055725, "logps/rejected": -3.083578586578369, "loss": 1.1268, "odds_ratio_loss": 0.1836351901292801, "rewards/accuracies": 1.0, "rewards/chosen": -0.11084209382534027, "rewards/margins": 0.19751577079296112, "rewards/rejected": -0.308357834815979, "sft_loss": 1.108420968055725, "step": 5060 }, { "epoch": 0.39, "grad_norm": 23.982271194458008, "learning_rate": 6.7046347941363706e-06, "logits/chosen": -1.4725430011749268, "logits/rejected": -0.8947264552116394, "logps/chosen": -0.822553277015686, "logps/rejected": -4.3091607093811035, "loss": 0.8316, "odds_ratio_loss": 0.09055305272340775, "rewards/accuracies": 1.0, "rewards/chosen": -0.08225533366203308, "rewards/margins": 0.34866076707839966, "rewards/rejected": -0.43091607093811035, "sft_loss": 0.822553277015686, "step": 5065 }, { "epoch": 0.39, "grad_norm": 15.915932655334473, "learning_rate": 6.698844833468633e-06, "logits/chosen": -1.500983476638794, "logits/rejected": -1.2363684177398682, "logps/chosen": -0.8845119476318359, "logps/rejected": -3.9846088886260986, "loss": 0.9087, "odds_ratio_loss": 0.24226252734661102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08845119178295135, "rewards/margins": 0.3100096583366394, "rewards/rejected": -0.39846086502075195, "sft_loss": 0.8845119476318359, "step": 5070 }, { "epoch": 0.39, "grad_norm": 5.520943641662598, "learning_rate": 6.693052296287011e-06, "logits/chosen": -1.409310221672058, "logits/rejected": -0.7601202726364136, "logps/chosen": -0.9198876619338989, "logps/rejected": -2.2644782066345215, "loss": 0.9456, "odds_ratio_loss": 0.25737714767456055, "rewards/accuracies": 1.0, "rewards/chosen": -0.09198875725269318, "rewards/margins": 0.13445906341075897, "rewards/rejected": -0.22644782066345215, "sft_loss": 0.9198876619338989, "step": 5075 }, { "epoch": 0.4, "grad_norm": 48.577247619628906, "learning_rate": 6.687257191376624e-06, "logits/chosen": -1.348859429359436, "logits/rejected": -0.6082226037979126, "logps/chosen": -1.1002042293548584, "logps/rejected": -1.5409919023513794, "loss": 1.1681, "odds_ratio_loss": 0.6787872314453125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11002041399478912, "rewards/margins": 0.0440787747502327, "rewards/rejected": -0.15409919619560242, "sft_loss": 1.1002042293548584, "step": 5080 }, { "epoch": 0.4, "grad_norm": 4.986753463745117, "learning_rate": 6.681459527526484e-06, "logits/chosen": -1.1812330484390259, "logits/rejected": -0.9130045175552368, "logps/chosen": -0.7792029976844788, "logps/rejected": -2.0369057655334473, "loss": 0.8078, "odds_ratio_loss": 0.28569847345352173, "rewards/accuracies": 1.0, "rewards/chosen": -0.07792030274868011, "rewards/margins": 0.12577028572559357, "rewards/rejected": -0.2036905735731125, "sft_loss": 0.7792029976844788, "step": 5085 }, { "epoch": 0.4, "grad_norm": 7.481835842132568, "learning_rate": 6.675659313529482e-06, "logits/chosen": -1.4299769401550293, "logits/rejected": -1.0885009765625, "logps/chosen": -1.0667505264282227, "logps/rejected": -4.983901500701904, "loss": 1.0969, "odds_ratio_loss": 0.30162471532821655, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10667506605386734, "rewards/margins": 0.39171507954597473, "rewards/rejected": -0.4983901381492615, "sft_loss": 1.0667505264282227, "step": 5090 }, { "epoch": 0.4, "grad_norm": 6.409983158111572, "learning_rate": 6.669856558182384e-06, "logits/chosen": -1.3709847927093506, "logits/rejected": -0.7447436451911926, "logps/chosen": -0.974819004535675, "logps/rejected": -3.636500597000122, "loss": 0.9995, "odds_ratio_loss": 0.2470639944076538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09748189151287079, "rewards/margins": 0.26616817712783813, "rewards/rejected": -0.36365005373954773, "sft_loss": 0.974819004535675, "step": 5095 }, { "epoch": 0.4, "grad_norm": 6.735576629638672, "learning_rate": 6.664051270285801e-06, "logits/chosen": -1.391518473625183, "logits/rejected": -0.8891741633415222, "logps/chosen": -1.0563715696334839, "logps/rejected": -3.8365464210510254, "loss": 1.0792, "odds_ratio_loss": 0.22799015045166016, "rewards/accuracies": 1.0, "rewards/chosen": -0.10563715547323227, "rewards/margins": 0.27801746129989624, "rewards/rejected": -0.3836546242237091, "sft_loss": 1.0563715696334839, "step": 5100 }, { "epoch": 0.4, "grad_norm": 11.849318504333496, "learning_rate": 6.658243458644189e-06, "logits/chosen": -1.4003181457519531, "logits/rejected": -1.0954400300979614, "logps/chosen": -0.8720696568489075, "logps/rejected": -2.591235399246216, "loss": 0.89, "odds_ratio_loss": 0.17881350219249725, "rewards/accuracies": 1.0, "rewards/chosen": -0.08720696717500687, "rewards/margins": 0.17191657423973083, "rewards/rejected": -0.2591235339641571, "sft_loss": 0.8720696568489075, "step": 5105 }, { "epoch": 0.4, "grad_norm": 23.594053268432617, "learning_rate": 6.652433132065834e-06, "logits/chosen": -1.3882420063018799, "logits/rejected": -0.9964066743850708, "logps/chosen": -1.1310899257659912, "logps/rejected": -11.7948637008667, "loss": 1.1367, "odds_ratio_loss": 0.05562058836221695, "rewards/accuracies": 1.0, "rewards/chosen": -0.11310900747776031, "rewards/margins": 1.0663774013519287, "rewards/rejected": -1.1794865131378174, "sft_loss": 1.1310899257659912, "step": 5110 }, { "epoch": 0.4, "grad_norm": 59.16744613647461, "learning_rate": 6.646620299362833e-06, "logits/chosen": -1.3670847415924072, "logits/rejected": -1.0468709468841553, "logps/chosen": -0.8871728181838989, "logps/rejected": -5.707052230834961, "loss": 0.942, "odds_ratio_loss": 0.5480446815490723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08871728926897049, "rewards/margins": 0.4819878935813904, "rewards/rejected": -0.5707052946090698, "sft_loss": 0.8871728181838989, "step": 5115 }, { "epoch": 0.4, "grad_norm": 8.881211280822754, "learning_rate": 6.640804969351086e-06, "logits/chosen": -1.3376655578613281, "logits/rejected": -1.367163896560669, "logps/chosen": -1.4067643880844116, "logps/rejected": -4.877480506896973, "loss": 1.4234, "odds_ratio_loss": 0.16628073155879974, "rewards/accuracies": 1.0, "rewards/chosen": -0.14067645370960236, "rewards/margins": 0.3470715880393982, "rewards/rejected": -0.48774799704551697, "sft_loss": 1.4067643880844116, "step": 5120 }, { "epoch": 0.4, "grad_norm": 1220.0985107421875, "learning_rate": 6.63498715085028e-06, "logits/chosen": -1.1132049560546875, "logits/rejected": -1.5372873544692993, "logps/chosen": -2.2209229469299316, "logps/rejected": -6.843510627746582, "loss": 2.2332, "odds_ratio_loss": 0.12279321998357773, "rewards/accuracies": 1.0, "rewards/chosen": -0.22209230065345764, "rewards/margins": 0.46225887537002563, "rewards/rejected": -0.6843510866165161, "sft_loss": 2.2209229469299316, "step": 5125 }, { "epoch": 0.4, "grad_norm": 9.45090103149414, "learning_rate": 6.62916685268387e-06, "logits/chosen": -1.3377411365509033, "logits/rejected": -0.7452336549758911, "logps/chosen": -0.7855364680290222, "logps/rejected": -2.1455330848693848, "loss": 0.8336, "odds_ratio_loss": 0.4805460572242737, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07855365425348282, "rewards/margins": 0.1359996497631073, "rewards/rejected": -0.21455331146717072, "sft_loss": 0.7855364680290222, "step": 5130 }, { "epoch": 0.4, "grad_norm": 10.302745819091797, "learning_rate": 6.623344083679082e-06, "logits/chosen": -1.4268461465835571, "logits/rejected": -1.1838544607162476, "logps/chosen": -1.1485592126846313, "logps/rejected": -6.531126499176025, "loss": 1.1592, "odds_ratio_loss": 0.10628004372119904, "rewards/accuracies": 1.0, "rewards/chosen": -0.11485592275857925, "rewards/margins": 0.5382567644119263, "rewards/rejected": -0.6531126499176025, "sft_loss": 1.1485592126846313, "step": 5135 }, { "epoch": 0.4, "grad_norm": 11.357346534729004, "learning_rate": 6.617518852666883e-06, "logits/chosen": -1.4212384223937988, "logits/rejected": -0.9435787200927734, "logps/chosen": -1.0578309297561646, "logps/rejected": -3.755695343017578, "loss": 1.0795, "odds_ratio_loss": 0.21710722148418427, "rewards/accuracies": 1.0, "rewards/chosen": -0.10578310489654541, "rewards/margins": 0.26978641748428345, "rewards/rejected": -0.37556952238082886, "sft_loss": 1.0578309297561646, "step": 5140 }, { "epoch": 0.4, "grad_norm": 18.677724838256836, "learning_rate": 6.611691168481976e-06, "logits/chosen": -1.4505398273468018, "logits/rejected": -1.397452712059021, "logps/chosen": -2.092315435409546, "logps/rejected": -2.6640334129333496, "loss": 2.2108, "odds_ratio_loss": 1.1846152544021606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.20923154056072235, "rewards/margins": 0.05717180296778679, "rewards/rejected": -0.2664033770561218, "sft_loss": 2.092315435409546, "step": 5145 }, { "epoch": 0.4, "grad_norm": 7.536815643310547, "learning_rate": 6.605861039962785e-06, "logits/chosen": -1.3447933197021484, "logits/rejected": -0.9513217210769653, "logps/chosen": -1.1527773141860962, "logps/rejected": -5.865206241607666, "loss": 1.1605, "odds_ratio_loss": 0.07718654721975327, "rewards/accuracies": 1.0, "rewards/chosen": -0.1152777299284935, "rewards/margins": 0.47124290466308594, "rewards/rejected": -0.5865205526351929, "sft_loss": 1.1527773141860962, "step": 5150 }, { "epoch": 0.4, "grad_norm": 6.230055332183838, "learning_rate": 6.600028475951438e-06, "logits/chosen": -1.4554965496063232, "logits/rejected": -1.0521572828292847, "logps/chosen": -1.1424763202667236, "logps/rejected": -6.107195854187012, "loss": 1.1544, "odds_ratio_loss": 0.11930395662784576, "rewards/accuracies": 1.0, "rewards/chosen": -0.1142476350069046, "rewards/margins": 0.49647197127342224, "rewards/rejected": -0.610719621181488, "sft_loss": 1.1424763202667236, "step": 5155 }, { "epoch": 0.4, "grad_norm": 11.935591697692871, "learning_rate": 6.594193485293758e-06, "logits/chosen": -1.4703266620635986, "logits/rejected": -1.0110032558441162, "logps/chosen": -1.0646206140518188, "logps/rejected": -4.367392539978027, "loss": 1.0918, "odds_ratio_loss": 0.2720407247543335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10646207630634308, "rewards/margins": 0.3302771747112274, "rewards/rejected": -0.4367392957210541, "sft_loss": 1.0646206140518188, "step": 5160 }, { "epoch": 0.4, "grad_norm": 9.436556816101074, "learning_rate": 6.5883560768392544e-06, "logits/chosen": -1.4851438999176025, "logits/rejected": -1.1477575302124023, "logps/chosen": -0.8953951001167297, "logps/rejected": -4.988603591918945, "loss": 0.9186, "odds_ratio_loss": 0.23208299279212952, "rewards/accuracies": 1.0, "rewards/chosen": -0.08953951299190521, "rewards/margins": 0.4093208312988281, "rewards/rejected": -0.49886035919189453, "sft_loss": 0.8953951001167297, "step": 5165 }, { "epoch": 0.4, "grad_norm": 4.220683574676514, "learning_rate": 6.5825162594410914e-06, "logits/chosen": -1.4410722255706787, "logits/rejected": -0.8721886873245239, "logps/chosen": -0.7598224878311157, "logps/rejected": -1.8489675521850586, "loss": 0.7871, "odds_ratio_loss": 0.27249783277511597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07598225027322769, "rewards/margins": 0.10891450941562653, "rewards/rejected": -0.18489676713943481, "sft_loss": 0.7598224878311157, "step": 5170 }, { "epoch": 0.4, "grad_norm": 5.179640293121338, "learning_rate": 6.576674041956099e-06, "logits/chosen": -1.3173493146896362, "logits/rejected": -0.913482666015625, "logps/chosen": -1.0375252962112427, "logps/rejected": -11.861112594604492, "loss": 1.0394, "odds_ratio_loss": 0.018778596073389053, "rewards/accuracies": 1.0, "rewards/chosen": -0.10375253111124039, "rewards/margins": 1.082358956336975, "rewards/rejected": -1.186111330986023, "sft_loss": 1.0375252962112427, "step": 5175 }, { "epoch": 0.4, "grad_norm": 14.923868179321289, "learning_rate": 6.5708294332447385e-06, "logits/chosen": -1.3341882228851318, "logits/rejected": -0.9410299062728882, "logps/chosen": -0.9867004156112671, "logps/rejected": -1.1775720119476318, "loss": 1.0457, "odds_ratio_loss": 0.5900734663009644, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09867005050182343, "rewards/margins": 0.01908714324235916, "rewards/rejected": -0.11775718629360199, "sft_loss": 0.9867004156112671, "step": 5180 }, { "epoch": 0.4, "grad_norm": 18.309810638427734, "learning_rate": 6.564982442171103e-06, "logits/chosen": -1.444608449935913, "logits/rejected": -1.1630117893218994, "logps/chosen": -0.8816467523574829, "logps/rejected": -3.7716174125671387, "loss": 0.901, "odds_ratio_loss": 0.19364431500434875, "rewards/accuracies": 1.0, "rewards/chosen": -0.08816467970609665, "rewards/margins": 0.2889971137046814, "rewards/rejected": -0.37716180086135864, "sft_loss": 0.8816467523574829, "step": 5185 }, { "epoch": 0.4, "grad_norm": 10.51563835144043, "learning_rate": 6.559133077602895e-06, "logits/chosen": -1.471680998802185, "logits/rejected": -0.9202741384506226, "logps/chosen": -1.4591476917266846, "logps/rejected": -7.845271110534668, "loss": 1.4825, "odds_ratio_loss": 0.23371830582618713, "rewards/accuracies": 1.0, "rewards/chosen": -0.14591476321220398, "rewards/margins": 0.6386123895645142, "rewards/rejected": -0.7845271229743958, "sft_loss": 1.4591476917266846, "step": 5190 }, { "epoch": 0.4, "grad_norm": 58.147037506103516, "learning_rate": 6.55328134841142e-06, "logits/chosen": -1.2235281467437744, "logits/rejected": -0.6818448305130005, "logps/chosen": -1.0554659366607666, "logps/rejected": -10.09231948852539, "loss": 1.0665, "odds_ratio_loss": 0.11016272008419037, "rewards/accuracies": 1.0, "rewards/chosen": -0.10554659366607666, "rewards/margins": 0.903685450553894, "rewards/rejected": -1.0092319250106812, "sft_loss": 1.0554659366607666, "step": 5195 }, { "epoch": 0.4, "grad_norm": 72.4473648071289, "learning_rate": 6.5474272634715675e-06, "logits/chosen": -1.3383867740631104, "logits/rejected": -0.9712162017822266, "logps/chosen": -1.0566487312316895, "logps/rejected": -6.484910488128662, "loss": 1.0775, "odds_ratio_loss": 0.20842652022838593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10566486418247223, "rewards/margins": 0.542826235294342, "rewards/rejected": -0.6484910845756531, "sft_loss": 1.0566487312316895, "step": 5200 }, { "epoch": 0.4, "grad_norm": 41.163124084472656, "learning_rate": 6.541570831661802e-06, "logits/chosen": -1.4022243022918701, "logits/rejected": -1.1713143587112427, "logps/chosen": -1.5733802318572998, "logps/rejected": -8.44092082977295, "loss": 1.5746, "odds_ratio_loss": 0.01258087158203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.15733802318572998, "rewards/margins": 0.686754047870636, "rewards/rejected": -0.844092071056366, "sft_loss": 1.5733802318572998, "step": 5205 }, { "epoch": 0.41, "grad_norm": 7.19493293762207, "learning_rate": 6.535712061864144e-06, "logits/chosen": -1.549918293952942, "logits/rejected": -1.0348308086395264, "logps/chosen": -1.1737009286880493, "logps/rejected": -2.1659064292907715, "loss": 1.2126, "odds_ratio_loss": 0.3890572190284729, "rewards/accuracies": 1.0, "rewards/chosen": -0.11737009137868881, "rewards/margins": 0.09922054409980774, "rewards/rejected": -0.21659064292907715, "sft_loss": 1.1737009286880493, "step": 5210 }, { "epoch": 0.41, "grad_norm": 23.68317222595215, "learning_rate": 6.529850962964164e-06, "logits/chosen": -1.4581935405731201, "logits/rejected": -1.1173006296157837, "logps/chosen": -0.9190993309020996, "logps/rejected": -5.095643043518066, "loss": 0.9294, "odds_ratio_loss": 0.10286466777324677, "rewards/accuracies": 1.0, "rewards/chosen": -0.09190993010997772, "rewards/margins": 0.417654424905777, "rewards/rejected": -0.5095642805099487, "sft_loss": 0.9190993309020996, "step": 5215 }, { "epoch": 0.41, "grad_norm": 12.550178527832031, "learning_rate": 6.523987543850959e-06, "logits/chosen": -1.3836402893066406, "logits/rejected": -1.049728274345398, "logps/chosen": -0.9922205209732056, "logps/rejected": -2.728982925415039, "loss": 1.0135, "odds_ratio_loss": 0.21254947781562805, "rewards/accuracies": 1.0, "rewards/chosen": -0.09922204911708832, "rewards/margins": 0.1736762523651123, "rewards/rejected": -0.27289828658103943, "sft_loss": 0.9922205209732056, "step": 5220 }, { "epoch": 0.41, "grad_norm": 15.591679573059082, "learning_rate": 6.518121813417151e-06, "logits/chosen": -1.3791759014129639, "logits/rejected": -0.9591207504272461, "logps/chosen": -1.084330677986145, "logps/rejected": -3.4170398712158203, "loss": 1.1007, "odds_ratio_loss": 0.16401781141757965, "rewards/accuracies": 1.0, "rewards/chosen": -0.10843305289745331, "rewards/margins": 0.23327095806598663, "rewards/rejected": -0.34170401096343994, "sft_loss": 1.084330677986145, "step": 5225 }, { "epoch": 0.41, "grad_norm": 16.50823402404785, "learning_rate": 6.5122537805588655e-06, "logits/chosen": -1.3790065050125122, "logits/rejected": -1.202798843383789, "logps/chosen": -0.7999995946884155, "logps/rejected": -6.863905906677246, "loss": 0.8023, "odds_ratio_loss": 0.02324349619448185, "rewards/accuracies": 1.0, "rewards/chosen": -0.07999996095895767, "rewards/margins": 0.6063905954360962, "rewards/rejected": -0.6863905787467957, "sft_loss": 0.7999995946884155, "step": 5230 }, { "epoch": 0.41, "grad_norm": 4.774572372436523, "learning_rate": 6.50638345417572e-06, "logits/chosen": -1.3010079860687256, "logits/rejected": -1.19615638256073, "logps/chosen": -1.311397671699524, "logps/rejected": -9.872981071472168, "loss": 1.331, "odds_ratio_loss": 0.1961272656917572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13113977015018463, "rewards/margins": 0.856158435344696, "rewards/rejected": -0.9872981309890747, "sft_loss": 1.311397671699524, "step": 5235 }, { "epoch": 0.41, "grad_norm": 19.700647354125977, "learning_rate": 6.500510843170808e-06, "logits/chosen": -1.4291975498199463, "logits/rejected": -1.0822474956512451, "logps/chosen": -1.1227763891220093, "logps/rejected": -15.763837814331055, "loss": 1.1311, "odds_ratio_loss": 0.08306220918893814, "rewards/accuracies": 1.0, "rewards/chosen": -0.11227764934301376, "rewards/margins": 1.4641063213348389, "rewards/rejected": -1.5763839483261108, "sft_loss": 1.1227763891220093, "step": 5240 }, { "epoch": 0.41, "grad_norm": 17.024803161621094, "learning_rate": 6.494635956450688e-06, "logits/chosen": -1.4313437938690186, "logits/rejected": -0.7469003796577454, "logps/chosen": -0.8249877691268921, "logps/rejected": -3.162346601486206, "loss": 0.8581, "odds_ratio_loss": 0.33130815625190735, "rewards/accuracies": 1.0, "rewards/chosen": -0.08249877393245697, "rewards/margins": 0.23373588919639587, "rewards/rejected": -0.31623467803001404, "sft_loss": 0.8249877691268921, "step": 5245 }, { "epoch": 0.41, "grad_norm": 8.548990249633789, "learning_rate": 6.488758802925373e-06, "logits/chosen": -1.1915152072906494, "logits/rejected": -1.0353416204452515, "logps/chosen": -0.6875573992729187, "logps/rejected": -3.8330788612365723, "loss": 0.7024, "odds_ratio_loss": 0.14798401296138763, "rewards/accuracies": 1.0, "rewards/chosen": -0.06875574588775635, "rewards/margins": 0.3145521581172943, "rewards/rejected": -0.38330790400505066, "sft_loss": 0.6875573992729187, "step": 5250 }, { "epoch": 0.41, "grad_norm": 7.481503963470459, "learning_rate": 6.482879391508317e-06, "logits/chosen": -1.4916441440582275, "logits/rejected": -1.2765228748321533, "logps/chosen": -0.8017631769180298, "logps/rejected": -6.369771957397461, "loss": 0.8202, "odds_ratio_loss": 0.18478266894817352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08017632365226746, "rewards/margins": 0.5568008422851562, "rewards/rejected": -0.6369771957397461, "sft_loss": 0.8017631769180298, "step": 5255 }, { "epoch": 0.41, "grad_norm": 6.256443977355957, "learning_rate": 6.476997731116386e-06, "logits/chosen": -1.363231897354126, "logits/rejected": -0.838513195514679, "logps/chosen": -0.9848578572273254, "logps/rejected": -4.066619396209717, "loss": 1.0037, "odds_ratio_loss": 0.1887107640504837, "rewards/accuracies": 1.0, "rewards/chosen": -0.0984857901930809, "rewards/margins": 0.308176189661026, "rewards/rejected": -0.4066619873046875, "sft_loss": 0.9848578572273254, "step": 5260 }, { "epoch": 0.41, "grad_norm": 15.482699394226074, "learning_rate": 6.471113830669872e-06, "logits/chosen": -1.4585330486297607, "logits/rejected": -1.3035043478012085, "logps/chosen": -1.1082388162612915, "logps/rejected": -6.114068508148193, "loss": 1.161, "odds_ratio_loss": 0.5274852514266968, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11082388460636139, "rewards/margins": 0.5005830526351929, "rewards/rejected": -0.6114069223403931, "sft_loss": 1.1082388162612915, "step": 5265 }, { "epoch": 0.41, "grad_norm": 13.043397903442383, "learning_rate": 6.465227699092452e-06, "logits/chosen": -1.4129165410995483, "logits/rejected": -0.8737384080886841, "logps/chosen": -1.0952141284942627, "logps/rejected": -7.8089280128479, "loss": 1.1194, "odds_ratio_loss": 0.2414560317993164, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10952140390872955, "rewards/margins": 0.671371340751648, "rewards/rejected": -0.7808927893638611, "sft_loss": 1.0952141284942627, "step": 5270 }, { "epoch": 0.41, "grad_norm": 14.405864715576172, "learning_rate": 6.459339345311194e-06, "logits/chosen": -1.2031883001327515, "logits/rejected": -1.09691321849823, "logps/chosen": -1.3176788091659546, "logps/rejected": -2.5202293395996094, "loss": 1.3548, "odds_ratio_loss": 0.37098708748817444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1317678838968277, "rewards/margins": 0.12025503814220428, "rewards/rejected": -0.252022922039032, "sft_loss": 1.3176788091659546, "step": 5275 }, { "epoch": 0.41, "grad_norm": 5.715144157409668, "learning_rate": 6.4534487782565346e-06, "logits/chosen": -1.4191725254058838, "logits/rejected": -0.5255266427993774, "logps/chosen": -0.8059912919998169, "logps/rejected": -7.7237420082092285, "loss": 0.831, "odds_ratio_loss": 0.24965138733386993, "rewards/accuracies": 1.0, "rewards/chosen": -0.08059912174940109, "rewards/margins": 0.6917750835418701, "rewards/rejected": -0.7723742127418518, "sft_loss": 0.8059912919998169, "step": 5280 }, { "epoch": 0.41, "grad_norm": 6.963302135467529, "learning_rate": 6.447556006862266e-06, "logits/chosen": -1.422668218612671, "logits/rejected": -1.079911470413208, "logps/chosen": -1.1761057376861572, "logps/rejected": -5.600368022918701, "loss": 1.2187, "odds_ratio_loss": 0.4255724549293518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11761058866977692, "rewards/margins": 0.4424262046813965, "rewards/rejected": -0.5600367784500122, "sft_loss": 1.1761057376861572, "step": 5285 }, { "epoch": 0.41, "grad_norm": 33.12194061279297, "learning_rate": 6.441661040065523e-06, "logits/chosen": -1.6109205484390259, "logits/rejected": -1.3014240264892578, "logps/chosen": -0.7997133135795593, "logps/rejected": -3.3593928813934326, "loss": 0.8455, "odds_ratio_loss": 0.45820364356040955, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0799713283777237, "rewards/margins": 0.2559679448604584, "rewards/rejected": -0.33593928813934326, "sft_loss": 0.7997133135795593, "step": 5290 }, { "epoch": 0.41, "grad_norm": 515.0574340820312, "learning_rate": 6.435763886806774e-06, "logits/chosen": -1.4338130950927734, "logits/rejected": -1.323038101196289, "logps/chosen": -1.8791614770889282, "logps/rejected": -4.376596450805664, "loss": 1.9784, "odds_ratio_loss": 0.992806613445282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1879161298274994, "rewards/margins": 0.2497435063123703, "rewards/rejected": -0.4376596510410309, "sft_loss": 1.8791614770889282, "step": 5295 }, { "epoch": 0.41, "grad_norm": 39.97163391113281, "learning_rate": 6.4298645560297976e-06, "logits/chosen": -1.5293049812316895, "logits/rejected": -1.3366972208023071, "logps/chosen": -0.8182582855224609, "logps/rejected": -1.072858452796936, "loss": 0.9715, "odds_ratio_loss": 1.532416582107544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08182583749294281, "rewards/margins": 0.025460004806518555, "rewards/rejected": -0.10728584229946136, "sft_loss": 0.8182582855224609, "step": 5300 }, { "epoch": 0.41, "grad_norm": 9.11904239654541, "learning_rate": 6.42396305668168e-06, "logits/chosen": -1.2009570598602295, "logits/rejected": -1.1244704723358154, "logps/chosen": -1.0162278413772583, "logps/rejected": -5.84080696105957, "loss": 1.021, "odds_ratio_loss": 0.04781056195497513, "rewards/accuracies": 1.0, "rewards/chosen": -0.10162278264760971, "rewards/margins": 0.48245781660079956, "rewards/rejected": -0.5840806365013123, "sft_loss": 1.0162278413772583, "step": 5305 }, { "epoch": 0.41, "grad_norm": 10.741803169250488, "learning_rate": 6.418059397712792e-06, "logits/chosen": -1.3627127408981323, "logits/rejected": -0.805992603302002, "logps/chosen": -0.977192223072052, "logps/rejected": -3.0407779216766357, "loss": 0.9869, "odds_ratio_loss": 0.09715723246335983, "rewards/accuracies": 1.0, "rewards/chosen": -0.0977192148566246, "rewards/margins": 0.20635858178138733, "rewards/rejected": -0.30407780408859253, "sft_loss": 0.977192223072052, "step": 5310 }, { "epoch": 0.41, "grad_norm": 13.376646041870117, "learning_rate": 6.412153588076785e-06, "logits/chosen": -1.2463794946670532, "logits/rejected": -0.7775799036026001, "logps/chosen": -0.7438236474990845, "logps/rejected": -6.562718868255615, "loss": 0.7586, "odds_ratio_loss": 0.14801888167858124, "rewards/accuracies": 1.0, "rewards/chosen": -0.07438236474990845, "rewards/margins": 0.5818895697593689, "rewards/rejected": -0.6562718749046326, "sft_loss": 0.7438236474990845, "step": 5315 }, { "epoch": 0.41, "grad_norm": 11.7168607711792, "learning_rate": 6.406245636730568e-06, "logits/chosen": -1.44731867313385, "logits/rejected": -0.870677649974823, "logps/chosen": -1.2856199741363525, "logps/rejected": -4.146407127380371, "loss": 1.311, "odds_ratio_loss": 0.25399282574653625, "rewards/accuracies": 1.0, "rewards/chosen": -0.12856200337409973, "rewards/margins": 0.2860787510871887, "rewards/rejected": -0.41464075446128845, "sft_loss": 1.2856199741363525, "step": 5320 }, { "epoch": 0.41, "grad_norm": 7.065237045288086, "learning_rate": 6.4003355526342995e-06, "logits/chosen": -1.4006376266479492, "logits/rejected": -1.0352108478546143, "logps/chosen": -1.1861366033554077, "logps/rejected": -3.761228084564209, "loss": 1.2033, "odds_ratio_loss": 0.17140784859657288, "rewards/accuracies": 1.0, "rewards/chosen": -0.11861366033554077, "rewards/margins": 0.2575092017650604, "rewards/rejected": -0.3761228621006012, "sft_loss": 1.1861366033554077, "step": 5325 }, { "epoch": 0.41, "grad_norm": 13.023364067077637, "learning_rate": 6.39442334475137e-06, "logits/chosen": -1.2521476745605469, "logits/rejected": -1.1121388673782349, "logps/chosen": -1.2779855728149414, "logps/rejected": -4.648769855499268, "loss": 1.3339, "odds_ratio_loss": 0.5596238970756531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12779855728149414, "rewards/margins": 0.33707842230796814, "rewards/rejected": -0.4648769795894623, "sft_loss": 1.2779855728149414, "step": 5330 }, { "epoch": 0.42, "grad_norm": 26.406526565551758, "learning_rate": 6.388509022048396e-06, "logits/chosen": -1.4632203578948975, "logits/rejected": -1.218110203742981, "logps/chosen": -0.8516885638237, "logps/rejected": -5.130523681640625, "loss": 0.886, "odds_ratio_loss": 0.34357717633247375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08516885340213776, "rewards/margins": 0.4278835356235504, "rewards/rejected": -0.5130523443222046, "sft_loss": 0.8516885638237, "step": 5335 }, { "epoch": 0.42, "grad_norm": 183.27879333496094, "learning_rate": 6.3825925934951986e-06, "logits/chosen": -1.382738709449768, "logits/rejected": -0.9956004023551941, "logps/chosen": -1.4411137104034424, "logps/rejected": -5.785558223724365, "loss": 1.4701, "odds_ratio_loss": 0.29017865657806396, "rewards/accuracies": 1.0, "rewards/chosen": -0.14411136507987976, "rewards/margins": 0.43444448709487915, "rewards/rejected": -0.5785558819770813, "sft_loss": 1.4411137104034424, "step": 5340 }, { "epoch": 0.42, "grad_norm": 8.949464797973633, "learning_rate": 6.376674068064792e-06, "logits/chosen": -1.3090190887451172, "logits/rejected": -1.3267171382904053, "logps/chosen": -0.7871259450912476, "logps/rejected": -3.755983352661133, "loss": 0.8014, "odds_ratio_loss": 0.14319057762622833, "rewards/accuracies": 1.0, "rewards/chosen": -0.0787125900387764, "rewards/margins": 0.29688572883605957, "rewards/rejected": -0.37559834122657776, "sft_loss": 0.7871259450912476, "step": 5345 }, { "epoch": 0.42, "grad_norm": 15.70522689819336, "learning_rate": 6.370753454733371e-06, "logits/chosen": -1.4009630680084229, "logits/rejected": -1.197137713432312, "logps/chosen": -0.9578830599784851, "logps/rejected": -4.780231475830078, "loss": 0.9862, "odds_ratio_loss": 0.2829952836036682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09578831493854523, "rewards/margins": 0.3822348415851593, "rewards/rejected": -0.47802314162254333, "sft_loss": 0.9578830599784851, "step": 5350 }, { "epoch": 0.42, "grad_norm": 23.035987854003906, "learning_rate": 6.3648307624803e-06, "logits/chosen": -1.314186692237854, "logits/rejected": -1.559711217880249, "logps/chosen": -0.8188357353210449, "logps/rejected": -9.391572952270508, "loss": 0.8198, "odds_ratio_loss": 0.010050063952803612, "rewards/accuracies": 1.0, "rewards/chosen": -0.08188357204198837, "rewards/margins": 0.8572737574577332, "rewards/rejected": -0.9391573071479797, "sft_loss": 0.8188357353210449, "step": 5355 }, { "epoch": 0.42, "grad_norm": 24.58151626586914, "learning_rate": 6.358906000288091e-06, "logits/chosen": -1.376570463180542, "logits/rejected": -1.3238173723220825, "logps/chosen": -0.7370100021362305, "logps/rejected": -5.758879661560059, "loss": 0.7583, "odds_ratio_loss": 0.21306517720222473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07370099425315857, "rewards/margins": 0.5021870136260986, "rewards/rejected": -0.5758880376815796, "sft_loss": 0.7370100021362305, "step": 5360 }, { "epoch": 0.42, "grad_norm": 12.155942916870117, "learning_rate": 6.352979177142399e-06, "logits/chosen": -1.461085557937622, "logits/rejected": -0.873151957988739, "logps/chosen": -0.9852715730667114, "logps/rejected": -9.310620307922363, "loss": 0.9927, "odds_ratio_loss": 0.07405222952365875, "rewards/accuracies": 1.0, "rewards/chosen": -0.09852716326713562, "rewards/margins": 0.832534909248352, "rewards/rejected": -0.9310620427131653, "sft_loss": 0.9852715730667114, "step": 5365 }, { "epoch": 0.42, "grad_norm": 10.086821556091309, "learning_rate": 6.347050302032005e-06, "logits/chosen": -1.2039637565612793, "logits/rejected": -1.0078706741333008, "logps/chosen": -0.9459770917892456, "logps/rejected": -4.682739734649658, "loss": 0.9889, "odds_ratio_loss": 0.42916393280029297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0945977121591568, "rewards/margins": 0.37367627024650574, "rewards/rejected": -0.4682740271091461, "sft_loss": 0.9459770917892456, "step": 5370 }, { "epoch": 0.42, "grad_norm": 13.609196662902832, "learning_rate": 6.341119383948799e-06, "logits/chosen": -1.5448615550994873, "logits/rejected": -1.2496637105941772, "logps/chosen": -0.7335996627807617, "logps/rejected": -1.3010228872299194, "loss": 0.7845, "odds_ratio_loss": 0.5093629956245422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07335996627807617, "rewards/margins": 0.05674232169985771, "rewards/rejected": -0.13010229170322418, "sft_loss": 0.7335996627807617, "step": 5375 }, { "epoch": 0.42, "grad_norm": 37.54430389404297, "learning_rate": 6.335186431887772e-06, "logits/chosen": -1.4095587730407715, "logits/rejected": -1.1966378688812256, "logps/chosen": -1.336415410041809, "logps/rejected": -1.9364235401153564, "loss": 1.4304, "odds_ratio_loss": 0.9395570755004883, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1336415410041809, "rewards/margins": 0.060000818222761154, "rewards/rejected": -0.19364234805107117, "sft_loss": 1.336415410041809, "step": 5380 }, { "epoch": 0.42, "grad_norm": 8.126177787780762, "learning_rate": 6.329251454847e-06, "logits/chosen": -1.42582368850708, "logits/rejected": -0.7910014986991882, "logps/chosen": -0.8624337911605835, "logps/rejected": -1.4641921520233154, "loss": 0.9135, "odds_ratio_loss": 0.5102102756500244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08624337613582611, "rewards/margins": 0.06017584353685379, "rewards/rejected": -0.1464192271232605, "sft_loss": 0.8624337911605835, "step": 5385 }, { "epoch": 0.42, "grad_norm": 5.362218856811523, "learning_rate": 6.3233144618276265e-06, "logits/chosen": -1.4800853729248047, "logits/rejected": -0.9077849388122559, "logps/chosen": -0.9694632291793823, "logps/rejected": -10.608955383300781, "loss": 0.9849, "odds_ratio_loss": 0.15407711267471313, "rewards/accuracies": 1.0, "rewards/chosen": -0.09694632887840271, "rewards/margins": 0.9639492034912109, "rewards/rejected": -1.060895562171936, "sft_loss": 0.9694632291793823, "step": 5390 }, { "epoch": 0.42, "grad_norm": 5.7052083015441895, "learning_rate": 6.317375461833859e-06, "logits/chosen": -1.4369672536849976, "logits/rejected": -1.213986873626709, "logps/chosen": -0.9510200619697571, "logps/rejected": -4.338289260864258, "loss": 0.9789, "odds_ratio_loss": 0.278502881526947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09510200470685959, "rewards/margins": 0.3387269377708435, "rewards/rejected": -0.4338289797306061, "sft_loss": 0.9510200619697571, "step": 5395 }, { "epoch": 0.42, "grad_norm": 6.42952823638916, "learning_rate": 6.311434463872941e-06, "logits/chosen": -1.4391467571258545, "logits/rejected": -0.944113552570343, "logps/chosen": -0.8725773692131042, "logps/rejected": -2.7120556831359863, "loss": 0.9056, "odds_ratio_loss": 0.3303033709526062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0872577428817749, "rewards/margins": 0.18394781649112701, "rewards/rejected": -0.2712055742740631, "sft_loss": 0.8725773692131042, "step": 5400 }, { "epoch": 0.42, "grad_norm": 8.709639549255371, "learning_rate": 6.305491476955154e-06, "logits/chosen": -1.294272541999817, "logits/rejected": -1.2669130563735962, "logps/chosen": -1.11087965965271, "logps/rejected": -3.0580954551696777, "loss": 1.1525, "odds_ratio_loss": 0.41599932312965393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11108797788619995, "rewards/margins": 0.19472156465053558, "rewards/rejected": -0.30580952763557434, "sft_loss": 1.11087965965271, "step": 5405 }, { "epoch": 0.42, "grad_norm": 12.110684394836426, "learning_rate": 6.299546510093791e-06, "logits/chosen": -1.3638485670089722, "logits/rejected": -1.0348577499389648, "logps/chosen": -1.0306559801101685, "logps/rejected": -4.842162132263184, "loss": 1.0958, "odds_ratio_loss": 0.6510428190231323, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.1030655950307846, "rewards/margins": 0.38115063309669495, "rewards/rejected": -0.48421621322631836, "sft_loss": 1.0306559801101685, "step": 5410 }, { "epoch": 0.42, "grad_norm": 5.196389675140381, "learning_rate": 6.293599572305147e-06, "logits/chosen": -1.3511767387390137, "logits/rejected": -0.735332190990448, "logps/chosen": -1.0389249324798584, "logps/rejected": -15.559942245483398, "loss": 1.0391, "odds_ratio_loss": 0.0016535281902179122, "rewards/accuracies": 1.0, "rewards/chosen": -0.1038924902677536, "rewards/margins": 1.452101707458496, "rewards/rejected": -1.5559942722320557, "sft_loss": 1.0389249324798584, "step": 5415 }, { "epoch": 0.42, "grad_norm": 13.545232772827148, "learning_rate": 6.287650672608512e-06, "logits/chosen": -1.0957015752792358, "logits/rejected": -1.0863041877746582, "logps/chosen": -0.9821407198905945, "logps/rejected": -4.82470178604126, "loss": 0.9899, "odds_ratio_loss": 0.0780685544013977, "rewards/accuracies": 1.0, "rewards/chosen": -0.09821407496929169, "rewards/margins": 0.38425612449645996, "rewards/rejected": -0.48247018456459045, "sft_loss": 0.9821407198905945, "step": 5420 }, { "epoch": 0.42, "grad_norm": 13.815802574157715, "learning_rate": 6.281699820026144e-06, "logits/chosen": -1.2515912055969238, "logits/rejected": -0.8593171834945679, "logps/chosen": -1.2910552024841309, "logps/rejected": -4.360851287841797, "loss": 1.3075, "odds_ratio_loss": 0.16433535516262054, "rewards/accuracies": 1.0, "rewards/chosen": -0.12910553812980652, "rewards/margins": 0.30697956681251526, "rewards/rejected": -0.43608513474464417, "sft_loss": 1.2910552024841309, "step": 5425 }, { "epoch": 0.42, "grad_norm": 120.34965515136719, "learning_rate": 6.275747023583266e-06, "logits/chosen": -1.3192626237869263, "logits/rejected": -1.2971595525741577, "logps/chosen": -0.8860888481140137, "logps/rejected": -7.4777069091796875, "loss": 0.9032, "odds_ratio_loss": 0.1707981675863266, "rewards/accuracies": 1.0, "rewards/chosen": -0.08860887587070465, "rewards/margins": 0.6591618657112122, "rewards/rejected": -0.7477707266807556, "sft_loss": 0.8860888481140137, "step": 5430 }, { "epoch": 0.42, "grad_norm": 14.62352466583252, "learning_rate": 6.269792292308054e-06, "logits/chosen": -1.5659847259521484, "logits/rejected": -1.1328372955322266, "logps/chosen": -0.748722493648529, "logps/rejected": -3.743081569671631, "loss": 0.7556, "odds_ratio_loss": 0.06894762814044952, "rewards/accuracies": 1.0, "rewards/chosen": -0.07487224042415619, "rewards/margins": 0.29943591356277466, "rewards/rejected": -0.37430819869041443, "sft_loss": 0.748722493648529, "step": 5435 }, { "epoch": 0.42, "grad_norm": 12.701285362243652, "learning_rate": 6.263835635231612e-06, "logits/chosen": -1.4254283905029297, "logits/rejected": -0.7170482873916626, "logps/chosen": -1.096375823020935, "logps/rejected": -8.693497657775879, "loss": 1.1015, "odds_ratio_loss": 0.05131556838750839, "rewards/accuracies": 1.0, "rewards/chosen": -0.10963758081197739, "rewards/margins": 0.7597121596336365, "rewards/rejected": -0.8693498373031616, "sft_loss": 1.096375823020935, "step": 5440 }, { "epoch": 0.42, "grad_norm": 18.322486877441406, "learning_rate": 6.257877061387966e-06, "logits/chosen": -1.403956651687622, "logits/rejected": -1.0851489305496216, "logps/chosen": -0.80475914478302, "logps/rejected": -8.924013137817383, "loss": 0.8272, "odds_ratio_loss": 0.22463825345039368, "rewards/accuracies": 1.0, "rewards/chosen": -0.08047591149806976, "rewards/margins": 0.8119255304336548, "rewards/rejected": -0.8924013376235962, "sft_loss": 0.80475914478302, "step": 5445 }, { "epoch": 0.42, "grad_norm": 61.04814147949219, "learning_rate": 6.25191657981405e-06, "logits/chosen": -1.3005506992340088, "logits/rejected": -1.1514785289764404, "logps/chosen": -0.8403207063674927, "logps/rejected": -9.946390151977539, "loss": 0.8522, "odds_ratio_loss": 0.11916428804397583, "rewards/accuracies": 1.0, "rewards/chosen": -0.0840320736169815, "rewards/margins": 0.9106069803237915, "rewards/rejected": -0.9946390390396118, "sft_loss": 0.8403207063674927, "step": 5450 }, { "epoch": 0.42, "grad_norm": 25.489612579345703, "learning_rate": 6.24595419954969e-06, "logits/chosen": -1.4496216773986816, "logits/rejected": -0.9528179168701172, "logps/chosen": -1.1614158153533936, "logps/rejected": -1.9910621643066406, "loss": 1.2025, "odds_ratio_loss": 0.41043177247047424, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11614159494638443, "rewards/margins": 0.08296463638544083, "rewards/rejected": -0.19910623133182526, "sft_loss": 1.1614158153533936, "step": 5455 }, { "epoch": 0.42, "grad_norm": 6.294222354888916, "learning_rate": 6.239989929637595e-06, "logits/chosen": -1.4808690547943115, "logits/rejected": -0.9784132838249207, "logps/chosen": -0.863582968711853, "logps/rejected": -1.8838698863983154, "loss": 0.8951, "odds_ratio_loss": 0.3148866593837738, "rewards/accuracies": 1.0, "rewards/chosen": -0.08635830134153366, "rewards/margins": 0.10202869027853012, "rewards/rejected": -0.18838700652122498, "sft_loss": 0.863582968711853, "step": 5460 }, { "epoch": 0.43, "grad_norm": 29.04526710510254, "learning_rate": 6.234023779123337e-06, "logits/chosen": -1.4023199081420898, "logits/rejected": -1.3432037830352783, "logps/chosen": -0.9532560110092163, "logps/rejected": -3.891387462615967, "loss": 0.9783, "odds_ratio_loss": 0.25030574202537537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09532561153173447, "rewards/margins": 0.29381316900253296, "rewards/rejected": -0.38913875818252563, "sft_loss": 0.9532560110092163, "step": 5465 }, { "epoch": 0.43, "grad_norm": 26.674266815185547, "learning_rate": 6.228055757055339e-06, "logits/chosen": -1.380295991897583, "logits/rejected": -1.1915805339813232, "logps/chosen": -0.8241817355155945, "logps/rejected": -4.265925407409668, "loss": 0.8417, "odds_ratio_loss": 0.1755792200565338, "rewards/accuracies": 1.0, "rewards/chosen": -0.08241816610097885, "rewards/margins": 0.3441743552684784, "rewards/rejected": -0.42659252882003784, "sft_loss": 0.8241817355155945, "step": 5470 }, { "epoch": 0.43, "grad_norm": 6.6642560958862305, "learning_rate": 6.222085872484867e-06, "logits/chosen": -1.4019719362258911, "logits/rejected": -1.0525364875793457, "logps/chosen": -1.1212289333343506, "logps/rejected": -5.022209167480469, "loss": 1.1447, "odds_ratio_loss": 0.2343233823776245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11212290823459625, "rewards/margins": 0.39009803533554077, "rewards/rejected": -0.5022209286689758, "sft_loss": 1.1212289333343506, "step": 5475 }, { "epoch": 0.43, "grad_norm": 11.184894561767578, "learning_rate": 6.216114134466005e-06, "logits/chosen": -1.342789888381958, "logits/rejected": -1.2364380359649658, "logps/chosen": -0.923974335193634, "logps/rejected": -0.862908661365509, "loss": 1.0542, "odds_ratio_loss": 1.3019092082977295, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.09239742904901505, "rewards/margins": -0.006106560118496418, "rewards/rejected": -0.0862908661365509, "sft_loss": 0.923974335193634, "step": 5480 }, { "epoch": 0.43, "grad_norm": 49.761600494384766, "learning_rate": 6.210140552055656e-06, "logits/chosen": -1.1327351331710815, "logits/rejected": -1.489861011505127, "logps/chosen": -0.8854734301567078, "logps/rejected": -8.11977481842041, "loss": 0.898, "odds_ratio_loss": 0.12550660967826843, "rewards/accuracies": 1.0, "rewards/chosen": -0.08854734897613525, "rewards/margins": 0.7234302163124084, "rewards/rejected": -0.8119775652885437, "sft_loss": 0.8854734301567078, "step": 5485 }, { "epoch": 0.43, "grad_norm": 5.152491092681885, "learning_rate": 6.204165134313514e-06, "logits/chosen": -1.3554954528808594, "logits/rejected": -0.7023124694824219, "logps/chosen": -0.9967812299728394, "logps/rejected": -10.33210563659668, "loss": 0.997, "odds_ratio_loss": 0.0022422696929425, "rewards/accuracies": 1.0, "rewards/chosen": -0.09967813640832901, "rewards/margins": 0.9335324168205261, "rewards/rejected": -1.0332105159759521, "sft_loss": 0.9967812299728394, "step": 5490 }, { "epoch": 0.43, "grad_norm": 26.3110408782959, "learning_rate": 6.198187890302059e-06, "logits/chosen": -1.0693590641021729, "logits/rejected": -0.8451949954032898, "logps/chosen": -1.0392181873321533, "logps/rejected": -6.271853446960449, "loss": 1.057, "odds_ratio_loss": 0.1776437610387802, "rewards/accuracies": 1.0, "rewards/chosen": -0.10392183065414429, "rewards/margins": 0.5232634544372559, "rewards/rejected": -0.6271853446960449, "sft_loss": 1.0392181873321533, "step": 5495 }, { "epoch": 0.43, "grad_norm": 8.049160957336426, "learning_rate": 6.19220882908654e-06, "logits/chosen": -1.2815943956375122, "logits/rejected": -1.4038329124450684, "logps/chosen": -1.0970063209533691, "logps/rejected": -12.82702922821045, "loss": 1.1206, "odds_ratio_loss": 0.2361186295747757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10970063507556915, "rewards/margins": 1.1730024814605713, "rewards/rejected": -1.282702922821045, "sft_loss": 1.0970063209533691, "step": 5500 }, { "epoch": 0.43, "grad_norm": 147.5748291015625, "learning_rate": 6.1862279597349625e-06, "logits/chosen": -1.405278205871582, "logits/rejected": -1.2007181644439697, "logps/chosen": -1.237056016921997, "logps/rejected": -9.997976303100586, "loss": 1.2397, "odds_ratio_loss": 0.026751240715384483, "rewards/accuracies": 1.0, "rewards/chosen": -0.12370558828115463, "rewards/margins": 0.8760920763015747, "rewards/rejected": -0.9997976422309875, "sft_loss": 1.237056016921997, "step": 5505 }, { "epoch": 0.43, "grad_norm": 24.764318466186523, "learning_rate": 6.180245291318074e-06, "logits/chosen": -1.4413025379180908, "logits/rejected": -0.7700681686401367, "logps/chosen": -0.8640216588973999, "logps/rejected": -5.9592132568359375, "loss": 0.8991, "odds_ratio_loss": 0.3511553108692169, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08640217781066895, "rewards/margins": 0.5095191597938538, "rewards/rejected": -0.5959213376045227, "sft_loss": 0.8640216588973999, "step": 5510 }, { "epoch": 0.43, "grad_norm": 6.59911584854126, "learning_rate": 6.174260832909355e-06, "logits/chosen": -1.4317357540130615, "logits/rejected": -0.7478057146072388, "logps/chosen": -0.8837820291519165, "logps/rejected": -1.948574423789978, "loss": 0.9202, "odds_ratio_loss": 0.36417558789253235, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08837820589542389, "rewards/margins": 0.10647924989461899, "rewards/rejected": -0.19485744833946228, "sft_loss": 0.8837820291519165, "step": 5515 }, { "epoch": 0.43, "grad_norm": 189.65814208984375, "learning_rate": 6.168274593584991e-06, "logits/chosen": -1.2420986890792847, "logits/rejected": -1.0643844604492188, "logps/chosen": -0.9428080320358276, "logps/rejected": -6.143196105957031, "loss": 0.9637, "odds_ratio_loss": 0.20935705304145813, "rewards/accuracies": 1.0, "rewards/chosen": -0.09428079426288605, "rewards/margins": 0.5200387835502625, "rewards/rejected": -0.6143196225166321, "sft_loss": 0.9428080320358276, "step": 5520 }, { "epoch": 0.43, "grad_norm": 13.59572696685791, "learning_rate": 6.162286582423876e-06, "logits/chosen": -1.3535109758377075, "logits/rejected": -0.9895867109298706, "logps/chosen": -1.0450531244277954, "logps/rejected": -9.30381965637207, "loss": 1.0603, "odds_ratio_loss": 0.1525220423936844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10450531542301178, "rewards/margins": 0.8258765935897827, "rewards/rejected": -0.9303819537162781, "sft_loss": 1.0450531244277954, "step": 5525 }, { "epoch": 0.43, "grad_norm": 21.94033432006836, "learning_rate": 6.156296808507588e-06, "logits/chosen": -1.4038699865341187, "logits/rejected": -1.224493384361267, "logps/chosen": -0.7861626744270325, "logps/rejected": -1.9210304021835327, "loss": 0.8439, "odds_ratio_loss": 0.5771653652191162, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07861627638339996, "rewards/margins": 0.11348676681518555, "rewards/rejected": -0.1921030431985855, "sft_loss": 0.7861626744270325, "step": 5530 }, { "epoch": 0.43, "grad_norm": 25.767139434814453, "learning_rate": 6.150305280920381e-06, "logits/chosen": -1.4617096185684204, "logits/rejected": -0.8789995312690735, "logps/chosen": -0.8968402147293091, "logps/rejected": -4.829245567321777, "loss": 0.9022, "odds_ratio_loss": 0.0539650022983551, "rewards/accuracies": 1.0, "rewards/chosen": -0.08968402445316315, "rewards/margins": 0.3932405710220337, "rewards/rejected": -0.48292461037635803, "sft_loss": 0.8968402147293091, "step": 5535 }, { "epoch": 0.43, "grad_norm": 8.61695671081543, "learning_rate": 6.144312008749168e-06, "logits/chosen": -1.2894651889801025, "logits/rejected": -0.5578645467758179, "logps/chosen": -0.9731871485710144, "logps/rejected": -2.8847477436065674, "loss": 0.9916, "odds_ratio_loss": 0.18387068808078766, "rewards/accuracies": 1.0, "rewards/chosen": -0.09731872379779816, "rewards/margins": 0.1911560595035553, "rewards/rejected": -0.28847479820251465, "sft_loss": 0.9731871485710144, "step": 5540 }, { "epoch": 0.43, "grad_norm": 18.54728889465332, "learning_rate": 6.138317001083505e-06, "logits/chosen": -1.400864839553833, "logits/rejected": -1.461357831954956, "logps/chosen": -0.6988080143928528, "logps/rejected": -4.805953025817871, "loss": 0.7053, "odds_ratio_loss": 0.06458644568920135, "rewards/accuracies": 1.0, "rewards/chosen": -0.06988079845905304, "rewards/margins": 0.4107145369052887, "rewards/rejected": -0.48059535026550293, "sft_loss": 0.6988080143928528, "step": 5545 }, { "epoch": 0.43, "grad_norm": 13.36061954498291, "learning_rate": 6.132320267015586e-06, "logits/chosen": -1.3715206384658813, "logits/rejected": -1.4396508932113647, "logps/chosen": -0.9623751640319824, "logps/rejected": -8.231966018676758, "loss": 0.9782, "odds_ratio_loss": 0.1581466943025589, "rewards/accuracies": 1.0, "rewards/chosen": -0.09623752534389496, "rewards/margins": 0.7269589900970459, "rewards/rejected": -0.823196530342102, "sft_loss": 0.9623751640319824, "step": 5550 }, { "epoch": 0.43, "grad_norm": 9.104194641113281, "learning_rate": 6.126321815640215e-06, "logits/chosen": -1.454085111618042, "logits/rejected": -0.7264108061790466, "logps/chosen": -0.8256096839904785, "logps/rejected": -6.499675750732422, "loss": 0.8393, "odds_ratio_loss": 0.13687697052955627, "rewards/accuracies": 1.0, "rewards/chosen": -0.08256097137928009, "rewards/margins": 0.5674066543579102, "rewards/rejected": -0.6499676704406738, "sft_loss": 0.8256096839904785, "step": 5555 }, { "epoch": 0.43, "grad_norm": 5.782400131225586, "learning_rate": 6.1203216560548076e-06, "logits/chosen": -1.333268165588379, "logits/rejected": -0.6403939127922058, "logps/chosen": -0.9271313548088074, "logps/rejected": -9.616800308227539, "loss": 0.9338, "odds_ratio_loss": 0.06707803159952164, "rewards/accuracies": 1.0, "rewards/chosen": -0.0927131399512291, "rewards/margins": 0.8689668774604797, "rewards/rejected": -0.9616800546646118, "sft_loss": 0.9271313548088074, "step": 5560 }, { "epoch": 0.43, "grad_norm": 14.873298645019531, "learning_rate": 6.114319797359367e-06, "logits/chosen": -1.2211421728134155, "logits/rejected": -1.1900720596313477, "logps/chosen": -1.1753790378570557, "logps/rejected": -5.317530155181885, "loss": 1.2106, "odds_ratio_loss": 0.3520776629447937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11753790080547333, "rewards/margins": 0.414215087890625, "rewards/rejected": -0.5317530035972595, "sft_loss": 1.1753790378570557, "step": 5565 }, { "epoch": 0.43, "grad_norm": 39.33988952636719, "learning_rate": 6.108316248656474e-06, "logits/chosen": -1.3350688219070435, "logits/rejected": -1.342743158340454, "logps/chosen": -0.8205651044845581, "logps/rejected": -3.8720860481262207, "loss": 0.8352, "odds_ratio_loss": 0.14677473902702332, "rewards/accuracies": 1.0, "rewards/chosen": -0.08205651491880417, "rewards/margins": 0.305152028799057, "rewards/rejected": -0.38720858097076416, "sft_loss": 0.8205651044845581, "step": 5570 }, { "epoch": 0.43, "grad_norm": 22.19184684753418, "learning_rate": 6.102311019051274e-06, "logits/chosen": -1.1321027278900146, "logits/rejected": -1.1962878704071045, "logps/chosen": -1.1018847227096558, "logps/rejected": -3.9455478191375732, "loss": 1.1527, "odds_ratio_loss": 0.5081930160522461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11018846929073334, "rewards/margins": 0.28436630964279175, "rewards/rejected": -0.3945547938346863, "sft_loss": 1.1018847227096558, "step": 5575 }, { "epoch": 0.43, "grad_norm": 9.592340469360352, "learning_rate": 6.096304117651457e-06, "logits/chosen": -1.4173847436904907, "logits/rejected": -1.5834704637527466, "logps/chosen": -1.1477190256118774, "logps/rejected": -9.274484634399414, "loss": 1.161, "odds_ratio_loss": 0.1331491768360138, "rewards/accuracies": 1.0, "rewards/chosen": -0.11477188766002655, "rewards/margins": 0.8126765489578247, "rewards/rejected": -0.9274484515190125, "sft_loss": 1.1477190256118774, "step": 5580 }, { "epoch": 0.43, "grad_norm": 8.10434627532959, "learning_rate": 6.090295553567254e-06, "logits/chosen": -1.3882704973220825, "logits/rejected": -1.2911784648895264, "logps/chosen": -1.1263264417648315, "logps/rejected": -2.811375856399536, "loss": 1.1581, "odds_ratio_loss": 0.31728702783584595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1126326471567154, "rewards/margins": 0.16850493848323822, "rewards/rejected": -0.2811375558376312, "sft_loss": 1.1263264417648315, "step": 5585 }, { "epoch": 0.43, "grad_norm": 4.900169372558594, "learning_rate": 6.084285335911415e-06, "logits/chosen": -1.4652645587921143, "logits/rejected": -1.4055920839309692, "logps/chosen": -1.030133605003357, "logps/rejected": -13.148248672485352, "loss": 1.0402, "odds_ratio_loss": 0.10082044452428818, "rewards/accuracies": 1.0, "rewards/chosen": -0.10301337391138077, "rewards/margins": 1.2118113040924072, "rewards/rejected": -1.3148248195648193, "sft_loss": 1.030133605003357, "step": 5590 }, { "epoch": 0.44, "grad_norm": 6.193985462188721, "learning_rate": 6.0782734737991965e-06, "logits/chosen": -1.4546271562576294, "logits/rejected": -1.2658965587615967, "logps/chosen": -1.2212754487991333, "logps/rejected": -9.927604675292969, "loss": 1.2485, "odds_ratio_loss": 0.2723880708217621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12212755531072617, "rewards/margins": 0.8706329464912415, "rewards/rejected": -0.9927604794502258, "sft_loss": 1.2212754487991333, "step": 5595 }, { "epoch": 0.44, "grad_norm": 15.977096557617188, "learning_rate": 6.072259976348353e-06, "logits/chosen": -1.3369700908660889, "logits/rejected": -1.2094776630401611, "logps/chosen": -0.8101118803024292, "logps/rejected": -2.6742115020751953, "loss": 0.827, "odds_ratio_loss": 0.16866515576839447, "rewards/accuracies": 1.0, "rewards/chosen": -0.08101119101047516, "rewards/margins": 0.18640998005867004, "rewards/rejected": -0.267421156167984, "sft_loss": 0.8101118803024292, "step": 5600 }, { "epoch": 0.44, "grad_norm": 5.28997278213501, "learning_rate": 6.066244852679117e-06, "logits/chosen": -1.4884275197982788, "logits/rejected": -1.1082899570465088, "logps/chosen": -1.3736873865127563, "logps/rejected": -6.204714775085449, "loss": 1.3932, "odds_ratio_loss": 0.1947067677974701, "rewards/accuracies": 1.0, "rewards/chosen": -0.13736873865127563, "rewards/margins": 0.4831027090549469, "rewards/rejected": -0.6204714179039001, "sft_loss": 1.3736873865127563, "step": 5605 }, { "epoch": 0.44, "grad_norm": 6.113433837890625, "learning_rate": 6.060228111914186e-06, "logits/chosen": -1.356977939605713, "logits/rejected": -0.836715042591095, "logps/chosen": -1.1353957653045654, "logps/rejected": -8.683342933654785, "loss": 1.1709, "odds_ratio_loss": 0.3551942706108093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11353959143161774, "rewards/margins": 0.754794716835022, "rewards/rejected": -0.8683342933654785, "sft_loss": 1.1353957653045654, "step": 5610 }, { "epoch": 0.44, "grad_norm": 6.144330024719238, "learning_rate": 6.054209763178711e-06, "logits/chosen": -1.475420355796814, "logits/rejected": -1.0890623331069946, "logps/chosen": -1.471876859664917, "logps/rejected": -4.547472953796387, "loss": 1.5125, "odds_ratio_loss": 0.4066528379917145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14718768000602722, "rewards/margins": 0.30755966901779175, "rewards/rejected": -0.45474734902381897, "sft_loss": 1.471876859664917, "step": 5615 }, { "epoch": 0.44, "grad_norm": 67.3975830078125, "learning_rate": 6.048189815600281e-06, "logits/chosen": -1.257453203201294, "logits/rejected": -1.1578747034072876, "logps/chosen": -1.1685110330581665, "logps/rejected": -13.333429336547852, "loss": 1.1729, "odds_ratio_loss": 0.043477244675159454, "rewards/accuracies": 1.0, "rewards/chosen": -0.11685110628604889, "rewards/margins": 1.2164918184280396, "rewards/rejected": -1.3333427906036377, "sft_loss": 1.1685110330581665, "step": 5620 }, { "epoch": 0.44, "grad_norm": 18.023799896240234, "learning_rate": 6.042168278308913e-06, "logits/chosen": -1.3366608619689941, "logits/rejected": -1.1345504522323608, "logps/chosen": -1.0113810300827026, "logps/rejected": -9.117487907409668, "loss": 1.0589, "odds_ratio_loss": 0.4751584529876709, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10113811492919922, "rewards/margins": 0.8106106519699097, "rewards/rejected": -0.9117487668991089, "sft_loss": 1.0113810300827026, "step": 5625 }, { "epoch": 0.44, "grad_norm": 10.662079811096191, "learning_rate": 6.0361451604370335e-06, "logits/chosen": -1.32015061378479, "logits/rejected": -1.0003557205200195, "logps/chosen": -1.0010217428207397, "logps/rejected": -3.2925896644592285, "loss": 1.0352, "odds_ratio_loss": 0.34152549505233765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10010218620300293, "rewards/margins": 0.22915680706501007, "rewards/rejected": -0.3292589783668518, "sft_loss": 1.0010217428207397, "step": 5630 }, { "epoch": 0.44, "grad_norm": 6.930639743804932, "learning_rate": 6.030120471119464e-06, "logits/chosen": -1.3336502313613892, "logits/rejected": -1.0396268367767334, "logps/chosen": -1.0695290565490723, "logps/rejected": -6.448616027832031, "loss": 1.1032, "odds_ratio_loss": 0.3368968069553375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10695289075374603, "rewards/margins": 0.537908673286438, "rewards/rejected": -0.6448616981506348, "sft_loss": 1.0695290565490723, "step": 5635 }, { "epoch": 0.44, "grad_norm": 7.778130531311035, "learning_rate": 6.02409421949341e-06, "logits/chosen": -1.3733274936676025, "logits/rejected": -1.3728525638580322, "logps/chosen": -0.7616759538650513, "logps/rejected": -9.624399185180664, "loss": 0.7976, "odds_ratio_loss": 0.359219491481781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07616759836673737, "rewards/margins": 0.8862722516059875, "rewards/rejected": -0.9624398946762085, "sft_loss": 0.7616759538650513, "step": 5640 }, { "epoch": 0.44, "grad_norm": 55.70398712158203, "learning_rate": 6.018066414698448e-06, "logits/chosen": -1.2639678716659546, "logits/rejected": -0.809512734413147, "logps/chosen": -1.1552722454071045, "logps/rejected": -17.194225311279297, "loss": 1.1611, "odds_ratio_loss": 0.05807274580001831, "rewards/accuracies": 1.0, "rewards/chosen": -0.11552723497152328, "rewards/margins": 1.6038951873779297, "rewards/rejected": -1.7194225788116455, "sft_loss": 1.1552722454071045, "step": 5645 }, { "epoch": 0.44, "grad_norm": 29.567472457885742, "learning_rate": 6.012037065876509e-06, "logits/chosen": -1.5392589569091797, "logits/rejected": -1.446018934249878, "logps/chosen": -1.339097261428833, "logps/rejected": -3.9835457801818848, "loss": 1.4159, "odds_ratio_loss": 0.768239438533783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13390973210334778, "rewards/margins": 0.26444482803344727, "rewards/rejected": -0.39835453033447266, "sft_loss": 1.339097261428833, "step": 5650 }, { "epoch": 0.44, "grad_norm": 7.155758380889893, "learning_rate": 6.006006182171868e-06, "logits/chosen": -1.4382535219192505, "logits/rejected": -1.5007126331329346, "logps/chosen": -0.9383573532104492, "logps/rejected": -17.310665130615234, "loss": 0.9634, "odds_ratio_loss": 0.25030335783958435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0938357338309288, "rewards/margins": 1.6372311115264893, "rewards/rejected": -1.7310667037963867, "sft_loss": 0.9383573532104492, "step": 5655 }, { "epoch": 0.44, "grad_norm": 18.916534423828125, "learning_rate": 5.999973772731121e-06, "logits/chosen": -1.0211951732635498, "logits/rejected": -1.2356466054916382, "logps/chosen": -0.9551981091499329, "logps/rejected": -7.499606132507324, "loss": 0.9926, "odds_ratio_loss": 0.3744484484195709, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09551980346441269, "rewards/margins": 0.6544408202171326, "rewards/rejected": -0.7499606609344482, "sft_loss": 0.9551981091499329, "step": 5660 }, { "epoch": 0.44, "grad_norm": 1061.22900390625, "learning_rate": 5.993939846703189e-06, "logits/chosen": -1.481950283050537, "logits/rejected": -1.5191516876220703, "logps/chosen": -2.560549736022949, "logps/rejected": -10.782699584960938, "loss": 2.6494, "odds_ratio_loss": 0.8887947797775269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.25605496764183044, "rewards/margins": 0.8222150802612305, "rewards/rejected": -1.0782700777053833, "sft_loss": 2.560549736022949, "step": 5665 }, { "epoch": 0.44, "grad_norm": 12.319400787353516, "learning_rate": 5.987904413239284e-06, "logits/chosen": -1.3272775411605835, "logits/rejected": -1.3253281116485596, "logps/chosen": -0.9079964756965637, "logps/rejected": -6.848818778991699, "loss": 0.9143, "odds_ratio_loss": 0.06282417476177216, "rewards/accuracies": 1.0, "rewards/chosen": -0.09079965204000473, "rewards/margins": 0.594082236289978, "rewards/rejected": -0.6848819851875305, "sft_loss": 0.9079964756965637, "step": 5670 }, { "epoch": 0.44, "grad_norm": 7.039941310882568, "learning_rate": 5.981867481492906e-06, "logits/chosen": -1.4510374069213867, "logits/rejected": -1.0965907573699951, "logps/chosen": -0.9913382530212402, "logps/rejected": -12.328478813171387, "loss": 1.0036, "odds_ratio_loss": 0.12262825667858124, "rewards/accuracies": 1.0, "rewards/chosen": -0.09913383424282074, "rewards/margins": 1.1337140798568726, "rewards/rejected": -1.2328479290008545, "sft_loss": 0.9913382530212402, "step": 5675 }, { "epoch": 0.44, "grad_norm": 18.226097106933594, "learning_rate": 5.97582906061983e-06, "logits/chosen": -1.4995019435882568, "logits/rejected": -1.8552892208099365, "logps/chosen": -0.7538286447525024, "logps/rejected": -6.817639350891113, "loss": 0.7843, "odds_ratio_loss": 0.3042234778404236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07538286596536636, "rewards/margins": 0.6063810586929321, "rewards/rejected": -0.6817639470100403, "sft_loss": 0.7538286447525024, "step": 5680 }, { "epoch": 0.44, "grad_norm": 83.86195373535156, "learning_rate": 5.969789159778086e-06, "logits/chosen": -1.4456707239151, "logits/rejected": -1.1688287258148193, "logps/chosen": -1.2338379621505737, "logps/rejected": -7.469872951507568, "loss": 1.2458, "odds_ratio_loss": 0.11980722099542618, "rewards/accuracies": 1.0, "rewards/chosen": -0.12338379770517349, "rewards/margins": 0.6236035227775574, "rewards/rejected": -0.7469873428344727, "sft_loss": 1.2338379621505737, "step": 5685 }, { "epoch": 0.44, "grad_norm": 6.135641574859619, "learning_rate": 5.963747788127954e-06, "logits/chosen": -1.4537721872329712, "logits/rejected": -1.4107811450958252, "logps/chosen": -1.9091577529907227, "logps/rejected": -6.800889015197754, "loss": 1.926, "odds_ratio_loss": 0.16891498863697052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1909157782793045, "rewards/margins": 0.48917311429977417, "rewards/rejected": -0.6800888776779175, "sft_loss": 1.9091577529907227, "step": 5690 }, { "epoch": 0.44, "grad_norm": 17.577470779418945, "learning_rate": 5.9577049548319385e-06, "logits/chosen": -1.1830053329467773, "logits/rejected": -0.9701796770095825, "logps/chosen": -1.0823400020599365, "logps/rejected": -12.730480194091797, "loss": 1.085, "odds_ratio_loss": 0.02667110227048397, "rewards/accuracies": 1.0, "rewards/chosen": -0.1082339882850647, "rewards/margins": 1.1648141145706177, "rewards/rejected": -1.2730480432510376, "sft_loss": 1.0823400020599365, "step": 5695 }, { "epoch": 0.44, "grad_norm": 5.481243133544922, "learning_rate": 5.951660669054764e-06, "logits/chosen": -1.2399911880493164, "logits/rejected": -0.9875618815422058, "logps/chosen": -0.9826416969299316, "logps/rejected": -2.611884832382202, "loss": 1.0209, "odds_ratio_loss": 0.38236701488494873, "rewards/accuracies": 1.0, "rewards/chosen": -0.0982641652226448, "rewards/margins": 0.16292431950569153, "rewards/rejected": -0.2611885070800781, "sft_loss": 0.9826416969299316, "step": 5700 }, { "epoch": 0.44, "grad_norm": 20.629961013793945, "learning_rate": 5.945614939963358e-06, "logits/chosen": -1.4003746509552002, "logits/rejected": -0.9283599853515625, "logps/chosen": -1.3317302465438843, "logps/rejected": -7.419297695159912, "loss": 1.3534, "odds_ratio_loss": 0.21676786243915558, "rewards/accuracies": 1.0, "rewards/chosen": -0.13317301869392395, "rewards/margins": 0.6087567806243896, "rewards/rejected": -0.741929829120636, "sft_loss": 1.3317302465438843, "step": 5705 }, { "epoch": 0.44, "grad_norm": 16.150772094726562, "learning_rate": 5.939567776726834e-06, "logits/chosen": -1.344822883605957, "logits/rejected": -1.3809988498687744, "logps/chosen": -1.1117184162139893, "logps/rejected": -4.634824752807617, "loss": 1.1244, "odds_ratio_loss": 0.1271774023771286, "rewards/accuracies": 1.0, "rewards/chosen": -0.11117184162139893, "rewards/margins": 0.3523106276988983, "rewards/rejected": -0.46348246932029724, "sft_loss": 1.1117184162139893, "step": 5710 }, { "epoch": 0.44, "grad_norm": 5.546013355255127, "learning_rate": 5.933519188516485e-06, "logits/chosen": -1.3635753393173218, "logits/rejected": -1.1241014003753662, "logps/chosen": -0.9175945520401001, "logps/rejected": -2.8002352714538574, "loss": 0.9335, "odds_ratio_loss": 0.15915009379386902, "rewards/accuracies": 1.0, "rewards/chosen": -0.09175945818424225, "rewards/margins": 0.18826408684253693, "rewards/rejected": -0.2800235450267792, "sft_loss": 0.9175945520401001, "step": 5715 }, { "epoch": 0.44, "grad_norm": 5.130252838134766, "learning_rate": 5.927469184505762e-06, "logits/chosen": -1.347988247871399, "logits/rejected": -1.227783203125, "logps/chosen": -2.2398881912231445, "logps/rejected": -10.506120681762695, "loss": 2.2613, "odds_ratio_loss": 0.21373744308948517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22398881614208221, "rewards/margins": 0.826623260974884, "rewards/rejected": -1.0506120920181274, "sft_loss": 2.2398881912231445, "step": 5720 }, { "epoch": 0.45, "grad_norm": 6.612664222717285, "learning_rate": 5.921417773870266e-06, "logits/chosen": -1.4397814273834229, "logits/rejected": -1.322584629058838, "logps/chosen": -0.8262338638305664, "logps/rejected": -5.6490397453308105, "loss": 0.8352, "odds_ratio_loss": 0.0899687260389328, "rewards/accuracies": 1.0, "rewards/chosen": -0.08262337744235992, "rewards/margins": 0.48228058218955994, "rewards/rejected": -0.564903974533081, "sft_loss": 0.8262338638305664, "step": 5725 }, { "epoch": 0.45, "grad_norm": 7.2765727043151855, "learning_rate": 5.915364965787728e-06, "logits/chosen": -1.3210567235946655, "logits/rejected": -1.1719744205474854, "logps/chosen": -1.021377444267273, "logps/rejected": -2.6403896808624268, "loss": 1.0411, "odds_ratio_loss": 0.19730141758918762, "rewards/accuracies": 1.0, "rewards/chosen": -0.1021377444267273, "rewards/margins": 0.16190123558044434, "rewards/rejected": -0.26403898000717163, "sft_loss": 1.021377444267273, "step": 5730 }, { "epoch": 0.45, "grad_norm": 7.26978063583374, "learning_rate": 5.909310769437999e-06, "logits/chosen": -1.3003871440887451, "logits/rejected": -0.99261873960495, "logps/chosen": -0.7958885431289673, "logps/rejected": -9.100659370422363, "loss": 0.8235, "odds_ratio_loss": 0.2762227952480316, "rewards/accuracies": 1.0, "rewards/chosen": -0.07958885282278061, "rewards/margins": 0.8304770588874817, "rewards/rejected": -0.9100659489631653, "sft_loss": 0.7958885431289673, "step": 5735 }, { "epoch": 0.45, "grad_norm": 39.95994186401367, "learning_rate": 5.903255194003037e-06, "logits/chosen": -1.4754865169525146, "logits/rejected": -1.2973945140838623, "logps/chosen": -0.8494400978088379, "logps/rejected": -4.790776252746582, "loss": 0.8841, "odds_ratio_loss": 0.3469342589378357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08494400978088379, "rewards/margins": 0.39413362741470337, "rewards/rejected": -0.47907763719558716, "sft_loss": 0.8494400978088379, "step": 5740 }, { "epoch": 0.45, "grad_norm": 5.832455635070801, "learning_rate": 5.897198248666893e-06, "logits/chosen": -1.30601966381073, "logits/rejected": -0.9545402526855469, "logps/chosen": -1.1280930042266846, "logps/rejected": -3.4375717639923096, "loss": 1.191, "odds_ratio_loss": 0.6293038725852966, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11280930042266846, "rewards/margins": 0.2309478521347046, "rewards/rejected": -0.34375715255737305, "sft_loss": 1.1280930042266846, "step": 5745 }, { "epoch": 0.45, "grad_norm": 9.348652839660645, "learning_rate": 5.891139942615693e-06, "logits/chosen": -1.252437949180603, "logits/rejected": -0.779747724533081, "logps/chosen": -0.9001814723014832, "logps/rejected": -3.424330472946167, "loss": 0.9164, "odds_ratio_loss": 0.16236189007759094, "rewards/accuracies": 1.0, "rewards/chosen": -0.0900181457400322, "rewards/margins": 0.25241488218307495, "rewards/rejected": -0.34243303537368774, "sft_loss": 0.9001814723014832, "step": 5750 }, { "epoch": 0.45, "grad_norm": 15.176481246948242, "learning_rate": 5.8850802850376245e-06, "logits/chosen": -1.0830439329147339, "logits/rejected": -0.5726959109306335, "logps/chosen": -0.9458778500556946, "logps/rejected": -3.5730719566345215, "loss": 0.9607, "odds_ratio_loss": 0.14806696772575378, "rewards/accuracies": 1.0, "rewards/chosen": -0.0945877879858017, "rewards/margins": 0.26271945238113403, "rewards/rejected": -0.3573072552680969, "sft_loss": 0.9458778500556946, "step": 5755 }, { "epoch": 0.45, "grad_norm": 7.540465354919434, "learning_rate": 5.87901928512293e-06, "logits/chosen": -1.3506877422332764, "logits/rejected": -1.2902274131774902, "logps/chosen": -1.1145894527435303, "logps/rejected": -3.1862120628356934, "loss": 1.1339, "odds_ratio_loss": 0.19300493597984314, "rewards/accuracies": 1.0, "rewards/chosen": -0.11145894229412079, "rewards/margins": 0.2071622908115387, "rewards/rejected": -0.3186212182044983, "sft_loss": 1.1145894527435303, "step": 5760 }, { "epoch": 0.45, "grad_norm": 29.251178741455078, "learning_rate": 5.872956952063885e-06, "logits/chosen": -1.3513835668563843, "logits/rejected": -1.3920825719833374, "logps/chosen": -0.811087429523468, "logps/rejected": -6.7181830406188965, "loss": 0.8188, "odds_ratio_loss": 0.07711207121610641, "rewards/accuracies": 1.0, "rewards/chosen": -0.08110874146223068, "rewards/margins": 0.5907095670700073, "rewards/rejected": -0.6718182563781738, "sft_loss": 0.811087429523468, "step": 5765 }, { "epoch": 0.45, "grad_norm": 8.553319931030273, "learning_rate": 5.866893295054788e-06, "logits/chosen": -1.4405219554901123, "logits/rejected": -1.3422691822052002, "logps/chosen": -1.048010230064392, "logps/rejected": -8.861117362976074, "loss": 1.0655, "odds_ratio_loss": 0.17444069683551788, "rewards/accuracies": 1.0, "rewards/chosen": -0.10480103641748428, "rewards/margins": 0.7813106775283813, "rewards/rejected": -0.8861117362976074, "sft_loss": 1.048010230064392, "step": 5770 }, { "epoch": 0.45, "grad_norm": 5.439850807189941, "learning_rate": 5.860828323291943e-06, "logits/chosen": -1.3938419818878174, "logits/rejected": -1.003071665763855, "logps/chosen": -0.8749563097953796, "logps/rejected": -10.783254623413086, "loss": 0.8823, "odds_ratio_loss": 0.07309209555387497, "rewards/accuracies": 1.0, "rewards/chosen": -0.08749563992023468, "rewards/margins": 0.9908298254013062, "rewards/rejected": -1.0783255100250244, "sft_loss": 0.8749563097953796, "step": 5775 }, { "epoch": 0.45, "grad_norm": 8.660329818725586, "learning_rate": 5.854762045973652e-06, "logits/chosen": -1.3360105752944946, "logits/rejected": -1.2421538829803467, "logps/chosen": -1.4898730516433716, "logps/rejected": -5.747361660003662, "loss": 1.5518, "odds_ratio_loss": 0.6197172403335571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1489873081445694, "rewards/margins": 0.42574888467788696, "rewards/rejected": -0.5747361779212952, "sft_loss": 1.4898730516433716, "step": 5780 }, { "epoch": 0.45, "grad_norm": 6.601789474487305, "learning_rate": 5.8486944723001926e-06, "logits/chosen": -1.5374510288238525, "logits/rejected": -1.1397758722305298, "logps/chosen": -1.0105637311935425, "logps/rejected": -3.0968735218048096, "loss": 1.0427, "odds_ratio_loss": 0.3209215998649597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10105638206005096, "rewards/margins": 0.2086309939622879, "rewards/rejected": -0.30968737602233887, "sft_loss": 1.0105637311935425, "step": 5785 }, { "epoch": 0.45, "grad_norm": 6.6358160972595215, "learning_rate": 5.842625611473811e-06, "logits/chosen": -1.3155019283294678, "logits/rejected": -1.1303455829620361, "logps/chosen": -0.8606014251708984, "logps/rejected": -10.288931846618652, "loss": 0.8908, "odds_ratio_loss": 0.3021547198295593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08606014400720596, "rewards/margins": 0.9428330659866333, "rewards/rejected": -1.028893232345581, "sft_loss": 0.8606014251708984, "step": 5790 }, { "epoch": 0.45, "grad_norm": 19.110700607299805, "learning_rate": 5.836555472698707e-06, "logits/chosen": -1.28428316116333, "logits/rejected": -1.1632674932479858, "logps/chosen": -0.964084804058075, "logps/rejected": -10.772963523864746, "loss": 0.9645, "odds_ratio_loss": 0.004020442720502615, "rewards/accuracies": 1.0, "rewards/chosen": -0.09640847891569138, "rewards/margins": 0.9808878898620605, "rewards/rejected": -1.077296495437622, "sft_loss": 0.964084804058075, "step": 5795 }, { "epoch": 0.45, "grad_norm": 9.711421966552734, "learning_rate": 5.830484065181015e-06, "logits/chosen": -1.2102570533752441, "logits/rejected": -1.208566665649414, "logps/chosen": -1.4164773225784302, "logps/rejected": -2.8852813243865967, "loss": 1.4742, "odds_ratio_loss": 0.5771785974502563, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.14164772629737854, "rewards/margins": 0.1468804031610489, "rewards/rejected": -0.28852811455726624, "sft_loss": 1.4164773225784302, "step": 5800 }, { "epoch": 0.45, "grad_norm": 47.696739196777344, "learning_rate": 5.824411398128795e-06, "logits/chosen": -1.4307831525802612, "logits/rejected": -1.4838054180145264, "logps/chosen": -0.7359696626663208, "logps/rejected": -5.552895545959473, "loss": 0.7465, "odds_ratio_loss": 0.10489847511053085, "rewards/accuracies": 1.0, "rewards/chosen": -0.07359696924686432, "rewards/margins": 0.4816926121711731, "rewards/rejected": -0.5552895069122314, "sft_loss": 0.7359696626663208, "step": 5805 }, { "epoch": 0.45, "grad_norm": 23.042871475219727, "learning_rate": 5.81833748075202e-06, "logits/chosen": -1.394331693649292, "logits/rejected": -0.6417025327682495, "logps/chosen": -1.1504552364349365, "logps/rejected": -2.6900577545166016, "loss": 1.1776, "odds_ratio_loss": 0.2714696228504181, "rewards/accuracies": 1.0, "rewards/chosen": -0.11504554748535156, "rewards/margins": 0.15396027266979218, "rewards/rejected": -0.26900583505630493, "sft_loss": 1.1504552364349365, "step": 5810 }, { "epoch": 0.45, "grad_norm": 12.432936668395996, "learning_rate": 5.812262322262554e-06, "logits/chosen": -1.3173372745513916, "logits/rejected": -1.1222119331359863, "logps/chosen": -0.9128435850143433, "logps/rejected": -1.756940484046936, "loss": 0.9457, "odds_ratio_loss": 0.32820624113082886, "rewards/accuracies": 1.0, "rewards/chosen": -0.0912843644618988, "rewards/margins": 0.0844096764922142, "rewards/rejected": -0.1756940335035324, "sft_loss": 0.9128435850143433, "step": 5815 }, { "epoch": 0.45, "grad_norm": 7.906182765960693, "learning_rate": 5.806185931874148e-06, "logits/chosen": -1.3855034112930298, "logits/rejected": -0.9745451211929321, "logps/chosen": -1.174140453338623, "logps/rejected": -10.40697193145752, "loss": 1.1841, "odds_ratio_loss": 0.09939200431108475, "rewards/accuracies": 1.0, "rewards/chosen": -0.11741403490304947, "rewards/margins": 0.9232832193374634, "rewards/rejected": -1.0406970977783203, "sft_loss": 1.174140453338623, "step": 5820 }, { "epoch": 0.45, "grad_norm": 17.768505096435547, "learning_rate": 5.800108318802418e-06, "logits/chosen": -1.29469895362854, "logits/rejected": -1.390100121498108, "logps/chosen": -1.0928549766540527, "logps/rejected": -5.852885723114014, "loss": 1.1167, "odds_ratio_loss": 0.2388814389705658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10928548872470856, "rewards/margins": 0.47600308060646057, "rewards/rejected": -0.5852886438369751, "sft_loss": 1.0928549766540527, "step": 5825 }, { "epoch": 0.45, "grad_norm": 6.824836730957031, "learning_rate": 5.7940294922648365e-06, "logits/chosen": -1.4115650653839111, "logits/rejected": -1.1731384992599487, "logps/chosen": -1.6221885681152344, "logps/rejected": -6.723604679107666, "loss": 1.6568, "odds_ratio_loss": 0.34624502062797546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1622188687324524, "rewards/margins": 0.5101416110992432, "rewards/rejected": -0.6723604798316956, "sft_loss": 1.6221885681152344, "step": 5830 }, { "epoch": 0.45, "grad_norm": 10.766157150268555, "learning_rate": 5.787949461480717e-06, "logits/chosen": -1.1654672622680664, "logits/rejected": -1.4366929531097412, "logps/chosen": -1.0175927877426147, "logps/rejected": -9.682588577270508, "loss": 1.0291, "odds_ratio_loss": 0.11466507613658905, "rewards/accuracies": 1.0, "rewards/chosen": -0.10175929218530655, "rewards/margins": 0.8664995431900024, "rewards/rejected": -0.9682588577270508, "sft_loss": 1.0175927877426147, "step": 5835 }, { "epoch": 0.45, "grad_norm": 6.221446514129639, "learning_rate": 5.781868235671197e-06, "logits/chosen": -1.406813383102417, "logits/rejected": -0.8143359422683716, "logps/chosen": -0.9683381915092468, "logps/rejected": -4.046400547027588, "loss": 0.9999, "odds_ratio_loss": 0.31571096181869507, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09683381766080856, "rewards/margins": 0.30780622363090515, "rewards/rejected": -0.4046400487422943, "sft_loss": 0.9683381915092468, "step": 5840 }, { "epoch": 0.45, "grad_norm": 14.350464820861816, "learning_rate": 5.775785824059228e-06, "logits/chosen": -1.3366856575012207, "logits/rejected": -1.1903644800186157, "logps/chosen": -0.9601635932922363, "logps/rejected": -1.6902868747711182, "loss": 0.9953, "odds_ratio_loss": 0.35140281915664673, "rewards/accuracies": 1.0, "rewards/chosen": -0.09601635485887527, "rewards/margins": 0.0730123296380043, "rewards/rejected": -0.16902866959571838, "sft_loss": 0.9601635932922363, "step": 5845 }, { "epoch": 0.46, "grad_norm": 7.248025417327881, "learning_rate": 5.7697022358695595e-06, "logits/chosen": -1.3718547821044922, "logits/rejected": -1.3155597448349, "logps/chosen": -0.9688811302185059, "logps/rejected": -6.297783851623535, "loss": 0.9867, "odds_ratio_loss": 0.17819690704345703, "rewards/accuracies": 1.0, "rewards/chosen": -0.09688811004161835, "rewards/margins": 0.5328903794288635, "rewards/rejected": -0.6297784447669983, "sft_loss": 0.9688811302185059, "step": 5850 }, { "epoch": 0.46, "grad_norm": 8.421185493469238, "learning_rate": 5.763617480328725e-06, "logits/chosen": -1.3368728160858154, "logits/rejected": -1.1470426321029663, "logps/chosen": -1.2153146266937256, "logps/rejected": -8.046792030334473, "loss": 1.2225, "odds_ratio_loss": 0.07202035933732986, "rewards/accuracies": 1.0, "rewards/chosen": -0.12153146415948868, "rewards/margins": 0.6831477284431458, "rewards/rejected": -0.8046790957450867, "sft_loss": 1.2153146266937256, "step": 5855 }, { "epoch": 0.46, "grad_norm": 469.78997802734375, "learning_rate": 5.757531566665029e-06, "logits/chosen": -1.4922001361846924, "logits/rejected": -1.472839117050171, "logps/chosen": -2.2965407371520996, "logps/rejected": -17.437091827392578, "loss": 2.2972, "odds_ratio_loss": 0.007058045826852322, "rewards/accuracies": 1.0, "rewards/chosen": -0.22965407371520996, "rewards/margins": 1.5140551328659058, "rewards/rejected": -1.7437093257904053, "sft_loss": 2.2965407371520996, "step": 5860 }, { "epoch": 0.46, "grad_norm": 7.139679431915283, "learning_rate": 5.751444504108532e-06, "logits/chosen": -1.504317045211792, "logits/rejected": -1.1596872806549072, "logps/chosen": -1.9202678203582764, "logps/rejected": -15.623100280761719, "loss": 1.9275, "odds_ratio_loss": 0.07225911319255829, "rewards/accuracies": 1.0, "rewards/chosen": -0.1920267790555954, "rewards/margins": 1.3702832460403442, "rewards/rejected": -1.562309980392456, "sft_loss": 1.9202678203582764, "step": 5865 }, { "epoch": 0.46, "grad_norm": 9.28328800201416, "learning_rate": 5.745356301891036e-06, "logits/chosen": -1.3105159997940063, "logits/rejected": -0.9987856149673462, "logps/chosen": -0.8882701992988586, "logps/rejected": -6.670027256011963, "loss": 0.8993, "odds_ratio_loss": 0.11003986746072769, "rewards/accuracies": 1.0, "rewards/chosen": -0.08882702887058258, "rewards/margins": 0.5781757235527039, "rewards/rejected": -0.66700279712677, "sft_loss": 0.8882701992988586, "step": 5870 }, { "epoch": 0.46, "grad_norm": 14.096705436706543, "learning_rate": 5.739266969246077e-06, "logits/chosen": -1.4479315280914307, "logits/rejected": -1.2866063117980957, "logps/chosen": -1.385507345199585, "logps/rejected": -4.894179344177246, "loss": 1.3972, "odds_ratio_loss": 0.11661858856678009, "rewards/accuracies": 1.0, "rewards/chosen": -0.13855072855949402, "rewards/margins": 0.35086721181869507, "rewards/rejected": -0.4894179403781891, "sft_loss": 1.385507345199585, "step": 5875 }, { "epoch": 0.46, "grad_norm": 6.6659746170043945, "learning_rate": 5.733176515408896e-06, "logits/chosen": -1.3389222621917725, "logits/rejected": -0.9701216816902161, "logps/chosen": -1.0844879150390625, "logps/rejected": -11.632673263549805, "loss": 1.0921, "odds_ratio_loss": 0.07584533095359802, "rewards/accuracies": 1.0, "rewards/chosen": -0.10844878852367401, "rewards/margins": 1.0548183917999268, "rewards/rejected": -1.1632672548294067, "sft_loss": 1.0844879150390625, "step": 5880 }, { "epoch": 0.46, "grad_norm": 195.03273010253906, "learning_rate": 5.727084949616443e-06, "logits/chosen": -1.4234966039657593, "logits/rejected": -1.4936151504516602, "logps/chosen": -1.540858507156372, "logps/rejected": -5.37454891204834, "loss": 1.5697, "odds_ratio_loss": 0.28838053345680237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15408584475517273, "rewards/margins": 0.3833690583705902, "rewards/rejected": -0.5374549627304077, "sft_loss": 1.540858507156372, "step": 5885 }, { "epoch": 0.46, "grad_norm": 7.396588325500488, "learning_rate": 5.720992281107347e-06, "logits/chosen": -1.4567421674728394, "logits/rejected": -0.7531291842460632, "logps/chosen": -1.057483196258545, "logps/rejected": -1.95417058467865, "loss": 1.1194, "odds_ratio_loss": 0.6192514300346375, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10574833303689957, "rewards/margins": 0.08966872841119766, "rewards/rejected": -0.19541704654693604, "sft_loss": 1.057483196258545, "step": 5890 }, { "epoch": 0.46, "grad_norm": 17.07513427734375, "learning_rate": 5.714898519121919e-06, "logits/chosen": -1.527635097503662, "logits/rejected": -1.0351780652999878, "logps/chosen": -0.8463979959487915, "logps/rejected": -3.124258518218994, "loss": 0.876, "odds_ratio_loss": 0.2964860796928406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08463980257511139, "rewards/margins": 0.22778606414794922, "rewards/rejected": -0.3124258816242218, "sft_loss": 0.8463979959487915, "step": 5895 }, { "epoch": 0.46, "grad_norm": 21.76619529724121, "learning_rate": 5.708803672902119e-06, "logits/chosen": -1.3705116510391235, "logits/rejected": -1.0004701614379883, "logps/chosen": -0.9214539527893066, "logps/rejected": -3.389974594116211, "loss": 0.937, "odds_ratio_loss": 0.15547111630439758, "rewards/accuracies": 1.0, "rewards/chosen": -0.0921453982591629, "rewards/margins": 0.2468520700931549, "rewards/rejected": -0.338997483253479, "sft_loss": 0.9214539527893066, "step": 5900 }, { "epoch": 0.46, "grad_norm": 10.555737495422363, "learning_rate": 5.7027077516915544e-06, "logits/chosen": -1.3661435842514038, "logits/rejected": -1.0756629705429077, "logps/chosen": -0.8574056625366211, "logps/rejected": -2.659536600112915, "loss": 0.8867, "odds_ratio_loss": 0.2932834029197693, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08574056625366211, "rewards/margins": 0.180213063955307, "rewards/rejected": -0.2659536302089691, "sft_loss": 0.8574056625366211, "step": 5905 }, { "epoch": 0.46, "grad_norm": 8.062660217285156, "learning_rate": 5.6966107647354655e-06, "logits/chosen": -1.5043424367904663, "logits/rejected": -1.2339164018630981, "logps/chosen": -1.0549910068511963, "logps/rejected": -2.0557339191436768, "loss": 1.081, "odds_ratio_loss": 0.26006263494491577, "rewards/accuracies": 1.0, "rewards/chosen": -0.10549911111593246, "rewards/margins": 0.10007427632808685, "rewards/rejected": -0.20557339489459991, "sft_loss": 1.0549910068511963, "step": 5910 }, { "epoch": 0.46, "grad_norm": 38.601078033447266, "learning_rate": 5.690512721280707e-06, "logits/chosen": -1.3967393636703491, "logits/rejected": -0.752194881439209, "logps/chosen": -1.0708661079406738, "logps/rejected": -1.977912187576294, "loss": 1.1081, "odds_ratio_loss": 0.3719109892845154, "rewards/accuracies": 1.0, "rewards/chosen": -0.10708661377429962, "rewards/margins": 0.09070460498332977, "rewards/rejected": -0.1977912038564682, "sft_loss": 1.0708661079406738, "step": 5915 }, { "epoch": 0.46, "grad_norm": 24.084102630615234, "learning_rate": 5.684413630575737e-06, "logits/chosen": -1.4691526889801025, "logits/rejected": -1.2201991081237793, "logps/chosen": -0.8691366314888, "logps/rejected": -2.033484935760498, "loss": 0.9264, "odds_ratio_loss": 0.5723375678062439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08691366016864777, "rewards/margins": 0.11643485724925995, "rewards/rejected": -0.20334851741790771, "sft_loss": 0.8691366314888, "step": 5920 }, { "epoch": 0.46, "grad_norm": 11.225747108459473, "learning_rate": 5.678313501870599e-06, "logits/chosen": -1.2836955785751343, "logits/rejected": -1.284705400466919, "logps/chosen": -0.8444737195968628, "logps/rejected": -4.509748458862305, "loss": 0.8763, "odds_ratio_loss": 0.31828147172927856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08444737643003464, "rewards/margins": 0.3665274977684021, "rewards/rejected": -0.45097485184669495, "sft_loss": 0.8444737195968628, "step": 5925 }, { "epoch": 0.46, "grad_norm": 10.136873245239258, "learning_rate": 5.672212344416912e-06, "logits/chosen": -1.4180656671524048, "logits/rejected": -1.0987080335617065, "logps/chosen": -0.887566864490509, "logps/rejected": -8.7537841796875, "loss": 0.9074, "odds_ratio_loss": 0.1985785961151123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08875668048858643, "rewards/margins": 0.7866216897964478, "rewards/rejected": -0.875378429889679, "sft_loss": 0.887566864490509, "step": 5930 }, { "epoch": 0.46, "grad_norm": 8.497200965881348, "learning_rate": 5.666110167467858e-06, "logits/chosen": -1.3332171440124512, "logits/rejected": -1.3454488515853882, "logps/chosen": -1.0007665157318115, "logps/rejected": -4.486929416656494, "loss": 1.0361, "odds_ratio_loss": 0.3537340760231018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10007666051387787, "rewards/margins": 0.34861627221107483, "rewards/rejected": -0.4486928880214691, "sft_loss": 1.0007665157318115, "step": 5935 }, { "epoch": 0.46, "grad_norm": 13.362495422363281, "learning_rate": 5.6600069802781634e-06, "logits/chosen": -1.440659999847412, "logits/rejected": -0.8654731512069702, "logps/chosen": -0.9633086323738098, "logps/rejected": -3.2738614082336426, "loss": 0.9943, "odds_ratio_loss": 0.30953115224838257, "rewards/accuracies": 1.0, "rewards/chosen": -0.09633086621761322, "rewards/margins": 0.23105528950691223, "rewards/rejected": -0.32738617062568665, "sft_loss": 0.9633086323738098, "step": 5940 }, { "epoch": 0.46, "grad_norm": 25.45018768310547, "learning_rate": 5.6539027921040836e-06, "logits/chosen": -1.2397971153259277, "logits/rejected": -0.7677778005599976, "logps/chosen": -0.9034073948860168, "logps/rejected": -4.984935760498047, "loss": 0.9151, "odds_ratio_loss": 0.11679281294345856, "rewards/accuracies": 1.0, "rewards/chosen": -0.0903407484292984, "rewards/margins": 0.40815287828445435, "rewards/rejected": -0.49849358201026917, "sft_loss": 0.9034073948860168, "step": 5945 }, { "epoch": 0.46, "grad_norm": 5.295702934265137, "learning_rate": 5.647797612203399e-06, "logits/chosen": -1.394932508468628, "logits/rejected": -1.2857708930969238, "logps/chosen": -0.8075026273727417, "logps/rejected": -8.137491226196289, "loss": 0.8361, "odds_ratio_loss": 0.2856552302837372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08075026422739029, "rewards/margins": 0.7329989671707153, "rewards/rejected": -0.8137491941452026, "sft_loss": 0.8075026273727417, "step": 5950 }, { "epoch": 0.46, "grad_norm": 5.818515300750732, "learning_rate": 5.641691449835387e-06, "logits/chosen": -1.3131741285324097, "logits/rejected": -1.2235875129699707, "logps/chosen": -0.9159797430038452, "logps/rejected": -6.6266608238220215, "loss": 0.9419, "odds_ratio_loss": 0.258743017911911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09159798920154572, "rewards/margins": 0.5710681676864624, "rewards/rejected": -0.6626661419868469, "sft_loss": 0.9159797430038452, "step": 5955 }, { "epoch": 0.46, "grad_norm": 17.876649856567383, "learning_rate": 5.635584314260818e-06, "logits/chosen": -1.0992735624313354, "logits/rejected": -1.378377079963684, "logps/chosen": -0.7821237444877625, "logps/rejected": -14.559664726257324, "loss": 0.7915, "odds_ratio_loss": 0.0936480313539505, "rewards/accuracies": 1.0, "rewards/chosen": -0.07821237295866013, "rewards/margins": 1.3777542114257812, "rewards/rejected": -1.455966591835022, "sft_loss": 0.7821237444877625, "step": 5960 }, { "epoch": 0.46, "grad_norm": 5.123594284057617, "learning_rate": 5.629476214741941e-06, "logits/chosen": -1.41877281665802, "logits/rejected": -0.8851032257080078, "logps/chosen": -1.00931715965271, "logps/rejected": -9.06434440612793, "loss": 1.0301, "odds_ratio_loss": 0.20743639767169952, "rewards/accuracies": 1.0, "rewards/chosen": -0.10093171894550323, "rewards/margins": 0.8055028915405273, "rewards/rejected": -0.9064345359802246, "sft_loss": 1.00931715965271, "step": 5965 }, { "epoch": 0.46, "grad_norm": 21.803184509277344, "learning_rate": 5.6233671605424625e-06, "logits/chosen": -1.2566730976104736, "logits/rejected": -1.1595937013626099, "logps/chosen": -0.8769540786743164, "logps/rejected": -1.283287763595581, "loss": 0.9293, "odds_ratio_loss": 0.5238395929336548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08769541233778, "rewards/margins": 0.04063335806131363, "rewards/rejected": -0.12832877039909363, "sft_loss": 0.8769540786743164, "step": 5970 }, { "epoch": 0.46, "grad_norm": 20.731637954711914, "learning_rate": 5.617257160927539e-06, "logits/chosen": -1.3091537952423096, "logits/rejected": -0.9598578214645386, "logps/chosen": -1.185593843460083, "logps/rejected": -11.909768104553223, "loss": 1.2584, "odds_ratio_loss": 0.7278609275817871, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11855938285589218, "rewards/margins": 1.0724174976348877, "rewards/rejected": -1.190976858139038, "sft_loss": 1.185593843460083, "step": 5975 }, { "epoch": 0.47, "grad_norm": 27.39661407470703, "learning_rate": 5.611146225163762e-06, "logits/chosen": -1.1741008758544922, "logits/rejected": -0.8947650194168091, "logps/chosen": -0.9251441955566406, "logps/rejected": -5.459146022796631, "loss": 0.9395, "odds_ratio_loss": 0.14343713223934174, "rewards/accuracies": 1.0, "rewards/chosen": -0.09251442551612854, "rewards/margins": 0.453400194644928, "rewards/rejected": -0.5459145903587341, "sft_loss": 0.9251441955566406, "step": 5980 }, { "epoch": 0.47, "grad_norm": 5.630356311798096, "learning_rate": 5.6050343625191385e-06, "logits/chosen": -1.3278236389160156, "logits/rejected": -1.176492691040039, "logps/chosen": -0.9233428835868835, "logps/rejected": -3.4967033863067627, "loss": 0.9333, "odds_ratio_loss": 0.0994301587343216, "rewards/accuracies": 1.0, "rewards/chosen": -0.09233428537845612, "rewards/margins": 0.2573360800743103, "rewards/rejected": -0.3496703505516052, "sft_loss": 0.9233428835868835, "step": 5985 }, { "epoch": 0.47, "grad_norm": 17.32468605041504, "learning_rate": 5.598921582263087e-06, "logits/chosen": -1.1766599416732788, "logits/rejected": -1.1519089937210083, "logps/chosen": -1.043341040611267, "logps/rejected": -6.694605827331543, "loss": 1.103, "odds_ratio_loss": 0.5967239737510681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10433411598205566, "rewards/margins": 0.5651265382766724, "rewards/rejected": -0.669460654258728, "sft_loss": 1.043341040611267, "step": 5990 }, { "epoch": 0.47, "grad_norm": 5.883633613586426, "learning_rate": 5.592807893666413e-06, "logits/chosen": -1.2702155113220215, "logits/rejected": -1.0136901140213013, "logps/chosen": -0.5562310814857483, "logps/rejected": -4.595681667327881, "loss": 0.5879, "odds_ratio_loss": 0.316739022731781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05562310293316841, "rewards/margins": 0.40394511818885803, "rewards/rejected": -0.45956820249557495, "sft_loss": 0.5562310814857483, "step": 5995 }, { "epoch": 0.47, "grad_norm": 784.6876220703125, "learning_rate": 5.586693306001303e-06, "logits/chosen": -1.3234854936599731, "logits/rejected": -1.0099961757659912, "logps/chosen": -3.74432110786438, "logps/rejected": -5.342950820922852, "loss": 3.938, "odds_ratio_loss": 1.9367377758026123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.37443211674690247, "rewards/margins": 0.1598629653453827, "rewards/rejected": -0.5342950820922852, "sft_loss": 3.74432110786438, "step": 6000 }, { "epoch": 0.47, "grad_norm": 10.769394874572754, "learning_rate": 5.580577828541306e-06, "logits/chosen": -1.3043229579925537, "logits/rejected": -1.1818064451217651, "logps/chosen": -0.7584089040756226, "logps/rejected": -6.9647536277771, "loss": 0.7731, "odds_ratio_loss": 0.1465327888727188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07584089040756226, "rewards/margins": 0.6206345558166504, "rewards/rejected": -0.6964754462242126, "sft_loss": 0.7584089040756226, "step": 6005 }, { "epoch": 0.47, "grad_norm": 8.166546821594238, "learning_rate": 5.5744614705613185e-06, "logits/chosen": -1.3184517621994019, "logits/rejected": -0.8232451677322388, "logps/chosen": -1.2114698886871338, "logps/rejected": -2.8581385612487793, "loss": 1.2571, "odds_ratio_loss": 0.45625367760658264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12114699184894562, "rewards/margins": 0.16466687619686127, "rewards/rejected": -0.2858138680458069, "sft_loss": 1.2114698886871338, "step": 6010 }, { "epoch": 0.47, "grad_norm": 4.491515159606934, "learning_rate": 5.568344241337575e-06, "logits/chosen": -1.3477157354354858, "logits/rejected": -1.0697810649871826, "logps/chosen": -0.9576603174209595, "logps/rejected": -1.879073143005371, "loss": 1.0139, "odds_ratio_loss": 0.5625036358833313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09576603025197983, "rewards/margins": 0.092141292989254, "rewards/rejected": -0.18790733814239502, "sft_loss": 0.9576603174209595, "step": 6015 }, { "epoch": 0.47, "grad_norm": 5.953708648681641, "learning_rate": 5.562226150147629e-06, "logits/chosen": -1.3620707988739014, "logits/rejected": -1.260248064994812, "logps/chosen": -0.64806067943573, "logps/rejected": -1.206020712852478, "loss": 0.6885, "odds_ratio_loss": 0.4039697051048279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06480606645345688, "rewards/margins": 0.05579598993062973, "rewards/rejected": -0.12060205638408661, "sft_loss": 0.64806067943573, "step": 6020 }, { "epoch": 0.47, "grad_norm": 16.700389862060547, "learning_rate": 5.5561072062703426e-06, "logits/chosen": -1.267134428024292, "logits/rejected": -1.0757681131362915, "logps/chosen": -0.8285772204399109, "logps/rejected": -2.7522521018981934, "loss": 0.847, "odds_ratio_loss": 0.18403898179531097, "rewards/accuracies": 1.0, "rewards/chosen": -0.08285772055387497, "rewards/margins": 0.1923675239086151, "rewards/rejected": -0.2752252519130707, "sft_loss": 0.8285772204399109, "step": 6025 }, { "epoch": 0.47, "grad_norm": 6.605173110961914, "learning_rate": 5.549987418985873e-06, "logits/chosen": -1.3399419784545898, "logits/rejected": -0.7796742916107178, "logps/chosen": -0.9934493899345398, "logps/rejected": -2.5662717819213867, "loss": 1.0154, "odds_ratio_loss": 0.2192169427871704, "rewards/accuracies": 1.0, "rewards/chosen": -0.09934493154287338, "rewards/margins": 0.1572822481393814, "rewards/rejected": -0.2566271722316742, "sft_loss": 0.9934493899345398, "step": 6030 }, { "epoch": 0.47, "grad_norm": 5.67198371887207, "learning_rate": 5.543866797575653e-06, "logits/chosen": -1.2885167598724365, "logits/rejected": -0.9590283632278442, "logps/chosen": -1.0691325664520264, "logps/rejected": -3.0669302940368652, "loss": 1.0897, "odds_ratio_loss": 0.20612020790576935, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10691328346729279, "rewards/margins": 0.19977974891662598, "rewards/rejected": -0.30669301748275757, "sft_loss": 1.0691325664520264, "step": 6035 }, { "epoch": 0.47, "grad_norm": 15.120502471923828, "learning_rate": 5.537745351322382e-06, "logits/chosen": -1.2657688856124878, "logits/rejected": -0.9663764834403992, "logps/chosen": -0.5593496561050415, "logps/rejected": -4.728362560272217, "loss": 0.567, "odds_ratio_loss": 0.07687228918075562, "rewards/accuracies": 1.0, "rewards/chosen": -0.05593496561050415, "rewards/margins": 0.41690129041671753, "rewards/rejected": -0.4728362560272217, "sft_loss": 0.5593496561050415, "step": 6040 }, { "epoch": 0.47, "grad_norm": 18.173458099365234, "learning_rate": 5.531623089510011e-06, "logits/chosen": -1.3215463161468506, "logits/rejected": -0.9034023284912109, "logps/chosen": -1.3617818355560303, "logps/rejected": -5.3735671043396, "loss": 1.3972, "odds_ratio_loss": 0.35413289070129395, "rewards/accuracies": 1.0, "rewards/chosen": -0.1361781805753708, "rewards/margins": 0.4011785387992859, "rewards/rejected": -0.5373567342758179, "sft_loss": 1.3617818355560303, "step": 6045 }, { "epoch": 0.47, "grad_norm": 16.87181854248047, "learning_rate": 5.525500021423726e-06, "logits/chosen": -1.3848448991775513, "logits/rejected": -1.0528861284255981, "logps/chosen": -1.08721923828125, "logps/rejected": -6.1181511878967285, "loss": 1.1468, "odds_ratio_loss": 0.5962321162223816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10872192680835724, "rewards/margins": 0.5030931830406189, "rewards/rejected": -0.6118150949478149, "sft_loss": 1.08721923828125, "step": 6050 }, { "epoch": 0.47, "grad_norm": 7.121865272521973, "learning_rate": 5.519376156349942e-06, "logits/chosen": -1.3546545505523682, "logits/rejected": -0.6967934370040894, "logps/chosen": -0.7989253997802734, "logps/rejected": -5.922381401062012, "loss": 0.8182, "odds_ratio_loss": 0.1925160139799118, "rewards/accuracies": 1.0, "rewards/chosen": -0.07989253848791122, "rewards/margins": 0.5123456716537476, "rewards/rejected": -0.592238187789917, "sft_loss": 0.7989253997802734, "step": 6055 }, { "epoch": 0.47, "grad_norm": 10.315966606140137, "learning_rate": 5.513251503576271e-06, "logits/chosen": -1.4382257461547852, "logits/rejected": -0.8289369344711304, "logps/chosen": -0.7345893383026123, "logps/rejected": -3.5460829734802246, "loss": 0.7703, "odds_ratio_loss": 0.35676613450050354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07345893979072571, "rewards/margins": 0.28114938735961914, "rewards/rejected": -0.35460832715034485, "sft_loss": 0.7345893383026123, "step": 6060 }, { "epoch": 0.47, "grad_norm": 12.880824089050293, "learning_rate": 5.507126072391531e-06, "logits/chosen": -1.294734239578247, "logits/rejected": -0.965386688709259, "logps/chosen": -0.9744323492050171, "logps/rejected": -3.9022727012634277, "loss": 0.986, "odds_ratio_loss": 0.11532683670520782, "rewards/accuracies": 1.0, "rewards/chosen": -0.09744324535131454, "rewards/margins": 0.29278403520584106, "rewards/rejected": -0.3902273178100586, "sft_loss": 0.9744323492050171, "step": 6065 }, { "epoch": 0.47, "grad_norm": 4.685819149017334, "learning_rate": 5.500999872085716e-06, "logits/chosen": -1.4079697132110596, "logits/rejected": -1.3205724954605103, "logps/chosen": -3.4261250495910645, "logps/rejected": -5.981120586395264, "loss": 3.6766, "odds_ratio_loss": 2.5042619705200195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.34261250495910645, "rewards/margins": 0.25549960136413574, "rewards/rejected": -0.5981121063232422, "sft_loss": 3.4261250495910645, "step": 6070 }, { "epoch": 0.47, "grad_norm": 9.858994483947754, "learning_rate": 5.494872911949984e-06, "logits/chosen": -1.4069186449050903, "logits/rejected": -1.3357126712799072, "logps/chosen": -1.2475851774215698, "logps/rejected": -9.896788597106934, "loss": 1.2574, "odds_ratio_loss": 0.0983632430434227, "rewards/accuracies": 1.0, "rewards/chosen": -0.1247585192322731, "rewards/margins": 0.8649203181266785, "rewards/rejected": -0.9896788597106934, "sft_loss": 1.2475851774215698, "step": 6075 }, { "epoch": 0.47, "grad_norm": 7.707642078399658, "learning_rate": 5.488745201276651e-06, "logits/chosen": -1.3376123905181885, "logits/rejected": -0.8407508730888367, "logps/chosen": -0.7080657482147217, "logps/rejected": -3.033329963684082, "loss": 0.7166, "odds_ratio_loss": 0.0854514017701149, "rewards/accuracies": 1.0, "rewards/chosen": -0.0708065778017044, "rewards/margins": 0.23252645134925842, "rewards/rejected": -0.303333044052124, "sft_loss": 0.7080657482147217, "step": 6080 }, { "epoch": 0.47, "grad_norm": 8.964948654174805, "learning_rate": 5.482616749359165e-06, "logits/chosen": -1.382056474685669, "logits/rejected": -0.9311412572860718, "logps/chosen": -1.1271944046020508, "logps/rejected": -3.809868335723877, "loss": 1.1583, "odds_ratio_loss": 0.31120091676712036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11271943897008896, "rewards/margins": 0.26826736330986023, "rewards/rejected": -0.3809867799282074, "sft_loss": 1.1271944046020508, "step": 6085 }, { "epoch": 0.47, "grad_norm": 12.242707252502441, "learning_rate": 5.476487565492105e-06, "logits/chosen": -1.3703018426895142, "logits/rejected": -1.1052435636520386, "logps/chosen": -0.9987226724624634, "logps/rejected": -7.2316131591796875, "loss": 1.0014, "odds_ratio_loss": 0.026345301419496536, "rewards/accuracies": 1.0, "rewards/chosen": -0.09987227618694305, "rewards/margins": 0.6232890486717224, "rewards/rejected": -0.7231613397598267, "sft_loss": 0.9987226724624634, "step": 6090 }, { "epoch": 0.47, "grad_norm": 6.250059127807617, "learning_rate": 5.4703576589711534e-06, "logits/chosen": -1.2467752695083618, "logits/rejected": -1.3182735443115234, "logps/chosen": -0.82000333070755, "logps/rejected": -5.343472957611084, "loss": 0.8405, "odds_ratio_loss": 0.20504280924797058, "rewards/accuracies": 1.0, "rewards/chosen": -0.08200033009052277, "rewards/margins": 0.45234689116477966, "rewards/rejected": -0.5343472957611084, "sft_loss": 0.82000333070755, "step": 6095 }, { "epoch": 0.47, "grad_norm": 19.261926651000977, "learning_rate": 5.464227039093093e-06, "logits/chosen": -1.3524301052093506, "logits/rejected": -1.0798319578170776, "logps/chosen": -1.0138263702392578, "logps/rejected": -7.436760902404785, "loss": 1.0177, "odds_ratio_loss": 0.03829532116651535, "rewards/accuracies": 1.0, "rewards/chosen": -0.10138263553380966, "rewards/margins": 0.6422935128211975, "rewards/rejected": -0.7436760663986206, "sft_loss": 1.0138263702392578, "step": 6100 }, { "epoch": 0.47, "grad_norm": 11.354137420654297, "learning_rate": 5.458095715155788e-06, "logits/chosen": -1.3508212566375732, "logits/rejected": -1.3399690389633179, "logps/chosen": -1.062408685684204, "logps/rejected": -5.051278591156006, "loss": 1.0874, "odds_ratio_loss": 0.24966564774513245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1062408834695816, "rewards/margins": 0.3988869786262512, "rewards/rejected": -0.5051278471946716, "sft_loss": 1.062408685684204, "step": 6105 }, { "epoch": 0.48, "grad_norm": 6.081448554992676, "learning_rate": 5.451963696458168e-06, "logits/chosen": -1.338024377822876, "logits/rejected": -1.045498251914978, "logps/chosen": -0.9185785055160522, "logps/rejected": -5.6302618980407715, "loss": 0.9524, "odds_ratio_loss": 0.33786827325820923, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09185785055160522, "rewards/margins": 0.4711683690547943, "rewards/rejected": -0.5630262494087219, "sft_loss": 0.9185785055160522, "step": 6110 }, { "epoch": 0.48, "grad_norm": 9.321093559265137, "learning_rate": 5.445830992300218e-06, "logits/chosen": -1.4295237064361572, "logits/rejected": -0.8577009439468384, "logps/chosen": -0.8555432558059692, "logps/rejected": -4.726191520690918, "loss": 0.867, "odds_ratio_loss": 0.11437414586544037, "rewards/accuracies": 1.0, "rewards/chosen": -0.08555431663990021, "rewards/margins": 0.38706478476524353, "rewards/rejected": -0.4726191461086273, "sft_loss": 0.8555432558059692, "step": 6115 }, { "epoch": 0.48, "grad_norm": 51.58865737915039, "learning_rate": 5.439697611982966e-06, "logits/chosen": -1.3293185234069824, "logits/rejected": -1.09556245803833, "logps/chosen": -0.684034526348114, "logps/rejected": -5.386687278747559, "loss": 0.6868, "odds_ratio_loss": 0.028074750676751137, "rewards/accuracies": 1.0, "rewards/chosen": -0.068403460085392, "rewards/margins": 0.47026529908180237, "rewards/rejected": -0.5386687517166138, "sft_loss": 0.684034526348114, "step": 6120 }, { "epoch": 0.48, "grad_norm": 7.41538143157959, "learning_rate": 5.4335635648084586e-06, "logits/chosen": -1.3778337240219116, "logits/rejected": -1.0915286540985107, "logps/chosen": -0.9108161926269531, "logps/rejected": -6.016547203063965, "loss": 0.932, "odds_ratio_loss": 0.21229729056358337, "rewards/accuracies": 1.0, "rewards/chosen": -0.09108161926269531, "rewards/margins": 0.5105730891227722, "rewards/rejected": -0.6016547083854675, "sft_loss": 0.9108161926269531, "step": 6125 }, { "epoch": 0.48, "grad_norm": 5.8430352210998535, "learning_rate": 5.4274288600797575e-06, "logits/chosen": -1.5422677993774414, "logits/rejected": -1.1286903619766235, "logps/chosen": -0.8631542921066284, "logps/rejected": -6.0735392570495605, "loss": 0.8843, "odds_ratio_loss": 0.21111269295215607, "rewards/accuracies": 1.0, "rewards/chosen": -0.08631541579961777, "rewards/margins": 0.5210385322570801, "rewards/rejected": -0.607353925704956, "sft_loss": 0.8631542921066284, "step": 6130 }, { "epoch": 0.48, "grad_norm": 117.86041259765625, "learning_rate": 5.42129350710092e-06, "logits/chosen": -1.4739691019058228, "logits/rejected": -1.2402070760726929, "logps/chosen": -1.0958428382873535, "logps/rejected": -8.138407707214355, "loss": 1.1265, "odds_ratio_loss": 0.30624285340309143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10958428680896759, "rewards/margins": 0.7042564749717712, "rewards/rejected": -0.8138407468795776, "sft_loss": 1.0958428382873535, "step": 6135 }, { "epoch": 0.48, "grad_norm": 7.562963008880615, "learning_rate": 5.41515751517699e-06, "logits/chosen": -1.413245439529419, "logits/rejected": -1.022647500038147, "logps/chosen": -0.8586851358413696, "logps/rejected": -4.1241044998168945, "loss": 0.8875, "odds_ratio_loss": 0.287839412689209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08586851507425308, "rewards/margins": 0.326541930437088, "rewards/rejected": -0.4124104380607605, "sft_loss": 0.8586851358413696, "step": 6140 }, { "epoch": 0.48, "grad_norm": 8.0787353515625, "learning_rate": 5.409020893613979e-06, "logits/chosen": -1.4699350595474243, "logits/rejected": -1.1512773036956787, "logps/chosen": -1.0986006259918213, "logps/rejected": -2.7863099575042725, "loss": 1.1517, "odds_ratio_loss": 0.5305068492889404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10986007750034332, "rewards/margins": 0.1687709391117096, "rewards/rejected": -0.2786310017108917, "sft_loss": 1.0986006259918213, "step": 6145 }, { "epoch": 0.48, "grad_norm": 29.433841705322266, "learning_rate": 5.402883651718851e-06, "logits/chosen": -1.394988775253296, "logits/rejected": -1.1701581478118896, "logps/chosen": -0.9054506421089172, "logps/rejected": -3.5625643730163574, "loss": 0.94, "odds_ratio_loss": 0.34540730714797974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09054505825042725, "rewards/margins": 0.26571139693260193, "rewards/rejected": -0.3562564551830292, "sft_loss": 0.9054506421089172, "step": 6150 }, { "epoch": 0.48, "grad_norm": 5.943950176239014, "learning_rate": 5.396745798799513e-06, "logits/chosen": -1.378969669342041, "logits/rejected": -1.0622516870498657, "logps/chosen": -0.8753480911254883, "logps/rejected": -5.49657678604126, "loss": 0.8861, "odds_ratio_loss": 0.10759921371936798, "rewards/accuracies": 1.0, "rewards/chosen": -0.08753480017185211, "rewards/margins": 0.4621228277683258, "rewards/rejected": -0.5496576428413391, "sft_loss": 0.8753480911254883, "step": 6155 }, { "epoch": 0.48, "grad_norm": 42.807716369628906, "learning_rate": 5.390607344164799e-06, "logits/chosen": -1.5209299325942993, "logits/rejected": -0.9063835144042969, "logps/chosen": -0.9058001637458801, "logps/rejected": -2.6679656505584717, "loss": 0.9345, "odds_ratio_loss": 0.28699612617492676, "rewards/accuracies": 1.0, "rewards/chosen": -0.09058002382516861, "rewards/margins": 0.17621657252311707, "rewards/rejected": -0.2667965590953827, "sft_loss": 0.9058001637458801, "step": 6160 }, { "epoch": 0.48, "grad_norm": 35.36753845214844, "learning_rate": 5.384468297124452e-06, "logits/chosen": -1.4319443702697754, "logits/rejected": -1.0489351749420166, "logps/chosen": -0.871281623840332, "logps/rejected": -8.540701866149902, "loss": 0.8721, "odds_ratio_loss": 0.008153039962053299, "rewards/accuracies": 1.0, "rewards/chosen": -0.0871281623840332, "rewards/margins": 0.7669421434402466, "rewards/rejected": -0.8540701866149902, "sft_loss": 0.871281623840332, "step": 6165 }, { "epoch": 0.48, "grad_norm": 17.795150756835938, "learning_rate": 5.378328666989121e-06, "logits/chosen": -1.5011329650878906, "logits/rejected": -1.4191067218780518, "logps/chosen": -0.7918688058853149, "logps/rejected": -5.1941094398498535, "loss": 0.8059, "odds_ratio_loss": 0.1406429409980774, "rewards/accuracies": 1.0, "rewards/chosen": -0.07918687909841537, "rewards/margins": 0.4402240216732025, "rewards/rejected": -0.5194109678268433, "sft_loss": 0.7918688058853149, "step": 6170 }, { "epoch": 0.48, "grad_norm": 48.25984573364258, "learning_rate": 5.37218846307033e-06, "logits/chosen": -1.117694616317749, "logits/rejected": -1.5141966342926025, "logps/chosen": -1.1156768798828125, "logps/rejected": -4.8804168701171875, "loss": 1.1522, "odds_ratio_loss": 0.3651631474494934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1115676760673523, "rewards/margins": 0.376473993062973, "rewards/rejected": -0.4880416989326477, "sft_loss": 1.1156768798828125, "step": 6175 }, { "epoch": 0.48, "grad_norm": 13.673327445983887, "learning_rate": 5.36604769468048e-06, "logits/chosen": -1.465071678161621, "logits/rejected": -1.0160772800445557, "logps/chosen": -0.8801183700561523, "logps/rejected": -9.317262649536133, "loss": 0.9039, "odds_ratio_loss": 0.23795120418071747, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08801184594631195, "rewards/margins": 0.8437144160270691, "rewards/rejected": -0.9317262768745422, "sft_loss": 0.8801183700561523, "step": 6180 }, { "epoch": 0.48, "grad_norm": 9.324233055114746, "learning_rate": 5.359906371132828e-06, "logits/chosen": -1.3929476737976074, "logits/rejected": -0.9207280278205872, "logps/chosen": -1.0257680416107178, "logps/rejected": -5.169532299041748, "loss": 1.0405, "odds_ratio_loss": 0.14743469655513763, "rewards/accuracies": 1.0, "rewards/chosen": -0.10257680714130402, "rewards/margins": 0.414376437664032, "rewards/rejected": -0.5169532299041748, "sft_loss": 1.0257680416107178, "step": 6185 }, { "epoch": 0.48, "grad_norm": 26.96074676513672, "learning_rate": 5.3537645017414666e-06, "logits/chosen": -1.2415544986724854, "logits/rejected": -0.9335296750068665, "logps/chosen": -0.7189784049987793, "logps/rejected": -7.029814720153809, "loss": 0.7201, "odds_ratio_loss": 0.010862020775675774, "rewards/accuracies": 1.0, "rewards/chosen": -0.07189784198999405, "rewards/margins": 0.631083607673645, "rewards/rejected": -0.7029815316200256, "sft_loss": 0.7189784049987793, "step": 6190 }, { "epoch": 0.48, "grad_norm": 10.321436882019043, "learning_rate": 5.347622095821324e-06, "logits/chosen": -1.3559714555740356, "logits/rejected": -1.0688087940216064, "logps/chosen": -1.0005199909210205, "logps/rejected": -4.694094657897949, "loss": 1.0338, "odds_ratio_loss": 0.332375168800354, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10005201399326324, "rewards/margins": 0.36935749650001526, "rewards/rejected": -0.4694095253944397, "sft_loss": 1.0005199909210205, "step": 6195 }, { "epoch": 0.48, "grad_norm": 10.095077514648438, "learning_rate": 5.3414791626881355e-06, "logits/chosen": -1.275702714920044, "logits/rejected": -1.1703498363494873, "logps/chosen": -1.2571362257003784, "logps/rejected": -11.217076301574707, "loss": 1.2676, "odds_ratio_loss": 0.10499601066112518, "rewards/accuracies": 1.0, "rewards/chosen": -0.12571361660957336, "rewards/margins": 0.9959939122200012, "rewards/rejected": -1.121707558631897, "sft_loss": 1.2571362257003784, "step": 6200 }, { "epoch": 0.48, "grad_norm": 13.795586585998535, "learning_rate": 5.335335711658443e-06, "logits/chosen": -1.413290023803711, "logits/rejected": -1.6291453838348389, "logps/chosen": -0.8038554191589355, "logps/rejected": -13.573580741882324, "loss": 0.8133, "odds_ratio_loss": 0.09486626088619232, "rewards/accuracies": 1.0, "rewards/chosen": -0.08038554340600967, "rewards/margins": 1.2769726514816284, "rewards/rejected": -1.3573582172393799, "sft_loss": 0.8038554191589355, "step": 6205 }, { "epoch": 0.48, "grad_norm": 7.473656177520752, "learning_rate": 5.329191752049567e-06, "logits/chosen": -1.356372594833374, "logits/rejected": -0.8928033709526062, "logps/chosen": -0.9042119979858398, "logps/rejected": -6.3035078048706055, "loss": 0.9056, "odds_ratio_loss": 0.013910293579101562, "rewards/accuracies": 1.0, "rewards/chosen": -0.09042120724916458, "rewards/margins": 0.5399295687675476, "rewards/rejected": -0.6303507685661316, "sft_loss": 0.9042119979858398, "step": 6210 }, { "epoch": 0.48, "grad_norm": 7.009920597076416, "learning_rate": 5.3230472931796015e-06, "logits/chosen": -1.2536189556121826, "logits/rejected": -0.914240837097168, "logps/chosen": -1.2416796684265137, "logps/rejected": -11.471125602722168, "loss": 1.2476, "odds_ratio_loss": 0.058856308460235596, "rewards/accuracies": 1.0, "rewards/chosen": -0.12416797876358032, "rewards/margins": 1.022944688796997, "rewards/rejected": -1.1471126079559326, "sft_loss": 1.2416796684265137, "step": 6215 }, { "epoch": 0.48, "grad_norm": 257.0062255859375, "learning_rate": 5.316902344367403e-06, "logits/chosen": -1.0821647644042969, "logits/rejected": -1.439767599105835, "logps/chosen": -2.010810375213623, "logps/rejected": -7.0934295654296875, "loss": 2.0161, "odds_ratio_loss": 0.05243242532014847, "rewards/accuracies": 1.0, "rewards/chosen": -0.2010810375213623, "rewards/margins": 0.5082619786262512, "rewards/rejected": -0.7093430161476135, "sft_loss": 2.010810375213623, "step": 6220 }, { "epoch": 0.48, "grad_norm": 11.290236473083496, "learning_rate": 5.310756914932562e-06, "logits/chosen": -1.4713438749313354, "logits/rejected": -1.341290831565857, "logps/chosen": -1.0983505249023438, "logps/rejected": -3.4779162406921387, "loss": 1.1325, "odds_ratio_loss": 0.3412621021270752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10983506590127945, "rewards/margins": 0.23795659840106964, "rewards/rejected": -0.3477916717529297, "sft_loss": 1.0983505249023438, "step": 6225 }, { "epoch": 0.48, "grad_norm": 6.028042316436768, "learning_rate": 5.304611014195404e-06, "logits/chosen": -1.4126899242401123, "logits/rejected": -0.7001927495002747, "logps/chosen": -1.079026699066162, "logps/rejected": -5.179287910461426, "loss": 1.0821, "odds_ratio_loss": 0.030901432037353516, "rewards/accuracies": 1.0, "rewards/chosen": -0.10790266841650009, "rewards/margins": 0.4100261330604553, "rewards/rejected": -0.5179287791252136, "sft_loss": 1.079026699066162, "step": 6230 }, { "epoch": 0.49, "grad_norm": 11.563214302062988, "learning_rate": 5.298464651476969e-06, "logits/chosen": -1.3443890810012817, "logits/rejected": -0.8630617260932922, "logps/chosen": -1.0741831064224243, "logps/rejected": -2.882352352142334, "loss": 1.1107, "odds_ratio_loss": 0.3656443953514099, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10741831362247467, "rewards/margins": 0.18081694841384888, "rewards/rejected": -0.28823524713516235, "sft_loss": 1.0741831064224243, "step": 6235 }, { "epoch": 0.49, "grad_norm": 8.393163681030273, "learning_rate": 5.292317836098996e-06, "logits/chosen": -1.4747278690338135, "logits/rejected": -0.805279552936554, "logps/chosen": -0.876976490020752, "logps/rejected": -2.44502329826355, "loss": 0.9467, "odds_ratio_loss": 0.69696044921875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08769764751195908, "rewards/margins": 0.15680469572544098, "rewards/rejected": -0.24450235068798065, "sft_loss": 0.876976490020752, "step": 6240 }, { "epoch": 0.49, "grad_norm": 37.72679138183594, "learning_rate": 5.286170577383909e-06, "logits/chosen": -1.3478347063064575, "logits/rejected": -0.7419066429138184, "logps/chosen": -1.057916522026062, "logps/rejected": -2.9338550567626953, "loss": 1.1178, "odds_ratio_loss": 0.5986490249633789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10579165071249008, "rewards/margins": 0.18759389221668243, "rewards/rejected": -0.2933855354785919, "sft_loss": 1.057916522026062, "step": 6245 }, { "epoch": 0.49, "grad_norm": 6.0733795166015625, "learning_rate": 5.280022884654809e-06, "logits/chosen": -1.2470510005950928, "logits/rejected": -0.6841954588890076, "logps/chosen": -0.8482405543327332, "logps/rejected": -1.3880754709243774, "loss": 0.8984, "odds_ratio_loss": 0.5011777281761169, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08482404798269272, "rewards/margins": 0.053983498364686966, "rewards/rejected": -0.13880756497383118, "sft_loss": 0.8482405543327332, "step": 6250 }, { "epoch": 0.49, "grad_norm": 87.05939483642578, "learning_rate": 5.27387476723545e-06, "logits/chosen": -1.4118794202804565, "logits/rejected": -1.2081636190414429, "logps/chosen": -1.1015691757202148, "logps/rejected": -4.243358612060547, "loss": 1.1348, "odds_ratio_loss": 0.332470178604126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11015691608190536, "rewards/margins": 0.3141789436340332, "rewards/rejected": -0.42433586716651917, "sft_loss": 1.1015691757202148, "step": 6255 }, { "epoch": 0.49, "grad_norm": 5.861100673675537, "learning_rate": 5.267726234450236e-06, "logits/chosen": -1.2835099697113037, "logits/rejected": -0.6652259826660156, "logps/chosen": -0.7800506949424744, "logps/rejected": -3.8315799236297607, "loss": 0.8021, "odds_ratio_loss": 0.2206587791442871, "rewards/accuracies": 1.0, "rewards/chosen": -0.07800506800413132, "rewards/margins": 0.30515292286872864, "rewards/rejected": -0.38315796852111816, "sft_loss": 0.7800506949424744, "step": 6260 }, { "epoch": 0.49, "grad_norm": 43.585899353027344, "learning_rate": 5.261577295624194e-06, "logits/chosen": -1.4102215766906738, "logits/rejected": -0.8064863085746765, "logps/chosen": -1.0556681156158447, "logps/rejected": -4.5865912437438965, "loss": 1.07, "odds_ratio_loss": 0.14332325756549835, "rewards/accuracies": 1.0, "rewards/chosen": -0.10556681454181671, "rewards/margins": 0.3530922830104828, "rewards/rejected": -0.4586590826511383, "sft_loss": 1.0556681156158447, "step": 6265 }, { "epoch": 0.49, "grad_norm": 7.329597473144531, "learning_rate": 5.2554279600829714e-06, "logits/chosen": -1.498291254043579, "logits/rejected": -1.073769211769104, "logps/chosen": -1.0253098011016846, "logps/rejected": -1.5684717893600464, "loss": 1.0742, "odds_ratio_loss": 0.48867884278297424, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10253097862005234, "rewards/margins": 0.0543162003159523, "rewards/rejected": -0.15684717893600464, "sft_loss": 1.0253098011016846, "step": 6270 }, { "epoch": 0.49, "grad_norm": 21.413816452026367, "learning_rate": 5.24927823715282e-06, "logits/chosen": -1.4272135496139526, "logits/rejected": -1.0142067670822144, "logps/chosen": -0.8830957412719727, "logps/rejected": -3.962498903274536, "loss": 0.8897, "odds_ratio_loss": 0.06568063795566559, "rewards/accuracies": 1.0, "rewards/chosen": -0.08830957859754562, "rewards/margins": 0.3079403340816498, "rewards/rejected": -0.396249920129776, "sft_loss": 0.8830957412719727, "step": 6275 }, { "epoch": 0.49, "grad_norm": 60.13251876831055, "learning_rate": 5.243128136160569e-06, "logits/chosen": -1.362410306930542, "logits/rejected": -1.2184256315231323, "logps/chosen": -1.1046503782272339, "logps/rejected": -2.781712055206299, "loss": 1.1461, "odds_ratio_loss": 0.4149476885795593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11046504974365234, "rewards/margins": 0.1677061766386032, "rewards/rejected": -0.27817121148109436, "sft_loss": 1.1046503782272339, "step": 6280 }, { "epoch": 0.49, "grad_norm": 50.41915512084961, "learning_rate": 5.236977666433633e-06, "logits/chosen": -1.3632569313049316, "logits/rejected": -1.3883846998214722, "logps/chosen": -1.651738166809082, "logps/rejected": -3.164759635925293, "loss": 1.6967, "odds_ratio_loss": 0.44982948899269104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16517381370067596, "rewards/margins": 0.15130215883255005, "rewards/rejected": -0.3164759576320648, "sft_loss": 1.651738166809082, "step": 6285 }, { "epoch": 0.49, "grad_norm": 28.82509422302246, "learning_rate": 5.230826837299976e-06, "logits/chosen": -1.3222404718399048, "logits/rejected": -1.2658565044403076, "logps/chosen": -1.2086848020553589, "logps/rejected": -2.9066264629364014, "loss": 1.2547, "odds_ratio_loss": 0.46052518486976624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1208684891462326, "rewards/margins": 0.16979417204856873, "rewards/rejected": -0.2906626760959625, "sft_loss": 1.2086848020553589, "step": 6290 }, { "epoch": 0.49, "grad_norm": 8.832147598266602, "learning_rate": 5.224675658088115e-06, "logits/chosen": -1.4218961000442505, "logits/rejected": -1.237674593925476, "logps/chosen": -0.9825714826583862, "logps/rejected": -4.558527946472168, "loss": 1.0425, "odds_ratio_loss": 0.5992078185081482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0982571467757225, "rewards/margins": 0.35759562253952026, "rewards/rejected": -0.45585280656814575, "sft_loss": 0.9825714826583862, "step": 6295 }, { "epoch": 0.49, "grad_norm": 10.061553001403809, "learning_rate": 5.218524138127092e-06, "logits/chosen": -1.4015705585479736, "logits/rejected": -0.6733571887016296, "logps/chosen": -0.9295048713684082, "logps/rejected": -1.6065304279327393, "loss": 0.9707, "odds_ratio_loss": 0.41185611486434937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0929504856467247, "rewards/margins": 0.06770254671573639, "rewards/rejected": -0.1606530249118805, "sft_loss": 0.9295048713684082, "step": 6300 }, { "epoch": 0.49, "grad_norm": 7.776479244232178, "learning_rate": 5.212372286746469e-06, "logits/chosen": -1.291441559791565, "logits/rejected": -1.0221130847930908, "logps/chosen": -1.084132432937622, "logps/rejected": -3.2497684955596924, "loss": 1.1144, "odds_ratio_loss": 0.30258169770240784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10841324180364609, "rewards/margins": 0.2165636122226715, "rewards/rejected": -0.3249768614768982, "sft_loss": 1.084132432937622, "step": 6305 }, { "epoch": 0.49, "grad_norm": 8.815446853637695, "learning_rate": 5.206220113276309e-06, "logits/chosen": -1.3701775074005127, "logits/rejected": -0.9479221105575562, "logps/chosen": -1.3828586339950562, "logps/rejected": -3.8318514823913574, "loss": 1.4137, "odds_ratio_loss": 0.30864453315734863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13828587532043457, "rewards/margins": 0.24489931762218475, "rewards/rejected": -0.38318517804145813, "sft_loss": 1.3828586339950562, "step": 6310 }, { "epoch": 0.49, "grad_norm": 16.117143630981445, "learning_rate": 5.200067627047164e-06, "logits/chosen": -1.4318413734436035, "logits/rejected": -1.2156895399093628, "logps/chosen": -1.0816423892974854, "logps/rejected": -2.959960460662842, "loss": 1.1072, "odds_ratio_loss": 0.2559763193130493, "rewards/accuracies": 1.0, "rewards/chosen": -0.1081642359495163, "rewards/margins": 0.1878318041563034, "rewards/rejected": -0.2959960401058197, "sft_loss": 1.0816423892974854, "step": 6315 }, { "epoch": 0.49, "grad_norm": 15.240541458129883, "learning_rate": 5.193914837390062e-06, "logits/chosen": -1.1071289777755737, "logits/rejected": -1.1513497829437256, "logps/chosen": -0.6938591003417969, "logps/rejected": -2.251840591430664, "loss": 0.7126, "odds_ratio_loss": 0.18697881698608398, "rewards/accuracies": 1.0, "rewards/chosen": -0.06938590854406357, "rewards/margins": 0.15579816699028015, "rewards/rejected": -0.22518405318260193, "sft_loss": 0.6938591003417969, "step": 6320 }, { "epoch": 0.49, "grad_norm": 13.213371276855469, "learning_rate": 5.187761753636488e-06, "logits/chosen": -1.2028157711029053, "logits/rejected": -0.8299140930175781, "logps/chosen": -1.0828067064285278, "logps/rejected": -2.840242862701416, "loss": 1.1056, "odds_ratio_loss": 0.2279554158449173, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10828067362308502, "rewards/margins": 0.17574363946914673, "rewards/rejected": -0.28402429819107056, "sft_loss": 1.0828067064285278, "step": 6325 }, { "epoch": 0.49, "grad_norm": 5.185869216918945, "learning_rate": 5.181608385118375e-06, "logits/chosen": -1.275193452835083, "logits/rejected": -0.7473627924919128, "logps/chosen": -0.817855179309845, "logps/rejected": -3.1327741146087646, "loss": 0.847, "odds_ratio_loss": 0.2910774052143097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08178551495075226, "rewards/margins": 0.23149189352989197, "rewards/rejected": -0.3132774233818054, "sft_loss": 0.817855179309845, "step": 6330 }, { "epoch": 0.49, "grad_norm": 5.627729892730713, "learning_rate": 5.175454741168088e-06, "logits/chosen": -1.3115571737289429, "logits/rejected": -0.943062424659729, "logps/chosen": -0.9332623481750488, "logps/rejected": -1.7769415378570557, "loss": 0.9732, "odds_ratio_loss": 0.3995886743068695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09332623332738876, "rewards/margins": 0.08436791598796844, "rewards/rejected": -0.1776941567659378, "sft_loss": 0.9332623481750488, "step": 6335 }, { "epoch": 0.49, "grad_norm": 11.997739791870117, "learning_rate": 5.169300831118411e-06, "logits/chosen": -1.3951687812805176, "logits/rejected": -1.3034172058105469, "logps/chosen": -1.0512475967407227, "logps/rejected": -2.227795124053955, "loss": 1.0738, "odds_ratio_loss": 0.22538113594055176, "rewards/accuracies": 1.0, "rewards/chosen": -0.10512475669384003, "rewards/margins": 0.11765476316213608, "rewards/rejected": -0.2227795124053955, "sft_loss": 1.0512475967407227, "step": 6340 }, { "epoch": 0.49, "grad_norm": 7.004037380218506, "learning_rate": 5.163146664302526e-06, "logits/chosen": -1.308970332145691, "logits/rejected": -0.8894938230514526, "logps/chosen": -1.0626189708709717, "logps/rejected": -2.5290350914001465, "loss": 1.0829, "odds_ratio_loss": 0.20267066359519958, "rewards/accuracies": 1.0, "rewards/chosen": -0.10626189410686493, "rewards/margins": 0.14664161205291748, "rewards/rejected": -0.2529035210609436, "sft_loss": 1.0626189708709717, "step": 6345 }, { "epoch": 0.49, "grad_norm": 7.914247512817383, "learning_rate": 5.156992250054012e-06, "logits/chosen": -1.1687589883804321, "logits/rejected": -1.1605756282806396, "logps/chosen": -0.959539532661438, "logps/rejected": -4.1293487548828125, "loss": 0.9736, "odds_ratio_loss": 0.14065495133399963, "rewards/accuracies": 1.0, "rewards/chosen": -0.09595395624637604, "rewards/margins": 0.31698092818260193, "rewards/rejected": -0.4129348695278168, "sft_loss": 0.959539532661438, "step": 6350 }, { "epoch": 0.49, "grad_norm": 15.616728782653809, "learning_rate": 5.15083759770682e-06, "logits/chosen": -1.275852918624878, "logits/rejected": -0.7614493370056152, "logps/chosen": -1.340222954750061, "logps/rejected": -1.3793169260025024, "loss": 1.4223, "odds_ratio_loss": 0.8204905390739441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1340222954750061, "rewards/margins": 0.003909393213689327, "rewards/rejected": -0.137931689620018, "sft_loss": 1.340222954750061, "step": 6355 }, { "epoch": 0.49, "grad_norm": 14.889914512634277, "learning_rate": 5.144682716595257e-06, "logits/chosen": -1.1798590421676636, "logits/rejected": -1.333195686340332, "logps/chosen": -0.8878809213638306, "logps/rejected": -4.077374458312988, "loss": 0.8987, "odds_ratio_loss": 0.10868176072835922, "rewards/accuracies": 1.0, "rewards/chosen": -0.08878809958696365, "rewards/margins": 0.31894931197166443, "rewards/rejected": -0.4077374339103699, "sft_loss": 0.8878809213638306, "step": 6360 }, { "epoch": 0.5, "grad_norm": 32.431114196777344, "learning_rate": 5.138527616053988e-06, "logits/chosen": -1.4382386207580566, "logits/rejected": -1.2178415060043335, "logps/chosen": -0.9835016131401062, "logps/rejected": -4.400031089782715, "loss": 1.0424, "odds_ratio_loss": 0.5888200998306274, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0983501672744751, "rewards/margins": 0.3416529893875122, "rewards/rejected": -0.4400032162666321, "sft_loss": 0.9835016131401062, "step": 6365 }, { "epoch": 0.5, "grad_norm": 22.511014938354492, "learning_rate": 5.132372305417997e-06, "logits/chosen": -1.2628698348999023, "logits/rejected": -1.4043775796890259, "logps/chosen": -1.1757280826568604, "logps/rejected": -9.438942909240723, "loss": 1.1913, "odds_ratio_loss": 0.15536382794380188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11757279932498932, "rewards/margins": 0.8263214826583862, "rewards/rejected": -0.9438942670822144, "sft_loss": 1.1757280826568604, "step": 6370 }, { "epoch": 0.5, "grad_norm": 25.160930633544922, "learning_rate": 5.126216794022601e-06, "logits/chosen": -1.431951642036438, "logits/rejected": -1.627963662147522, "logps/chosen": -1.1234722137451172, "logps/rejected": -3.777705430984497, "loss": 1.1563, "odds_ratio_loss": 0.327811062335968, "rewards/accuracies": 1.0, "rewards/chosen": -0.11234722286462784, "rewards/margins": 0.26542332768440247, "rewards/rejected": -0.3777705729007721, "sft_loss": 1.1234722137451172, "step": 6375 }, { "epoch": 0.5, "grad_norm": 15.66126537322998, "learning_rate": 5.120061091203412e-06, "logits/chosen": -1.46817946434021, "logits/rejected": -1.135692834854126, "logps/chosen": -1.0381033420562744, "logps/rejected": -2.4826722145080566, "loss": 1.1139, "odds_ratio_loss": 0.7575585246086121, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10381032526493073, "rewards/margins": 0.1444568783044815, "rewards/rejected": -0.24826720356941223, "sft_loss": 1.0381033420562744, "step": 6380 }, { "epoch": 0.5, "grad_norm": 27.64168357849121, "learning_rate": 5.1139052062963335e-06, "logits/chosen": -1.3704216480255127, "logits/rejected": -0.9594882726669312, "logps/chosen": -0.8388217687606812, "logps/rejected": -6.790257453918457, "loss": 0.8457, "odds_ratio_loss": 0.06886833161115646, "rewards/accuracies": 1.0, "rewards/chosen": -0.08388218283653259, "rewards/margins": 0.5951434969902039, "rewards/rejected": -0.6790256500244141, "sft_loss": 0.8388217687606812, "step": 6385 }, { "epoch": 0.5, "grad_norm": 9.125890731811523, "learning_rate": 5.1077491486375475e-06, "logits/chosen": -1.2376512289047241, "logits/rejected": -1.1638411283493042, "logps/chosen": -1.0288476943969727, "logps/rejected": -2.5409605503082275, "loss": 1.0533, "odds_ratio_loss": 0.24471692740917206, "rewards/accuracies": 1.0, "rewards/chosen": -0.10288476943969727, "rewards/margins": 0.15121129155158997, "rewards/rejected": -0.25409606099128723, "sft_loss": 1.0288476943969727, "step": 6390 }, { "epoch": 0.5, "grad_norm": 9.3634614944458, "learning_rate": 5.101592927563498e-06, "logits/chosen": -1.1934869289398193, "logits/rejected": -1.513981580734253, "logps/chosen": -0.8802944421768188, "logps/rejected": -3.766958713531494, "loss": 0.8927, "odds_ratio_loss": 0.12386783212423325, "rewards/accuracies": 1.0, "rewards/chosen": -0.08802944421768188, "rewards/margins": 0.28866642713546753, "rewards/rejected": -0.3766958713531494, "sft_loss": 0.8802944421768188, "step": 6395 }, { "epoch": 0.5, "grad_norm": 15.044342041015625, "learning_rate": 5.095436552410874e-06, "logits/chosen": -1.3474657535552979, "logits/rejected": -0.7830338478088379, "logps/chosen": -1.0011160373687744, "logps/rejected": -2.68935227394104, "loss": 1.0384, "odds_ratio_loss": 0.37255430221557617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10011158883571625, "rewards/margins": 0.16882364451885223, "rewards/rejected": -0.2689352333545685, "sft_loss": 1.0011160373687744, "step": 6400 }, { "epoch": 0.5, "grad_norm": 22.000795364379883, "learning_rate": 5.089280032516601e-06, "logits/chosen": -1.305230736732483, "logits/rejected": -1.1422834396362305, "logps/chosen": -0.9740656018257141, "logps/rejected": -5.649570465087891, "loss": 0.9822, "odds_ratio_loss": 0.08109962940216064, "rewards/accuracies": 1.0, "rewards/chosen": -0.09740655869245529, "rewards/margins": 0.4675505757331848, "rewards/rejected": -0.5649570822715759, "sft_loss": 0.9740656018257141, "step": 6405 }, { "epoch": 0.5, "grad_norm": 8.750536918640137, "learning_rate": 5.083123377217826e-06, "logits/chosen": -1.2674823999404907, "logits/rejected": -0.589019775390625, "logps/chosen": -0.9614855051040649, "logps/rejected": -3.0463080406188965, "loss": 0.9959, "odds_ratio_loss": 0.3439742922782898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0961485505104065, "rewards/margins": 0.2084822654724121, "rewards/rejected": -0.3046308159828186, "sft_loss": 0.9614855051040649, "step": 6410 }, { "epoch": 0.5, "grad_norm": 13.309040069580078, "learning_rate": 5.076966595851894e-06, "logits/chosen": -1.3427834510803223, "logits/rejected": -1.3737881183624268, "logps/chosen": -0.7586371302604675, "logps/rejected": -7.239819526672363, "loss": 0.7824, "odds_ratio_loss": 0.23804640769958496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07586371898651123, "rewards/margins": 0.6481181979179382, "rewards/rejected": -0.7239819765090942, "sft_loss": 0.7586371302604675, "step": 6415 }, { "epoch": 0.5, "grad_norm": 7.2068610191345215, "learning_rate": 5.070809697756347e-06, "logits/chosen": -1.2784476280212402, "logits/rejected": -1.0112392902374268, "logps/chosen": -0.8584426641464233, "logps/rejected": -8.016109466552734, "loss": 0.8745, "odds_ratio_loss": 0.16093608736991882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08584427088499069, "rewards/margins": 0.7157666683197021, "rewards/rejected": -0.8016109466552734, "sft_loss": 0.8584426641464233, "step": 6420 }, { "epoch": 0.5, "grad_norm": 28.779720306396484, "learning_rate": 5.064652692268902e-06, "logits/chosen": -1.3163127899169922, "logits/rejected": -0.7983849048614502, "logps/chosen": -0.8614311218261719, "logps/rejected": -10.967259407043457, "loss": 0.8856, "odds_ratio_loss": 0.24148115515708923, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0861431136727333, "rewards/margins": 1.0105828046798706, "rewards/rejected": -1.0967258214950562, "sft_loss": 0.8614311218261719, "step": 6425 }, { "epoch": 0.5, "grad_norm": 7.035211563110352, "learning_rate": 5.0584955887274425e-06, "logits/chosen": -1.3263068199157715, "logits/rejected": -1.1194041967391968, "logps/chosen": -1.0586429834365845, "logps/rejected": -4.127714157104492, "loss": 1.0917, "odds_ratio_loss": 0.3301314115524292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10586428642272949, "rewards/margins": 0.30690711736679077, "rewards/rejected": -0.41277140378952026, "sft_loss": 1.0586429834365845, "step": 6430 }, { "epoch": 0.5, "grad_norm": 9.80324935913086, "learning_rate": 5.0523383964699955e-06, "logits/chosen": -1.414355993270874, "logits/rejected": -1.308115005493164, "logps/chosen": -0.8374627232551575, "logps/rejected": -5.090659141540527, "loss": 0.8747, "odds_ratio_loss": 0.3718825578689575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08374626934528351, "rewards/margins": 0.4253196716308594, "rewards/rejected": -0.5090659260749817, "sft_loss": 0.8374627232551575, "step": 6435 }, { "epoch": 0.5, "grad_norm": 3.8619236946105957, "learning_rate": 5.0461811248347245e-06, "logits/chosen": -1.4813154935836792, "logits/rejected": -0.9258764982223511, "logps/chosen": -0.9134475588798523, "logps/rejected": -2.8745944499969482, "loss": 0.9478, "odds_ratio_loss": 0.3434610962867737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09134475886821747, "rewards/margins": 0.1961146891117096, "rewards/rejected": -0.28745943307876587, "sft_loss": 0.9134475588798523, "step": 6440 }, { "epoch": 0.5, "grad_norm": 12.92715072631836, "learning_rate": 5.040023783159914e-06, "logits/chosen": -1.3870189189910889, "logits/rejected": -1.30267333984375, "logps/chosen": -1.0112378597259521, "logps/rejected": -6.614487648010254, "loss": 1.0638, "odds_ratio_loss": 0.5260337591171265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10112377256155014, "rewards/margins": 0.5603249669075012, "rewards/rejected": -0.6614487171173096, "sft_loss": 1.0112378597259521, "step": 6445 }, { "epoch": 0.5, "grad_norm": 12.387499809265137, "learning_rate": 5.033866380783955e-06, "logits/chosen": -1.4080413579940796, "logits/rejected": -1.0505297183990479, "logps/chosen": -0.9702507257461548, "logps/rejected": -4.5742621421813965, "loss": 0.9856, "odds_ratio_loss": 0.1531083881855011, "rewards/accuracies": 1.0, "rewards/chosen": -0.097025066614151, "rewards/margins": 0.3604011833667755, "rewards/rejected": -0.4574262201786041, "sft_loss": 0.9702507257461548, "step": 6450 }, { "epoch": 0.5, "grad_norm": 30.352596282958984, "learning_rate": 5.027708927045331e-06, "logits/chosen": -1.4215143918991089, "logits/rejected": -1.3137174844741821, "logps/chosen": -0.7543405890464783, "logps/rejected": -5.948849678039551, "loss": 0.7989, "odds_ratio_loss": 0.44525426626205444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07543405890464783, "rewards/margins": 0.5194509029388428, "rewards/rejected": -0.594884991645813, "sft_loss": 0.7543405890464783, "step": 6455 }, { "epoch": 0.5, "grad_norm": 7.94644832611084, "learning_rate": 5.021551431282599e-06, "logits/chosen": -1.3876312971115112, "logits/rejected": -0.3571079969406128, "logps/chosen": -0.9345148801803589, "logps/rejected": -4.167781352996826, "loss": 0.9487, "odds_ratio_loss": 0.14198826253414154, "rewards/accuracies": 1.0, "rewards/chosen": -0.09345149248838425, "rewards/margins": 0.32332664728164673, "rewards/rejected": -0.4167781472206116, "sft_loss": 0.9345148801803589, "step": 6460 }, { "epoch": 0.5, "grad_norm": 8.467122077941895, "learning_rate": 5.0153939028343855e-06, "logits/chosen": -1.2451789379119873, "logits/rejected": -1.2449508905410767, "logps/chosen": -0.7295305132865906, "logps/rejected": -3.314526081085205, "loss": 0.763, "odds_ratio_loss": 0.33510622382164, "rewards/accuracies": 1.0, "rewards/chosen": -0.07295306026935577, "rewards/margins": 0.2584995627403259, "rewards/rejected": -0.3314525783061981, "sft_loss": 0.7295305132865906, "step": 6465 }, { "epoch": 0.5, "grad_norm": 11.057312965393066, "learning_rate": 5.009236351039366e-06, "logits/chosen": -1.4403380155563354, "logits/rejected": -1.1567604541778564, "logps/chosen": -0.9498114585876465, "logps/rejected": -6.46216344833374, "loss": 0.9606, "odds_ratio_loss": 0.10795946419239044, "rewards/accuracies": 1.0, "rewards/chosen": -0.09498114883899689, "rewards/margins": 0.5512352585792542, "rewards/rejected": -0.6462163925170898, "sft_loss": 0.9498114585876465, "step": 6470 }, { "epoch": 0.5, "grad_norm": 5.785250663757324, "learning_rate": 5.003078785236245e-06, "logits/chosen": -1.288499355316162, "logits/rejected": -1.5583384037017822, "logps/chosen": -1.2569966316223145, "logps/rejected": -6.098020076751709, "loss": 1.2615, "odds_ratio_loss": 0.0446125753223896, "rewards/accuracies": 1.0, "rewards/chosen": -0.12569966912269592, "rewards/margins": 0.484102338552475, "rewards/rejected": -0.6098020672798157, "sft_loss": 1.2569966316223145, "step": 6475 }, { "epoch": 0.5, "grad_norm": 7.866329669952393, "learning_rate": 4.996921214763755e-06, "logits/chosen": -1.4628150463104248, "logits/rejected": -0.8090093731880188, "logps/chosen": -0.8259984850883484, "logps/rejected": -7.853868007659912, "loss": 0.8338, "odds_ratio_loss": 0.07781483232975006, "rewards/accuracies": 1.0, "rewards/chosen": -0.08259985595941544, "rewards/margins": 0.7027870416641235, "rewards/rejected": -0.7853869199752808, "sft_loss": 0.8259984850883484, "step": 6480 }, { "epoch": 0.5, "grad_norm": 26.424274444580078, "learning_rate": 4.990763648960636e-06, "logits/chosen": -1.3772542476654053, "logits/rejected": -0.935505211353302, "logps/chosen": -1.1864153146743774, "logps/rejected": -6.842642307281494, "loss": 1.1887, "odds_ratio_loss": 0.02320820465683937, "rewards/accuracies": 1.0, "rewards/chosen": -0.11864154040813446, "rewards/margins": 0.5656226873397827, "rewards/rejected": -0.6842642426490784, "sft_loss": 1.1864153146743774, "step": 6485 }, { "epoch": 0.5, "grad_norm": 17.74825096130371, "learning_rate": 4.984606097165615e-06, "logits/chosen": -1.3706109523773193, "logits/rejected": -1.016859531402588, "logps/chosen": -0.7705894708633423, "logps/rejected": -3.3283610343933105, "loss": 0.7842, "odds_ratio_loss": 0.13585834205150604, "rewards/accuracies": 1.0, "rewards/chosen": -0.07705894857645035, "rewards/margins": 0.2557772099971771, "rewards/rejected": -0.3328361511230469, "sft_loss": 0.7705894708633423, "step": 6490 }, { "epoch": 0.51, "grad_norm": 5.96090030670166, "learning_rate": 4.978448568717402e-06, "logits/chosen": -1.1399333477020264, "logits/rejected": -1.155973196029663, "logps/chosen": -1.3984425067901611, "logps/rejected": -7.212183952331543, "loss": 1.4215, "odds_ratio_loss": 0.23092961311340332, "rewards/accuracies": 1.0, "rewards/chosen": -0.13984423875808716, "rewards/margins": 0.5813741683959961, "rewards/rejected": -0.7212185263633728, "sft_loss": 1.3984425067901611, "step": 6495 }, { "epoch": 0.51, "grad_norm": 5.749975681304932, "learning_rate": 4.972291072954672e-06, "logits/chosen": -1.350691795349121, "logits/rejected": -1.1734728813171387, "logps/chosen": -1.0730262994766235, "logps/rejected": -4.12339973449707, "loss": 1.0974, "odds_ratio_loss": 0.24394991993904114, "rewards/accuracies": 1.0, "rewards/chosen": -0.10730264335870743, "rewards/margins": 0.30503731966018677, "rewards/rejected": -0.412339985370636, "sft_loss": 1.0730262994766235, "step": 6500 }, { "epoch": 0.51, "grad_norm": 29.49888801574707, "learning_rate": 4.966133619216047e-06, "logits/chosen": -1.090889573097229, "logits/rejected": -1.3663781881332397, "logps/chosen": -1.12447988986969, "logps/rejected": -6.730982780456543, "loss": 1.1604, "odds_ratio_loss": 0.3587406575679779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11244799196720123, "rewards/margins": 0.5606502890586853, "rewards/rejected": -0.6730983853340149, "sft_loss": 1.12447988986969, "step": 6505 }, { "epoch": 0.51, "grad_norm": 8.06817626953125, "learning_rate": 4.959976216840088e-06, "logits/chosen": -1.4288852214813232, "logits/rejected": -1.4129282236099243, "logps/chosen": -1.0631098747253418, "logps/rejected": -14.163995742797852, "loss": 1.0644, "odds_ratio_loss": 0.01322208158671856, "rewards/accuracies": 1.0, "rewards/chosen": -0.10631098598241806, "rewards/margins": 1.3100885152816772, "rewards/rejected": -1.4163994789123535, "sft_loss": 1.0631098747253418, "step": 6510 }, { "epoch": 0.51, "grad_norm": 7.540499210357666, "learning_rate": 4.953818875165276e-06, "logits/chosen": -1.4435449838638306, "logits/rejected": -1.3639262914657593, "logps/chosen": -0.7851355671882629, "logps/rejected": -3.850607395172119, "loss": 0.8093, "odds_ratio_loss": 0.24117080867290497, "rewards/accuracies": 1.0, "rewards/chosen": -0.07851354777812958, "rewards/margins": 0.3065471649169922, "rewards/rejected": -0.38506072759628296, "sft_loss": 0.7851355671882629, "step": 6515 }, { "epoch": 0.51, "grad_norm": 12.225910186767578, "learning_rate": 4.947661603530006e-06, "logits/chosen": -1.300437569618225, "logits/rejected": -1.2396416664123535, "logps/chosen": -1.1051909923553467, "logps/rejected": -6.844620704650879, "loss": 1.1282, "odds_ratio_loss": 0.2299909144639969, "rewards/accuracies": 1.0, "rewards/chosen": -0.11051911115646362, "rewards/margins": 0.5739429593086243, "rewards/rejected": -0.6844619512557983, "sft_loss": 1.1051909923553467, "step": 6520 }, { "epoch": 0.51, "grad_norm": 11.778657913208008, "learning_rate": 4.941504411272559e-06, "logits/chosen": -1.1931259632110596, "logits/rejected": -1.3879821300506592, "logps/chosen": -1.3136595487594604, "logps/rejected": -6.5946364402771, "loss": 1.3388, "odds_ratio_loss": 0.2512947916984558, "rewards/accuracies": 1.0, "rewards/chosen": -0.13136595487594604, "rewards/margins": 0.5280976891517639, "rewards/rejected": -0.6594635844230652, "sft_loss": 1.3136595487594604, "step": 6525 }, { "epoch": 0.51, "grad_norm": 31.723094940185547, "learning_rate": 4.9353473077310985e-06, "logits/chosen": -1.382830023765564, "logits/rejected": -1.0908111333847046, "logps/chosen": -1.1355063915252686, "logps/rejected": -11.42387580871582, "loss": 1.1552, "odds_ratio_loss": 0.19670705497264862, "rewards/accuracies": 1.0, "rewards/chosen": -0.11355062574148178, "rewards/margins": 1.028836965560913, "rewards/rejected": -1.1423876285552979, "sft_loss": 1.1355063915252686, "step": 6530 }, { "epoch": 0.51, "grad_norm": 18.334205627441406, "learning_rate": 4.929190302243655e-06, "logits/chosen": -1.3520934581756592, "logits/rejected": -1.506186842918396, "logps/chosen": -1.0280715227127075, "logps/rejected": -9.33717155456543, "loss": 1.0397, "odds_ratio_loss": 0.11600234359502792, "rewards/accuracies": 1.0, "rewards/chosen": -0.10280714929103851, "rewards/margins": 0.8309100866317749, "rewards/rejected": -0.9337173700332642, "sft_loss": 1.0280715227127075, "step": 6535 }, { "epoch": 0.51, "grad_norm": 22.152624130249023, "learning_rate": 4.9230334041481085e-06, "logits/chosen": -1.3186109066009521, "logits/rejected": -1.539162278175354, "logps/chosen": -0.8228222727775574, "logps/rejected": -12.513707160949707, "loss": 0.8367, "odds_ratio_loss": 0.13917812705039978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08228223025798798, "rewards/margins": 1.16908860206604, "rewards/rejected": -1.2513707876205444, "sft_loss": 0.8228222727775574, "step": 6540 }, { "epoch": 0.51, "grad_norm": 19.71617889404297, "learning_rate": 4.916876622782176e-06, "logits/chosen": -1.2768125534057617, "logits/rejected": -1.4440394639968872, "logps/chosen": -0.863481879234314, "logps/rejected": -8.750720024108887, "loss": 0.8689, "odds_ratio_loss": 0.054228268563747406, "rewards/accuracies": 1.0, "rewards/chosen": -0.08634819090366364, "rewards/margins": 0.7887238264083862, "rewards/rejected": -0.8750720024108887, "sft_loss": 0.863481879234314, "step": 6545 }, { "epoch": 0.51, "grad_norm": 136.82469177246094, "learning_rate": 4.9107199674833995e-06, "logits/chosen": -1.065739631652832, "logits/rejected": -1.2996230125427246, "logps/chosen": -0.8647258877754211, "logps/rejected": -6.739018440246582, "loss": 0.8745, "odds_ratio_loss": 0.0975758507847786, "rewards/accuracies": 1.0, "rewards/chosen": -0.08647258579730988, "rewards/margins": 0.5874292850494385, "rewards/rejected": -0.6739019155502319, "sft_loss": 0.8647258877754211, "step": 6550 }, { "epoch": 0.51, "grad_norm": 20.120380401611328, "learning_rate": 4.904563447589128e-06, "logits/chosen": -1.1406917572021484, "logits/rejected": -1.0273816585540771, "logps/chosen": -0.9011715054512024, "logps/rejected": -5.575922966003418, "loss": 0.956, "odds_ratio_loss": 0.5483925342559814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09011714160442352, "rewards/margins": 0.46747517585754395, "rewards/rejected": -0.5575922727584839, "sft_loss": 0.9011715054512024, "step": 6555 }, { "epoch": 0.51, "grad_norm": 241.13442993164062, "learning_rate": 4.898407072436503e-06, "logits/chosen": -1.3336999416351318, "logits/rejected": -0.5124937295913696, "logps/chosen": -1.1472948789596558, "logps/rejected": -3.278719663619995, "loss": 1.1716, "odds_ratio_loss": 0.24312250316143036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11472950130701065, "rewards/margins": 0.2131424844264984, "rewards/rejected": -0.32787197828292847, "sft_loss": 1.1472948789596558, "step": 6560 }, { "epoch": 0.51, "grad_norm": 8.492508888244629, "learning_rate": 4.892250851362453e-06, "logits/chosen": -1.4392950534820557, "logits/rejected": -1.133285641670227, "logps/chosen": -0.6859443783760071, "logps/rejected": -2.3474297523498535, "loss": 0.7659, "odds_ratio_loss": 0.7996104955673218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06859444081783295, "rewards/margins": 0.16614854335784912, "rewards/rejected": -0.23474296927452087, "sft_loss": 0.6859443783760071, "step": 6565 }, { "epoch": 0.51, "grad_norm": 11.404682159423828, "learning_rate": 4.886094793703668e-06, "logits/chosen": -1.47762930393219, "logits/rejected": -1.2724297046661377, "logps/chosen": -0.7437477111816406, "logps/rejected": -6.391782283782959, "loss": 0.7519, "odds_ratio_loss": 0.0815412700176239, "rewards/accuracies": 1.0, "rewards/chosen": -0.07437478005886078, "rewards/margins": 0.5648034811019897, "rewards/rejected": -0.6391782760620117, "sft_loss": 0.7437477111816406, "step": 6570 }, { "epoch": 0.51, "grad_norm": 26.6160888671875, "learning_rate": 4.87993890879659e-06, "logits/chosen": -1.1805088520050049, "logits/rejected": -0.6844810247421265, "logps/chosen": -1.189537763595581, "logps/rejected": -5.205257415771484, "loss": 1.233, "odds_ratio_loss": 0.4342225193977356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11895380169153214, "rewards/margins": 0.40157192945480347, "rewards/rejected": -0.5205257534980774, "sft_loss": 1.189537763595581, "step": 6575 }, { "epoch": 0.51, "grad_norm": 68.27629089355469, "learning_rate": 4.8737832059773996e-06, "logits/chosen": -1.1993681192398071, "logits/rejected": -1.2758736610412598, "logps/chosen": -0.9924455881118774, "logps/rejected": -5.342121601104736, "loss": 1.0453, "odds_ratio_loss": 0.5286139249801636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09924455732107162, "rewards/margins": 0.43496760725975037, "rewards/rejected": -0.5342121720314026, "sft_loss": 0.9924455881118774, "step": 6580 }, { "epoch": 0.51, "grad_norm": 7.67741060256958, "learning_rate": 4.867627694582004e-06, "logits/chosen": -1.3459727764129639, "logits/rejected": -1.067171335220337, "logps/chosen": -0.9003369212150574, "logps/rejected": -9.823450088500977, "loss": 0.9058, "odds_ratio_loss": 0.05429646372795105, "rewards/accuracies": 1.0, "rewards/chosen": -0.09003370255231857, "rewards/margins": 0.8923112750053406, "rewards/rejected": -0.9823449850082397, "sft_loss": 0.9003369212150574, "step": 6585 }, { "epoch": 0.51, "grad_norm": 12.953150749206543, "learning_rate": 4.861472383946016e-06, "logits/chosen": -1.2747457027435303, "logits/rejected": -0.9779289364814758, "logps/chosen": -1.1248152256011963, "logps/rejected": -8.683989524841309, "loss": 1.1304, "odds_ratio_loss": 0.0556270070374012, "rewards/accuracies": 1.0, "rewards/chosen": -0.11248151957988739, "rewards/margins": 0.7559173703193665, "rewards/rejected": -0.8683989644050598, "sft_loss": 1.1248152256011963, "step": 6590 }, { "epoch": 0.51, "grad_norm": 11.446743965148926, "learning_rate": 4.855317283404742e-06, "logits/chosen": -1.3993771076202393, "logits/rejected": -1.2163068056106567, "logps/chosen": -1.0514163970947266, "logps/rejected": -6.497828006744385, "loss": 1.0756, "odds_ratio_loss": 0.24144919216632843, "rewards/accuracies": 1.0, "rewards/chosen": -0.10514162480831146, "rewards/margins": 0.5446411371231079, "rewards/rejected": -0.6497827768325806, "sft_loss": 1.0514163970947266, "step": 6595 }, { "epoch": 0.51, "grad_norm": 7.466728210449219, "learning_rate": 4.849162402293182e-06, "logits/chosen": -1.4714564085006714, "logits/rejected": -0.9703958630561829, "logps/chosen": -0.8602334260940552, "logps/rejected": -5.671200275421143, "loss": 0.8751, "odds_ratio_loss": 0.14834879338741302, "rewards/accuracies": 1.0, "rewards/chosen": -0.08602333068847656, "rewards/margins": 0.48109668493270874, "rewards/rejected": -0.5671200156211853, "sft_loss": 0.8602334260940552, "step": 6600 }, { "epoch": 0.51, "grad_norm": 4.813689231872559, "learning_rate": 4.8430077499459885e-06, "logits/chosen": -1.4296540021896362, "logits/rejected": -1.2248282432556152, "logps/chosen": -0.9535678625106812, "logps/rejected": -8.022564888000488, "loss": 1.0042, "odds_ratio_loss": 0.5060486793518066, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09535679221153259, "rewards/margins": 0.7068997025489807, "rewards/rejected": -0.8022565841674805, "sft_loss": 0.9535678625106812, "step": 6605 }, { "epoch": 0.51, "grad_norm": 10.360968589782715, "learning_rate": 4.836853335697474e-06, "logits/chosen": -1.4981950521469116, "logits/rejected": -1.1309444904327393, "logps/chosen": -0.9048945307731628, "logps/rejected": -7.258805274963379, "loss": 0.9231, "odds_ratio_loss": 0.18187430500984192, "rewards/accuracies": 1.0, "rewards/chosen": -0.090489462018013, "rewards/margins": 0.6353910565376282, "rewards/rejected": -0.72588050365448, "sft_loss": 0.9048945307731628, "step": 6610 }, { "epoch": 0.51, "grad_norm": 242.73080444335938, "learning_rate": 4.830699168881591e-06, "logits/chosen": -1.413058876991272, "logits/rejected": -1.4513750076293945, "logps/chosen": -1.963711142539978, "logps/rejected": -5.550110816955566, "loss": 1.9724, "odds_ratio_loss": 0.08717626333236694, "rewards/accuracies": 1.0, "rewards/chosen": -0.19637110829353333, "rewards/margins": 0.35864001512527466, "rewards/rejected": -0.5550111532211304, "sft_loss": 1.963711142539978, "step": 6615 }, { "epoch": 0.51, "grad_norm": 7.038618564605713, "learning_rate": 4.824545258831913e-06, "logits/chosen": -1.2381751537322998, "logits/rejected": -0.697541356086731, "logps/chosen": -0.9126702547073364, "logps/rejected": -7.588592529296875, "loss": 0.9215, "odds_ratio_loss": 0.08810234069824219, "rewards/accuracies": 1.0, "rewards/chosen": -0.09126702696084976, "rewards/margins": 0.6675922274589539, "rewards/rejected": -0.7588592767715454, "sft_loss": 0.9126702547073364, "step": 6620 }, { "epoch": 0.52, "grad_norm": 10.980341911315918, "learning_rate": 4.818391614881625e-06, "logits/chosen": -1.4112461805343628, "logits/rejected": -0.8747898936271667, "logps/chosen": -1.4316513538360596, "logps/rejected": -9.764450073242188, "loss": 1.4621, "odds_ratio_loss": 0.30480560660362244, "rewards/accuracies": 1.0, "rewards/chosen": -0.14316514134407043, "rewards/margins": 0.8332799077033997, "rewards/rejected": -0.9764450192451477, "sft_loss": 1.4316513538360596, "step": 6625 }, { "epoch": 0.52, "grad_norm": 6.748105049133301, "learning_rate": 4.812238246363513e-06, "logits/chosen": -1.410508394241333, "logits/rejected": -1.1683613061904907, "logps/chosen": -1.2869287729263306, "logps/rejected": -1.9622493982315063, "loss": 1.3375, "odds_ratio_loss": 0.505769670009613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1286928802728653, "rewards/margins": 0.06753206253051758, "rewards/rejected": -0.19622494280338287, "sft_loss": 1.2869287729263306, "step": 6630 }, { "epoch": 0.52, "grad_norm": 63.81284713745117, "learning_rate": 4.80608516260994e-06, "logits/chosen": -1.3456947803497314, "logits/rejected": -1.339935064315796, "logps/chosen": -3.2150325775146484, "logps/rejected": -10.46107292175293, "loss": 3.2597, "odds_ratio_loss": 0.44674915075302124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.321503221988678, "rewards/margins": 0.724604070186615, "rewards/rejected": -1.046107292175293, "sft_loss": 3.2150325775146484, "step": 6635 }, { "epoch": 0.52, "grad_norm": 16.180912017822266, "learning_rate": 4.799932372952838e-06, "logits/chosen": -1.3194289207458496, "logits/rejected": -0.9958184361457825, "logps/chosen": -1.0840357542037964, "logps/rejected": -8.959443092346191, "loss": 1.0964, "odds_ratio_loss": 0.123465895652771, "rewards/accuracies": 1.0, "rewards/chosen": -0.10840357840061188, "rewards/margins": 0.7875407338142395, "rewards/rejected": -0.8959442973136902, "sft_loss": 1.0840357542037964, "step": 6640 }, { "epoch": 0.52, "grad_norm": 10.999001502990723, "learning_rate": 4.793779886723693e-06, "logits/chosen": -1.0370042324066162, "logits/rejected": -1.355830430984497, "logps/chosen": -1.304805040359497, "logps/rejected": -4.567163944244385, "loss": 1.3344, "odds_ratio_loss": 0.29614123702049255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13048049807548523, "rewards/margins": 0.32623592019081116, "rewards/rejected": -0.456716388463974, "sft_loss": 1.304805040359497, "step": 6645 }, { "epoch": 0.52, "grad_norm": 15.348061561584473, "learning_rate": 4.787627713253533e-06, "logits/chosen": -1.0208803415298462, "logits/rejected": -1.2926498651504517, "logps/chosen": -0.6741073727607727, "logps/rejected": -5.330902099609375, "loss": 0.6904, "odds_ratio_loss": 0.16298021376132965, "rewards/accuracies": 1.0, "rewards/chosen": -0.06741073727607727, "rewards/margins": 0.46567949652671814, "rewards/rejected": -0.5330902338027954, "sft_loss": 0.6741073727607727, "step": 6650 }, { "epoch": 0.52, "grad_norm": 4.856533050537109, "learning_rate": 4.78147586187291e-06, "logits/chosen": -1.4575669765472412, "logits/rejected": -0.7827960848808289, "logps/chosen": -1.1962621212005615, "logps/rejected": -14.277833938598633, "loss": 1.205, "odds_ratio_loss": 0.08750542253255844, "rewards/accuracies": 1.0, "rewards/chosen": -0.11962622404098511, "rewards/margins": 1.308157205581665, "rewards/rejected": -1.4277832508087158, "sft_loss": 1.1962621212005615, "step": 6655 }, { "epoch": 0.52, "grad_norm": 13.62650203704834, "learning_rate": 4.775324341911887e-06, "logits/chosen": -1.4796792268753052, "logits/rejected": -1.3119274377822876, "logps/chosen": -2.3290915489196777, "logps/rejected": -4.825216770172119, "loss": 2.3427, "odds_ratio_loss": 0.13567480444908142, "rewards/accuracies": 1.0, "rewards/chosen": -0.23290912806987762, "rewards/margins": 0.24961252510547638, "rewards/rejected": -0.4825216829776764, "sft_loss": 2.3290915489196777, "step": 6660 }, { "epoch": 0.52, "grad_norm": 18.167985916137695, "learning_rate": 4.769173162700025e-06, "logits/chosen": -1.3903343677520752, "logits/rejected": -1.4637458324432373, "logps/chosen": -0.8073641061782837, "logps/rejected": -8.170225143432617, "loss": 0.8118, "odds_ratio_loss": 0.04413865879178047, "rewards/accuracies": 1.0, "rewards/chosen": -0.08073641359806061, "rewards/margins": 0.7362861037254333, "rewards/rejected": -0.8170225024223328, "sft_loss": 0.8073641061782837, "step": 6665 }, { "epoch": 0.52, "grad_norm": 8.094761848449707, "learning_rate": 4.76302233356637e-06, "logits/chosen": -1.473113775253296, "logits/rejected": -1.0856086015701294, "logps/chosen": -0.7588493227958679, "logps/rejected": -11.757055282592773, "loss": 0.7646, "odds_ratio_loss": 0.05762838199734688, "rewards/accuracies": 1.0, "rewards/chosen": -0.07588493824005127, "rewards/margins": 1.0998207330703735, "rewards/rejected": -1.1757055521011353, "sft_loss": 0.7588493227958679, "step": 6670 }, { "epoch": 0.52, "grad_norm": 5.533440113067627, "learning_rate": 4.756871863839431e-06, "logits/chosen": -1.4507572650909424, "logits/rejected": -1.3853797912597656, "logps/chosen": -0.7131852507591248, "logps/rejected": -2.202475070953369, "loss": 0.7341, "odds_ratio_loss": 0.20953097939491272, "rewards/accuracies": 1.0, "rewards/chosen": -0.07131852954626083, "rewards/margins": 0.14892897009849548, "rewards/rejected": -0.22024747729301453, "sft_loss": 0.7131852507591248, "step": 6675 }, { "epoch": 0.52, "grad_norm": 6.879472255706787, "learning_rate": 4.750721762847182e-06, "logits/chosen": -1.523850440979004, "logits/rejected": -1.277369737625122, "logps/chosen": -1.0856904983520508, "logps/rejected": -7.033446311950684, "loss": 1.0938, "odds_ratio_loss": 0.08136365562677383, "rewards/accuracies": 1.0, "rewards/chosen": -0.10856904834508896, "rewards/margins": 0.5947756767272949, "rewards/rejected": -0.7033447027206421, "sft_loss": 1.0856904983520508, "step": 6680 }, { "epoch": 0.52, "grad_norm": 5.807984352111816, "learning_rate": 4.744572039917029e-06, "logits/chosen": -1.2817108631134033, "logits/rejected": -1.3534901142120361, "logps/chosen": -1.1051974296569824, "logps/rejected": -5.682037830352783, "loss": 1.1592, "odds_ratio_loss": 0.5402711033821106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11051974445581436, "rewards/margins": 0.4576840400695801, "rewards/rejected": -0.5682038068771362, "sft_loss": 1.1051974296569824, "step": 6685 }, { "epoch": 0.52, "grad_norm": 40.49241638183594, "learning_rate": 4.738422704375807e-06, "logits/chosen": -1.3966052532196045, "logits/rejected": -0.7777081727981567, "logps/chosen": -1.0331476926803589, "logps/rejected": -12.39820671081543, "loss": 1.0353, "odds_ratio_loss": 0.021451503038406372, "rewards/accuracies": 1.0, "rewards/chosen": -0.10331475734710693, "rewards/margins": 1.1365059614181519, "rewards/rejected": -1.2398207187652588, "sft_loss": 1.0331476926803589, "step": 6690 }, { "epoch": 0.52, "grad_norm": 10.272732734680176, "learning_rate": 4.732273765549766e-06, "logits/chosen": -1.4794766902923584, "logits/rejected": -1.0296047925949097, "logps/chosen": -1.1400551795959473, "logps/rejected": -6.162641525268555, "loss": 1.1564, "odds_ratio_loss": 0.16316184401512146, "rewards/accuracies": 1.0, "rewards/chosen": -0.11400550603866577, "rewards/margins": 0.5022586584091187, "rewards/rejected": -0.6162641644477844, "sft_loss": 1.1400551795959473, "step": 6695 }, { "epoch": 0.52, "grad_norm": 9.777023315429688, "learning_rate": 4.726125232764551e-06, "logits/chosen": -1.5177513360977173, "logits/rejected": -1.1050008535385132, "logps/chosen": -0.8778587579727173, "logps/rejected": -5.110650062561035, "loss": 0.9289, "odds_ratio_loss": 0.5105150938034058, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08778587728738785, "rewards/margins": 0.4232791066169739, "rewards/rejected": -0.5110650062561035, "sft_loss": 0.8778587579727173, "step": 6700 }, { "epoch": 0.52, "grad_norm": 8.837748527526855, "learning_rate": 4.719977115345194e-06, "logits/chosen": -1.5516029596328735, "logits/rejected": -1.4659761190414429, "logps/chosen": -1.1462736129760742, "logps/rejected": -12.577552795410156, "loss": 1.1534, "odds_ratio_loss": 0.0709126815199852, "rewards/accuracies": 1.0, "rewards/chosen": -0.11462736129760742, "rewards/margins": 1.1431279182434082, "rewards/rejected": -1.2577552795410156, "sft_loss": 1.1462736129760742, "step": 6705 }, { "epoch": 0.52, "grad_norm": 56.009029388427734, "learning_rate": 4.713829422616091e-06, "logits/chosen": -1.3187446594238281, "logits/rejected": -1.0171353816986084, "logps/chosen": -1.2572777271270752, "logps/rejected": -4.165177822113037, "loss": 1.2969, "odds_ratio_loss": 0.395923376083374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1257277876138687, "rewards/margins": 0.29078999161720276, "rewards/rejected": -0.41651779413223267, "sft_loss": 1.2572777271270752, "step": 6710 }, { "epoch": 0.52, "grad_norm": 8.197124481201172, "learning_rate": 4.7076821639010055e-06, "logits/chosen": -1.488896131515503, "logits/rejected": -0.8135197758674622, "logps/chosen": -0.9947648048400879, "logps/rejected": -6.5308356285095215, "loss": 1.0008, "odds_ratio_loss": 0.060649238526821136, "rewards/accuracies": 1.0, "rewards/chosen": -0.09947647899389267, "rewards/margins": 0.5536071062088013, "rewards/rejected": -0.6530836224555969, "sft_loss": 0.9947648048400879, "step": 6715 }, { "epoch": 0.52, "grad_norm": 8.14523983001709, "learning_rate": 4.701535348523032e-06, "logits/chosen": -1.4530843496322632, "logits/rejected": -1.2772367000579834, "logps/chosen": -0.9320453405380249, "logps/rejected": -3.5855605602264404, "loss": 0.9633, "odds_ratio_loss": 0.31262874603271484, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0932045429944992, "rewards/margins": 0.2653515338897705, "rewards/rejected": -0.35855603218078613, "sft_loss": 0.9320453405380249, "step": 6720 }, { "epoch": 0.52, "grad_norm": 11.718148231506348, "learning_rate": 4.695388985804597e-06, "logits/chosen": -1.4163570404052734, "logits/rejected": -1.3425827026367188, "logps/chosen": -0.9410263895988464, "logps/rejected": -11.246500015258789, "loss": 0.9429, "odds_ratio_loss": 0.018949458375573158, "rewards/accuracies": 1.0, "rewards/chosen": -0.0941026359796524, "rewards/margins": 1.0305473804473877, "rewards/rejected": -1.124650001525879, "sft_loss": 0.9410263895988464, "step": 6725 }, { "epoch": 0.52, "grad_norm": 64.7518539428711, "learning_rate": 4.689243085067439e-06, "logits/chosen": -1.2837133407592773, "logits/rejected": -1.119122862815857, "logps/chosen": -0.6392735242843628, "logps/rejected": -7.493457794189453, "loss": 0.641, "odds_ratio_loss": 0.01754852756857872, "rewards/accuracies": 1.0, "rewards/chosen": -0.06392735242843628, "rewards/margins": 0.685418426990509, "rewards/rejected": -0.7493457794189453, "sft_loss": 0.6392735242843628, "step": 6730 }, { "epoch": 0.52, "grad_norm": 11.195655822753906, "learning_rate": 4.6830976556325995e-06, "logits/chosen": -1.4340559244155884, "logits/rejected": -1.155472755432129, "logps/chosen": -2.2630090713500977, "logps/rejected": -4.744475364685059, "loss": 2.2716, "odds_ratio_loss": 0.08597750961780548, "rewards/accuracies": 1.0, "rewards/chosen": -0.226300910115242, "rewards/margins": 0.24814662337303162, "rewards/rejected": -0.4744475483894348, "sft_loss": 2.2630090713500977, "step": 6735 }, { "epoch": 0.52, "grad_norm": 10.269758224487305, "learning_rate": 4.676952706820398e-06, "logits/chosen": -1.336328148841858, "logits/rejected": -1.4924776554107666, "logps/chosen": -1.1410115957260132, "logps/rejected": -12.29522705078125, "loss": 1.1414, "odds_ratio_loss": 0.004271526355296373, "rewards/accuracies": 1.0, "rewards/chosen": -0.11410115659236908, "rewards/margins": 1.1154215335845947, "rewards/rejected": -1.229522705078125, "sft_loss": 1.1410115957260132, "step": 6740 }, { "epoch": 0.52, "grad_norm": 9.63022232055664, "learning_rate": 4.670808247950435e-06, "logits/chosen": -1.1530532836914062, "logits/rejected": -1.0261757373809814, "logps/chosen": -1.232412338256836, "logps/rejected": -6.78533935546875, "loss": 1.2401, "odds_ratio_loss": 0.0765550285577774, "rewards/accuracies": 1.0, "rewards/chosen": -0.12324123084545135, "rewards/margins": 0.5552927851676941, "rewards/rejected": -0.6785339117050171, "sft_loss": 1.232412338256836, "step": 6745 }, { "epoch": 0.53, "grad_norm": 5.54863166809082, "learning_rate": 4.664664288341559e-06, "logits/chosen": -0.951921284198761, "logits/rejected": -1.042023777961731, "logps/chosen": -1.1395822763442993, "logps/rejected": -9.88968276977539, "loss": 1.1433, "odds_ratio_loss": 0.037159256637096405, "rewards/accuracies": 1.0, "rewards/chosen": -0.1139582172036171, "rewards/margins": 0.8750101327896118, "rewards/rejected": -0.9889682531356812, "sft_loss": 1.1395822763442993, "step": 6750 }, { "epoch": 0.53, "grad_norm": 41.68501663208008, "learning_rate": 4.658520837311865e-06, "logits/chosen": -1.3054512739181519, "logits/rejected": -0.7933279871940613, "logps/chosen": -0.9936810731887817, "logps/rejected": -3.1271064281463623, "loss": 1.0154, "odds_ratio_loss": 0.21723489463329315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09936810284852982, "rewards/margins": 0.2133425772190094, "rewards/rejected": -0.312710702419281, "sft_loss": 0.9936810731887817, "step": 6755 }, { "epoch": 0.53, "grad_norm": 6.217238426208496, "learning_rate": 4.652377904178677e-06, "logits/chosen": -1.4349156618118286, "logits/rejected": -0.9307647943496704, "logps/chosen": -1.1279726028442383, "logps/rejected": -7.524369716644287, "loss": 1.1818, "odds_ratio_loss": 0.5385130047798157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11279726028442383, "rewards/margins": 0.6396397352218628, "rewards/rejected": -0.7524369955062866, "sft_loss": 1.1279726028442383, "step": 6760 }, { "epoch": 0.53, "grad_norm": 9.548029899597168, "learning_rate": 4.646235498258534e-06, "logits/chosen": -1.3545303344726562, "logits/rejected": -1.1169371604919434, "logps/chosen": -0.8710645437240601, "logps/rejected": -3.7101237773895264, "loss": 0.8912, "odds_ratio_loss": 0.20155823230743408, "rewards/accuracies": 1.0, "rewards/chosen": -0.08710645884275436, "rewards/margins": 0.28390592336654663, "rewards/rejected": -0.3710123896598816, "sft_loss": 0.8710645437240601, "step": 6765 }, { "epoch": 0.53, "grad_norm": 33.7285041809082, "learning_rate": 4.6400936288671746e-06, "logits/chosen": -1.2456495761871338, "logits/rejected": -1.3842296600341797, "logps/chosen": -0.8866599798202515, "logps/rejected": -4.286937236785889, "loss": 0.9015, "odds_ratio_loss": 0.14859908819198608, "rewards/accuracies": 1.0, "rewards/chosen": -0.08866600692272186, "rewards/margins": 0.34002774953842163, "rewards/rejected": -0.4286937713623047, "sft_loss": 0.8866599798202515, "step": 6770 }, { "epoch": 0.53, "grad_norm": 5.694341659545898, "learning_rate": 4.6339523053195204e-06, "logits/chosen": -1.4577521085739136, "logits/rejected": -0.9543673396110535, "logps/chosen": -0.9715708494186401, "logps/rejected": -3.9672107696533203, "loss": 1.0079, "odds_ratio_loss": 0.36367106437683105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09715709090232849, "rewards/margins": 0.2995639443397522, "rewards/rejected": -0.3967210352420807, "sft_loss": 0.9715708494186401, "step": 6775 }, { "epoch": 0.53, "grad_norm": 5.955051422119141, "learning_rate": 4.6278115369296715e-06, "logits/chosen": -1.4041407108306885, "logits/rejected": -0.504332423210144, "logps/chosen": -0.9568581581115723, "logps/rejected": -5.345180988311768, "loss": 0.9666, "odds_ratio_loss": 0.09708087146282196, "rewards/accuracies": 1.0, "rewards/chosen": -0.09568581730127335, "rewards/margins": 0.43883222341537476, "rewards/rejected": -0.5345180630683899, "sft_loss": 0.9568581581115723, "step": 6780 }, { "epoch": 0.53, "grad_norm": 14.830814361572266, "learning_rate": 4.621671333010882e-06, "logits/chosen": -1.4793100357055664, "logits/rejected": -1.2408435344696045, "logps/chosen": -1.2611196041107178, "logps/rejected": -3.2941811084747314, "loss": 1.31, "odds_ratio_loss": 0.48905545473098755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1261119544506073, "rewards/margins": 0.2033061534166336, "rewards/rejected": -0.3294180929660797, "sft_loss": 1.2611196041107178, "step": 6785 }, { "epoch": 0.53, "grad_norm": 10.222573280334473, "learning_rate": 4.6155317028755484e-06, "logits/chosen": -1.5000641345977783, "logits/rejected": -1.1039273738861084, "logps/chosen": -1.0030043125152588, "logps/rejected": -5.1502885818481445, "loss": 1.0356, "odds_ratio_loss": 0.3260100185871124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10030041635036469, "rewards/margins": 0.41472840309143066, "rewards/rejected": -0.5150288343429565, "sft_loss": 1.0030043125152588, "step": 6790 }, { "epoch": 0.53, "grad_norm": 11.880167007446289, "learning_rate": 4.609392655835203e-06, "logits/chosen": -1.4088056087493896, "logits/rejected": -1.3552888631820679, "logps/chosen": -1.0446202754974365, "logps/rejected": -10.438191413879395, "loss": 1.0486, "odds_ratio_loss": 0.0402403399348259, "rewards/accuracies": 1.0, "rewards/chosen": -0.10446202754974365, "rewards/margins": 0.9393571019172668, "rewards/rejected": -1.0438191890716553, "sft_loss": 1.0446202754974365, "step": 6795 }, { "epoch": 0.53, "grad_norm": 4.905477046966553, "learning_rate": 4.603254201200489e-06, "logits/chosen": -1.3935184478759766, "logits/rejected": -0.6968734860420227, "logps/chosen": -0.8401309251785278, "logps/rejected": -5.342778205871582, "loss": 0.8616, "odds_ratio_loss": 0.21473821997642517, "rewards/accuracies": 1.0, "rewards/chosen": -0.08401308953762054, "rewards/margins": 0.45026469230651855, "rewards/rejected": -0.5342777967453003, "sft_loss": 0.8401309251785278, "step": 6800 }, { "epoch": 0.53, "grad_norm": 32.383575439453125, "learning_rate": 4.59711634828115e-06, "logits/chosen": -1.3621585369110107, "logits/rejected": -1.4311379194259644, "logps/chosen": -0.9719074964523315, "logps/rejected": -3.5250167846679688, "loss": 1.0021, "odds_ratio_loss": 0.30197301506996155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0971907526254654, "rewards/margins": 0.25531092286109924, "rewards/rejected": -0.35250169038772583, "sft_loss": 0.9719074964523315, "step": 6805 }, { "epoch": 0.53, "grad_norm": 157.6527557373047, "learning_rate": 4.5909791063860225e-06, "logits/chosen": -1.343641757965088, "logits/rejected": -1.1741821765899658, "logps/chosen": -1.591430902481079, "logps/rejected": -9.511109352111816, "loss": 1.5978, "odds_ratio_loss": 0.06342393904924393, "rewards/accuracies": 1.0, "rewards/chosen": -0.15914307534694672, "rewards/margins": 0.7919678688049316, "rewards/rejected": -0.9511110186576843, "sft_loss": 1.591430902481079, "step": 6810 }, { "epoch": 0.53, "grad_norm": 10.832589149475098, "learning_rate": 4.584842484823011e-06, "logits/chosen": -1.4096262454986572, "logits/rejected": -1.4636614322662354, "logps/chosen": -0.7325119376182556, "logps/rejected": -8.924284934997559, "loss": 0.7382, "odds_ratio_loss": 0.056823063641786575, "rewards/accuracies": 1.0, "rewards/chosen": -0.07325119525194168, "rewards/margins": 0.8191774487495422, "rewards/rejected": -0.8924285769462585, "sft_loss": 0.7325119376182556, "step": 6815 }, { "epoch": 0.53, "grad_norm": 5.163097858428955, "learning_rate": 4.578706492899082e-06, "logits/chosen": -1.421924352645874, "logits/rejected": -0.8165718913078308, "logps/chosen": -1.1318721771240234, "logps/rejected": -12.301141738891602, "loss": 1.1504, "odds_ratio_loss": 0.18545493483543396, "rewards/accuracies": 1.0, "rewards/chosen": -0.11318721622228622, "rewards/margins": 1.116927146911621, "rewards/rejected": -1.230114221572876, "sft_loss": 1.1318721771240234, "step": 6820 }, { "epoch": 0.53, "grad_norm": 6.831690788269043, "learning_rate": 4.572571139920244e-06, "logits/chosen": -1.389262318611145, "logits/rejected": -0.9818083047866821, "logps/chosen": -0.8672173619270325, "logps/rejected": -2.39469313621521, "loss": 0.9093, "odds_ratio_loss": 0.42063069343566895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0867217406630516, "rewards/margins": 0.15274758636951447, "rewards/rejected": -0.23946928977966309, "sft_loss": 0.8672173619270325, "step": 6825 }, { "epoch": 0.53, "grad_norm": 15.991640090942383, "learning_rate": 4.566436435191543e-06, "logits/chosen": -1.4767954349517822, "logits/rejected": -0.9890767931938171, "logps/chosen": -0.7637965083122253, "logps/rejected": -3.739415407180786, "loss": 0.7842, "odds_ratio_loss": 0.20384731888771057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07637965679168701, "rewards/margins": 0.2975618839263916, "rewards/rejected": -0.3739415109157562, "sft_loss": 0.7637965083122253, "step": 6830 }, { "epoch": 0.53, "grad_norm": 17.176393508911133, "learning_rate": 4.5603023880170355e-06, "logits/chosen": -1.3882057666778564, "logits/rejected": -1.4111310243606567, "logps/chosen": -0.9381965398788452, "logps/rejected": -6.771797180175781, "loss": 0.95, "odds_ratio_loss": 0.1176723837852478, "rewards/accuracies": 1.0, "rewards/chosen": -0.09381966292858124, "rewards/margins": 0.5833600163459778, "rewards/rejected": -0.6771796941757202, "sft_loss": 0.9381965398788452, "step": 6835 }, { "epoch": 0.53, "grad_norm": 5.813627243041992, "learning_rate": 4.554169007699782e-06, "logits/chosen": -1.424593210220337, "logits/rejected": -1.1305902004241943, "logps/chosen": -1.1007654666900635, "logps/rejected": -6.940686225891113, "loss": 1.1305, "odds_ratio_loss": 0.2974892556667328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11007654666900635, "rewards/margins": 0.5839920043945312, "rewards/rejected": -0.6940685510635376, "sft_loss": 1.1007654666900635, "step": 6840 }, { "epoch": 0.53, "grad_norm": 6.086479663848877, "learning_rate": 4.548036303541834e-06, "logits/chosen": -1.4879542589187622, "logits/rejected": -1.0121488571166992, "logps/chosen": -1.0957660675048828, "logps/rejected": -2.0224268436431885, "loss": 1.1405, "odds_ratio_loss": 0.4473304748535156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10957660526037216, "rewards/margins": 0.09266607463359833, "rewards/rejected": -0.2022426575422287, "sft_loss": 1.0957660675048828, "step": 6845 }, { "epoch": 0.53, "grad_norm": 15.212538719177246, "learning_rate": 4.541904284844214e-06, "logits/chosen": -1.3758288621902466, "logits/rejected": -1.157755970954895, "logps/chosen": -0.7310072183609009, "logps/rejected": -3.5768425464630127, "loss": 0.7626, "odds_ratio_loss": 0.3155810832977295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07310072332620621, "rewards/margins": 0.28458356857299805, "rewards/rejected": -0.35768428444862366, "sft_loss": 0.7310072183609009, "step": 6850 }, { "epoch": 0.53, "grad_norm": 8.853516578674316, "learning_rate": 4.535772960906907e-06, "logits/chosen": -1.384530782699585, "logits/rejected": -1.0214731693267822, "logps/chosen": -1.0086669921875, "logps/rejected": -2.941141128540039, "loss": 1.0577, "odds_ratio_loss": 0.49013274908065796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10086669027805328, "rewards/margins": 0.19324742257595062, "rewards/rejected": -0.2941141128540039, "sft_loss": 1.0086669921875, "step": 6855 }, { "epoch": 0.53, "grad_norm": 26.822484970092773, "learning_rate": 4.529642341028847e-06, "logits/chosen": -1.4491193294525146, "logits/rejected": -1.2508859634399414, "logps/chosen": -1.03815495967865, "logps/rejected": -8.596417427062988, "loss": 1.0479, "odds_ratio_loss": 0.09718579053878784, "rewards/accuracies": 1.0, "rewards/chosen": -0.10381549596786499, "rewards/margins": 0.7558261752128601, "rewards/rejected": -0.8596416711807251, "sft_loss": 1.03815495967865, "step": 6860 }, { "epoch": 0.53, "grad_norm": 15.145536422729492, "learning_rate": 4.523512434507897e-06, "logits/chosen": -1.2769041061401367, "logits/rejected": -1.0685583353042603, "logps/chosen": -1.0163986682891846, "logps/rejected": -8.629759788513184, "loss": 1.0191, "odds_ratio_loss": 0.026911329478025436, "rewards/accuracies": 1.0, "rewards/chosen": -0.10163986682891846, "rewards/margins": 0.7613360285758972, "rewards/rejected": -0.8629759550094604, "sft_loss": 1.0163986682891846, "step": 6865 }, { "epoch": 0.53, "grad_norm": 6.6643757820129395, "learning_rate": 4.517383250640836e-06, "logits/chosen": -1.304107666015625, "logits/rejected": -0.9942609667778015, "logps/chosen": -0.9170368313789368, "logps/rejected": -3.983776569366455, "loss": 0.9379, "odds_ratio_loss": 0.20818495750427246, "rewards/accuracies": 1.0, "rewards/chosen": -0.09170368313789368, "rewards/margins": 0.30667397379875183, "rewards/rejected": -0.3983776867389679, "sft_loss": 0.9170368313789368, "step": 6870 }, { "epoch": 0.53, "grad_norm": 25.229204177856445, "learning_rate": 4.511254798723351e-06, "logits/chosen": -1.3273017406463623, "logits/rejected": -1.038374900817871, "logps/chosen": -1.1942617893218994, "logps/rejected": -10.367258071899414, "loss": 1.2342, "odds_ratio_loss": 0.3994136452674866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1194261908531189, "rewards/margins": 0.9172995686531067, "rewards/rejected": -1.0367257595062256, "sft_loss": 1.1942617893218994, "step": 6875 }, { "epoch": 0.54, "grad_norm": 12.490259170532227, "learning_rate": 4.505127088050018e-06, "logits/chosen": -1.1804896593093872, "logits/rejected": -1.1206060647964478, "logps/chosen": -1.0563093423843384, "logps/rejected": -8.599047660827637, "loss": 1.0905, "odds_ratio_loss": 0.34177225828170776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10563093423843384, "rewards/margins": 0.7542737722396851, "rewards/rejected": -0.8599047660827637, "sft_loss": 1.0563093423843384, "step": 6880 }, { "epoch": 0.54, "grad_norm": 6.394746780395508, "learning_rate": 4.499000127914286e-06, "logits/chosen": -1.455170750617981, "logits/rejected": -1.206276297569275, "logps/chosen": -0.9744254946708679, "logps/rejected": -7.1251678466796875, "loss": 0.9991, "odds_ratio_loss": 0.24634718894958496, "rewards/accuracies": 1.0, "rewards/chosen": -0.09744254499673843, "rewards/margins": 0.6150742769241333, "rewards/rejected": -0.7125169038772583, "sft_loss": 0.9744254946708679, "step": 6885 }, { "epoch": 0.54, "grad_norm": 8.001043319702148, "learning_rate": 4.49287392760847e-06, "logits/chosen": -1.1762328147888184, "logits/rejected": -1.4600152969360352, "logps/chosen": -1.2551082372665405, "logps/rejected": -5.348014831542969, "loss": 1.275, "odds_ratio_loss": 0.1989041119813919, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12551084160804749, "rewards/margins": 0.4092906415462494, "rewards/rejected": -0.5348014831542969, "sft_loss": 1.2551082372665405, "step": 6890 }, { "epoch": 0.54, "grad_norm": 9.176358222961426, "learning_rate": 4.48674849642373e-06, "logits/chosen": -1.4300482273101807, "logits/rejected": -1.6007181406021118, "logps/chosen": -1.1959967613220215, "logps/rejected": -6.814823150634766, "loss": 1.2665, "odds_ratio_loss": 0.7045713663101196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11959967762231827, "rewards/margins": 0.5618826150894165, "rewards/rejected": -0.6814823746681213, "sft_loss": 1.1959967613220215, "step": 6895 }, { "epoch": 0.54, "grad_norm": 16.08429718017578, "learning_rate": 4.480623843650061e-06, "logits/chosen": -1.4096542596817017, "logits/rejected": -1.1567366123199463, "logps/chosen": -0.8154433965682983, "logps/rejected": -2.1745846271514893, "loss": 0.9268, "odds_ratio_loss": 1.113632082939148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08154434710741043, "rewards/margins": 0.1359141319990158, "rewards/rejected": -0.21745848655700684, "sft_loss": 0.8154433965682983, "step": 6900 }, { "epoch": 0.54, "grad_norm": 23.622493743896484, "learning_rate": 4.474499978576274e-06, "logits/chosen": -1.3838725090026855, "logits/rejected": -0.9869769811630249, "logps/chosen": -0.7865809202194214, "logps/rejected": -3.4415574073791504, "loss": 0.8022, "odds_ratio_loss": 0.15634959936141968, "rewards/accuracies": 1.0, "rewards/chosen": -0.0786580964922905, "rewards/margins": 0.2654976546764374, "rewards/rejected": -0.34415578842163086, "sft_loss": 0.7865809202194214, "step": 6905 }, { "epoch": 0.54, "grad_norm": 9.461322784423828, "learning_rate": 4.4683769104899905e-06, "logits/chosen": -1.5936346054077148, "logits/rejected": -1.3136579990386963, "logps/chosen": -0.9142619967460632, "logps/rejected": -6.642866611480713, "loss": 0.9418, "odds_ratio_loss": 0.27570387721061707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09142619371414185, "rewards/margins": 0.5728604197502136, "rewards/rejected": -0.6642866134643555, "sft_loss": 0.9142619967460632, "step": 6910 }, { "epoch": 0.54, "grad_norm": 5.414888381958008, "learning_rate": 4.46225464867762e-06, "logits/chosen": -1.4732753038406372, "logits/rejected": -1.5724318027496338, "logps/chosen": -1.603231430053711, "logps/rejected": -10.14500617980957, "loss": 1.623, "odds_ratio_loss": 0.19741734862327576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1603231579065323, "rewards/margins": 0.8541774749755859, "rewards/rejected": -1.014500617980957, "sft_loss": 1.603231430053711, "step": 6915 }, { "epoch": 0.54, "grad_norm": 4.721510410308838, "learning_rate": 4.456133202424349e-06, "logits/chosen": -1.4627020359039307, "logits/rejected": -1.1769201755523682, "logps/chosen": -1.2980149984359741, "logps/rejected": -12.185821533203125, "loss": 1.2998, "odds_ratio_loss": 0.018322288990020752, "rewards/accuracies": 1.0, "rewards/chosen": -0.12980152666568756, "rewards/margins": 1.0887806415557861, "rewards/rejected": -1.2185821533203125, "sft_loss": 1.2980149984359741, "step": 6920 }, { "epoch": 0.54, "grad_norm": 8.03736400604248, "learning_rate": 4.450012581014129e-06, "logits/chosen": -1.320770025253296, "logits/rejected": -1.0708155632019043, "logps/chosen": -1.1251728534698486, "logps/rejected": -2.3679592609405518, "loss": 1.2122, "odds_ratio_loss": 0.8706277012825012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11251727491617203, "rewards/margins": 0.12427864223718643, "rewards/rejected": -0.23679594695568085, "sft_loss": 1.1251728534698486, "step": 6925 }, { "epoch": 0.54, "grad_norm": 13.415603637695312, "learning_rate": 4.443892793729659e-06, "logits/chosen": -1.4199503660202026, "logits/rejected": -1.0905369520187378, "logps/chosen": -1.3850047588348389, "logps/rejected": -6.778962135314941, "loss": 1.4477, "odds_ratio_loss": 0.6268793940544128, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13850049674510956, "rewards/margins": 0.5393957495689392, "rewards/rejected": -0.67789626121521, "sft_loss": 1.3850047588348389, "step": 6930 }, { "epoch": 0.54, "grad_norm": 7.6707892417907715, "learning_rate": 4.437773849852371e-06, "logits/chosen": -1.3866112232208252, "logits/rejected": -1.0096814632415771, "logps/chosen": -0.9563423991203308, "logps/rejected": -4.1207780838012695, "loss": 0.9747, "odds_ratio_loss": 0.18360617756843567, "rewards/accuracies": 1.0, "rewards/chosen": -0.09563424438238144, "rewards/margins": 0.3164435923099518, "rewards/rejected": -0.41207781434059143, "sft_loss": 0.9563423991203308, "step": 6935 }, { "epoch": 0.54, "grad_norm": 15.404536247253418, "learning_rate": 4.431655758662426e-06, "logits/chosen": -1.465071439743042, "logits/rejected": -0.9004141688346863, "logps/chosen": -1.0210731029510498, "logps/rejected": -5.286839485168457, "loss": 1.0485, "odds_ratio_loss": 0.27380725741386414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10210730880498886, "rewards/margins": 0.4265766143798828, "rewards/rejected": -0.5286839604377747, "sft_loss": 1.0210731029510498, "step": 6940 }, { "epoch": 0.54, "grad_norm": 9.173187255859375, "learning_rate": 4.425538529438682e-06, "logits/chosen": -1.4739725589752197, "logits/rejected": -1.0984523296356201, "logps/chosen": -1.8843532800674438, "logps/rejected": -9.872546195983887, "loss": 1.8863, "odds_ratio_loss": 0.01930277980864048, "rewards/accuracies": 1.0, "rewards/chosen": -0.18843533098697662, "rewards/margins": 0.798819363117218, "rewards/rejected": -0.9872547388076782, "sft_loss": 1.8843532800674438, "step": 6945 }, { "epoch": 0.54, "grad_norm": 29.652374267578125, "learning_rate": 4.419422171458695e-06, "logits/chosen": -1.5096216201782227, "logits/rejected": -1.1181193590164185, "logps/chosen": -0.8287372589111328, "logps/rejected": -3.0704493522644043, "loss": 0.8485, "odds_ratio_loss": 0.19784730672836304, "rewards/accuracies": 1.0, "rewards/chosen": -0.08287372440099716, "rewards/margins": 0.22417119145393372, "rewards/rejected": -0.3070449233055115, "sft_loss": 0.8287372589111328, "step": 6950 }, { "epoch": 0.54, "grad_norm": 27.080087661743164, "learning_rate": 4.413306693998697e-06, "logits/chosen": -1.5426450967788696, "logits/rejected": -1.2096660137176514, "logps/chosen": -1.0421079397201538, "logps/rejected": -1.9055150747299194, "loss": 1.1174, "odds_ratio_loss": 0.7526431083679199, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10421079397201538, "rewards/margins": 0.08634071052074432, "rewards/rejected": -0.1905515044927597, "sft_loss": 1.0421079397201538, "step": 6955 }, { "epoch": 0.54, "grad_norm": 39.57759094238281, "learning_rate": 4.407192106333588e-06, "logits/chosen": -1.599490761756897, "logits/rejected": -1.2356152534484863, "logps/chosen": -0.9575656056404114, "logps/rejected": -4.758363246917725, "loss": 0.965, "odds_ratio_loss": 0.0743914544582367, "rewards/accuracies": 1.0, "rewards/chosen": -0.09575656801462173, "rewards/margins": 0.3800797462463379, "rewards/rejected": -0.4758363366127014, "sft_loss": 0.9575656056404114, "step": 6960 }, { "epoch": 0.54, "grad_norm": 28.02701187133789, "learning_rate": 4.401078417736915e-06, "logits/chosen": -1.3408010005950928, "logits/rejected": -0.9972102046012878, "logps/chosen": -0.8788027763366699, "logps/rejected": -3.238997220993042, "loss": 0.9065, "odds_ratio_loss": 0.2769816815853119, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08788027614355087, "rewards/margins": 0.23601946234703064, "rewards/rejected": -0.3238997459411621, "sft_loss": 0.8788027763366699, "step": 6965 }, { "epoch": 0.54, "grad_norm": 9.541007041931152, "learning_rate": 4.394965637480862e-06, "logits/chosen": -1.416936993598938, "logits/rejected": -0.934307873249054, "logps/chosen": -0.6852422952651978, "logps/rejected": -3.539524555206299, "loss": 0.6948, "odds_ratio_loss": 0.09601191431283951, "rewards/accuracies": 1.0, "rewards/chosen": -0.06852422654628754, "rewards/margins": 0.2854282259941101, "rewards/rejected": -0.35395246744155884, "sft_loss": 0.6852422952651978, "step": 6970 }, { "epoch": 0.54, "grad_norm": 11.116430282592773, "learning_rate": 4.38885377483624e-06, "logits/chosen": -1.503397822380066, "logits/rejected": -1.2043156623840332, "logps/chosen": -1.1527019739151, "logps/rejected": -10.427370071411133, "loss": 1.1733, "odds_ratio_loss": 0.20619888603687286, "rewards/accuracies": 1.0, "rewards/chosen": -0.1152702122926712, "rewards/margins": 0.9274666905403137, "rewards/rejected": -1.0427368879318237, "sft_loss": 1.1527019739151, "step": 6975 }, { "epoch": 0.54, "grad_norm": 5.822205543518066, "learning_rate": 4.3827428390724625e-06, "logits/chosen": -1.523048758506775, "logits/rejected": -0.8828157186508179, "logps/chosen": -1.0152368545532227, "logps/rejected": -6.31392765045166, "loss": 1.0357, "odds_ratio_loss": 0.20493432879447937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10152368247509003, "rewards/margins": 0.529869019985199, "rewards/rejected": -0.631392776966095, "sft_loss": 1.0152368545532227, "step": 6980 }, { "epoch": 0.54, "grad_norm": 31.748807907104492, "learning_rate": 4.376632839457538e-06, "logits/chosen": -1.49868905544281, "logits/rejected": -0.8400151133537292, "logps/chosen": -1.004535436630249, "logps/rejected": -4.5008649826049805, "loss": 1.0315, "odds_ratio_loss": 0.2695631682872772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10045354068279266, "rewards/margins": 0.34963294863700867, "rewards/rejected": -0.4500865042209625, "sft_loss": 1.004535436630249, "step": 6985 }, { "epoch": 0.54, "grad_norm": 19.450061798095703, "learning_rate": 4.37052378525806e-06, "logits/chosen": -1.3124674558639526, "logits/rejected": -1.3847830295562744, "logps/chosen": -0.941940426826477, "logps/rejected": -3.816953182220459, "loss": 0.9567, "odds_ratio_loss": 0.14801433682441711, "rewards/accuracies": 1.0, "rewards/chosen": -0.09419403970241547, "rewards/margins": 0.2875012755393982, "rewards/rejected": -0.38169533014297485, "sft_loss": 0.941940426826477, "step": 6990 }, { "epoch": 0.54, "grad_norm": 9.087126731872559, "learning_rate": 4.364415685739183e-06, "logits/chosen": -1.498969316482544, "logits/rejected": -1.654484510421753, "logps/chosen": -0.8401981592178345, "logps/rejected": -13.466900825500488, "loss": 0.8517, "odds_ratio_loss": 0.11514023691415787, "rewards/accuracies": 1.0, "rewards/chosen": -0.08401981741189957, "rewards/margins": 1.2626702785491943, "rewards/rejected": -1.346690058708191, "sft_loss": 0.8401981592178345, "step": 6995 }, { "epoch": 0.54, "grad_norm": 18.640514373779297, "learning_rate": 4.358308550164616e-06, "logits/chosen": -1.3498393297195435, "logits/rejected": -1.2818952798843384, "logps/chosen": -1.1116533279418945, "logps/rejected": -9.627163887023926, "loss": 1.1231, "odds_ratio_loss": 0.11460292339324951, "rewards/accuracies": 1.0, "rewards/chosen": -0.11116534471511841, "rewards/margins": 0.8515509366989136, "rewards/rejected": -0.9627164006233215, "sft_loss": 1.1116533279418945, "step": 7000 }, { "epoch": 0.54, "grad_norm": 6.592902183532715, "learning_rate": 4.352202387796602e-06, "logits/chosen": -1.4888490438461304, "logits/rejected": -0.92115318775177, "logps/chosen": -1.0439634323120117, "logps/rejected": -7.5101637840271, "loss": 1.0464, "odds_ratio_loss": 0.023921433836221695, "rewards/accuracies": 1.0, "rewards/chosen": -0.10439634323120117, "rewards/margins": 0.6466200351715088, "rewards/rejected": -0.7510164380073547, "sft_loss": 1.0439634323120117, "step": 7005 }, { "epoch": 0.55, "grad_norm": 19.845417022705078, "learning_rate": 4.346097207895917e-06, "logits/chosen": -1.3229620456695557, "logits/rejected": -1.0637264251708984, "logps/chosen": -1.2682688236236572, "logps/rejected": -11.699102401733398, "loss": 1.2807, "odds_ratio_loss": 0.12445087730884552, "rewards/accuracies": 1.0, "rewards/chosen": -0.12682689726352692, "rewards/margins": 1.0430833101272583, "rewards/rejected": -1.169910192489624, "sft_loss": 1.2682688236236572, "step": 7010 }, { "epoch": 0.55, "grad_norm": 7.902325630187988, "learning_rate": 4.339993019721839e-06, "logits/chosen": -1.4001775979995728, "logits/rejected": -0.9012830853462219, "logps/chosen": -0.9438796043395996, "logps/rejected": -6.606081485748291, "loss": 1.0343, "odds_ratio_loss": 0.9038652181625366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0943879634141922, "rewards/margins": 0.5662201642990112, "rewards/rejected": -0.660608172416687, "sft_loss": 0.9438796043395996, "step": 7015 }, { "epoch": 0.55, "grad_norm": 7.624948978424072, "learning_rate": 4.333889832532142e-06, "logits/chosen": -1.5406997203826904, "logits/rejected": -1.1605985164642334, "logps/chosen": -1.087935447692871, "logps/rejected": -6.6805596351623535, "loss": 1.1146, "odds_ratio_loss": 0.26643556356430054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10879355669021606, "rewards/margins": 0.5592623949050903, "rewards/rejected": -0.6680558919906616, "sft_loss": 1.087935447692871, "step": 7020 }, { "epoch": 0.55, "grad_norm": 11.003002166748047, "learning_rate": 4.327787655583089e-06, "logits/chosen": -1.5272648334503174, "logits/rejected": -0.9672309756278992, "logps/chosen": -1.0818145275115967, "logps/rejected": -5.941993713378906, "loss": 1.1105, "odds_ratio_loss": 0.28728801012039185, "rewards/accuracies": 1.0, "rewards/chosen": -0.10818145424127579, "rewards/margins": 0.48601800203323364, "rewards/rejected": -0.5941994190216064, "sft_loss": 1.0818145275115967, "step": 7025 }, { "epoch": 0.55, "grad_norm": 20.621822357177734, "learning_rate": 4.321686498129404e-06, "logits/chosen": -1.5019843578338623, "logits/rejected": -1.1324069499969482, "logps/chosen": -1.1026496887207031, "logps/rejected": -8.86848258972168, "loss": 1.1324, "odds_ratio_loss": 0.2977941036224365, "rewards/accuracies": 1.0, "rewards/chosen": -0.11026497185230255, "rewards/margins": 0.7765833139419556, "rewards/rejected": -0.8868482708930969, "sft_loss": 1.1026496887207031, "step": 7030 }, { "epoch": 0.55, "grad_norm": 15.918548583984375, "learning_rate": 4.315586369424265e-06, "logits/chosen": -1.4344863891601562, "logits/rejected": -0.7808696627616882, "logps/chosen": -0.8610206842422485, "logps/rejected": -4.585818290710449, "loss": 0.8939, "odds_ratio_loss": 0.3287752866744995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08610206097364426, "rewards/margins": 0.37247976660728455, "rewards/rejected": -0.4585818350315094, "sft_loss": 0.8610206842422485, "step": 7035 }, { "epoch": 0.55, "grad_norm": 45.551788330078125, "learning_rate": 4.309487278719294e-06, "logits/chosen": -1.3992388248443604, "logits/rejected": -0.9382654428482056, "logps/chosen": -1.136695146560669, "logps/rejected": -4.336820125579834, "loss": 1.1791, "odds_ratio_loss": 0.4242839813232422, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11366952955722809, "rewards/margins": 0.32001250982284546, "rewards/rejected": -0.43368202447891235, "sft_loss": 1.136695146560669, "step": 7040 }, { "epoch": 0.55, "grad_norm": 18.471166610717773, "learning_rate": 4.303389235264536e-06, "logits/chosen": -1.2700679302215576, "logits/rejected": -0.6969276070594788, "logps/chosen": -0.911207377910614, "logps/rejected": -1.6082451343536377, "loss": 0.9555, "odds_ratio_loss": 0.44246095418930054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09112074226140976, "rewards/margins": 0.06970375031232834, "rewards/rejected": -0.1608245074748993, "sft_loss": 0.911207377910614, "step": 7045 }, { "epoch": 0.55, "grad_norm": 17.821630477905273, "learning_rate": 4.297292248308446e-06, "logits/chosen": -1.3343194723129272, "logits/rejected": -0.8145080804824829, "logps/chosen": -0.9822369813919067, "logps/rejected": -6.457598686218262, "loss": 0.996, "odds_ratio_loss": 0.1377163976430893, "rewards/accuracies": 1.0, "rewards/chosen": -0.09822369366884232, "rewards/margins": 0.5475361943244934, "rewards/rejected": -0.6457598805427551, "sft_loss": 0.9822369813919067, "step": 7050 }, { "epoch": 0.55, "grad_norm": 10.077775955200195, "learning_rate": 4.291196327097883e-06, "logits/chosen": -1.3986437320709229, "logits/rejected": -0.9943048357963562, "logps/chosen": -1.0187883377075195, "logps/rejected": -6.489192962646484, "loss": 1.0399, "odds_ratio_loss": 0.2116069495677948, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10187884420156479, "rewards/margins": 0.5470404624938965, "rewards/rejected": -0.6489192247390747, "sft_loss": 1.0187883377075195, "step": 7055 }, { "epoch": 0.55, "grad_norm": 4.37607479095459, "learning_rate": 4.285101480878083e-06, "logits/chosen": -1.4077237844467163, "logits/rejected": -1.0786534547805786, "logps/chosen": -1.0912609100341797, "logps/rejected": -7.178577423095703, "loss": 1.1096, "odds_ratio_loss": 0.18366660177707672, "rewards/accuracies": 1.0, "rewards/chosen": -0.10912607610225677, "rewards/margins": 0.6087316870689392, "rewards/rejected": -0.7178577184677124, "sft_loss": 1.0912609100341797, "step": 7060 }, { "epoch": 0.55, "grad_norm": 8.530895233154297, "learning_rate": 4.279007718892654e-06, "logits/chosen": -1.3784153461456299, "logits/rejected": -1.297706961631775, "logps/chosen": -1.073346495628357, "logps/rejected": -7.513908386230469, "loss": 1.0977, "odds_ratio_loss": 0.24378827214241028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10733465105295181, "rewards/margins": 0.6440561413764954, "rewards/rejected": -0.7513907551765442, "sft_loss": 1.073346495628357, "step": 7065 }, { "epoch": 0.55, "grad_norm": 28.515674591064453, "learning_rate": 4.272915050383559e-06, "logits/chosen": -1.4155004024505615, "logits/rejected": -1.6100749969482422, "logps/chosen": -1.1467911005020142, "logps/rejected": -6.216679573059082, "loss": 1.1542, "odds_ratio_loss": 0.07433410733938217, "rewards/accuracies": 1.0, "rewards/chosen": -0.11467909812927246, "rewards/margins": 0.5069888830184937, "rewards/rejected": -0.6216680407524109, "sft_loss": 1.1467911005020142, "step": 7070 }, { "epoch": 0.55, "grad_norm": 5.460112571716309, "learning_rate": 4.266823484591106e-06, "logits/chosen": -1.4371813535690308, "logits/rejected": -1.3315809965133667, "logps/chosen": -0.8955456018447876, "logps/rejected": -4.667305946350098, "loss": 0.9309, "odds_ratio_loss": 0.353960245847702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.089554563164711, "rewards/margins": 0.37717610597610474, "rewards/rejected": -0.46673065423965454, "sft_loss": 0.8955456018447876, "step": 7075 }, { "epoch": 0.55, "grad_norm": 26.168041229248047, "learning_rate": 4.260733030753926e-06, "logits/chosen": -1.454080581665039, "logits/rejected": -1.0602571964263916, "logps/chosen": -1.140600562095642, "logps/rejected": -5.033745765686035, "loss": 1.1835, "odds_ratio_loss": 0.429409921169281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11406006664037704, "rewards/margins": 0.38931459188461304, "rewards/rejected": -0.5033746361732483, "sft_loss": 1.140600562095642, "step": 7080 }, { "epoch": 0.55, "grad_norm": 27.514564514160156, "learning_rate": 4.254643698108963e-06, "logits/chosen": -1.4346212148666382, "logits/rejected": -0.9562221765518188, "logps/chosen": -0.9034181833267212, "logps/rejected": -3.7591049671173096, "loss": 0.9172, "odds_ratio_loss": 0.1378771811723709, "rewards/accuracies": 1.0, "rewards/chosen": -0.09034182131290436, "rewards/margins": 0.2855686545372009, "rewards/rejected": -0.3759104609489441, "sft_loss": 0.9034181833267212, "step": 7085 }, { "epoch": 0.55, "grad_norm": 23.025066375732422, "learning_rate": 4.2485554958914695e-06, "logits/chosen": -1.4962995052337646, "logits/rejected": -0.750170111656189, "logps/chosen": -1.304537057876587, "logps/rejected": -6.853229522705078, "loss": 1.3358, "odds_ratio_loss": 0.3125781714916229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1304537057876587, "rewards/margins": 0.5548692941665649, "rewards/rejected": -0.6853229999542236, "sft_loss": 1.304537057876587, "step": 7090 }, { "epoch": 0.55, "grad_norm": 91.04281616210938, "learning_rate": 4.2424684333349725e-06, "logits/chosen": -1.413205862045288, "logits/rejected": -1.2042224407196045, "logps/chosen": -1.3621070384979248, "logps/rejected": -4.676912307739258, "loss": 1.3821, "odds_ratio_loss": 0.19968989491462708, "rewards/accuracies": 1.0, "rewards/chosen": -0.13621070981025696, "rewards/margins": 0.3314804434776306, "rewards/rejected": -0.4676911234855652, "sft_loss": 1.3621070384979248, "step": 7095 }, { "epoch": 0.55, "grad_norm": 5.257567882537842, "learning_rate": 4.236382519671276e-06, "logits/chosen": -1.3120917081832886, "logits/rejected": -1.0594903230667114, "logps/chosen": -1.0127789974212646, "logps/rejected": -10.14476490020752, "loss": 1.0129, "odds_ratio_loss": 0.0008429423905909061, "rewards/accuracies": 1.0, "rewards/chosen": -0.10127788782119751, "rewards/margins": 0.9131986498832703, "rewards/rejected": -1.0144765377044678, "sft_loss": 1.0127789974212646, "step": 7100 }, { "epoch": 0.55, "grad_norm": 39.47378921508789, "learning_rate": 4.230297764130441e-06, "logits/chosen": -1.1948641538619995, "logits/rejected": -1.540321707725525, "logps/chosen": -0.913619875907898, "logps/rejected": -8.393054962158203, "loss": 0.9149, "odds_ratio_loss": 0.012436440214514732, "rewards/accuracies": 1.0, "rewards/chosen": -0.09136199951171875, "rewards/margins": 0.7479435205459595, "rewards/rejected": -0.8393055200576782, "sft_loss": 0.913619875907898, "step": 7105 }, { "epoch": 0.55, "grad_norm": 7.7148756980896, "learning_rate": 4.224214175940773e-06, "logits/chosen": -1.5498994588851929, "logits/rejected": -0.8623272776603699, "logps/chosen": -1.627855658531189, "logps/rejected": -3.274567127227783, "loss": 1.6546, "odds_ratio_loss": 0.2678052484989166, "rewards/accuracies": 1.0, "rewards/chosen": -0.16278555989265442, "rewards/margins": 0.1646711379289627, "rewards/rejected": -0.3274567127227783, "sft_loss": 1.627855658531189, "step": 7110 }, { "epoch": 0.55, "grad_norm": 30.301809310913086, "learning_rate": 4.218131764328802e-06, "logits/chosen": -1.5537168979644775, "logits/rejected": -1.0891371965408325, "logps/chosen": -0.9233828783035278, "logps/rejected": -5.229593753814697, "loss": 0.9502, "odds_ratio_loss": 0.2679787278175354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09233828634023666, "rewards/margins": 0.43062108755111694, "rewards/rejected": -0.5229593515396118, "sft_loss": 0.9233828783035278, "step": 7115 }, { "epoch": 0.55, "grad_norm": 11.14027214050293, "learning_rate": 4.2120505385192835e-06, "logits/chosen": -1.4834104776382446, "logits/rejected": -1.1844459772109985, "logps/chosen": -1.6177314519882202, "logps/rejected": -3.180201530456543, "loss": 1.676, "odds_ratio_loss": 0.5829809904098511, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16177313029766083, "rewards/margins": 0.15624700486660004, "rewards/rejected": -0.31802016496658325, "sft_loss": 1.6177314519882202, "step": 7120 }, { "epoch": 0.55, "grad_norm": 8.944758415222168, "learning_rate": 4.205970507735165e-06, "logits/chosen": -1.4102011919021606, "logits/rejected": -1.3182289600372314, "logps/chosen": -0.9230417013168335, "logps/rejected": -3.186675548553467, "loss": 0.9337, "odds_ratio_loss": 0.1070776954293251, "rewards/accuracies": 1.0, "rewards/chosen": -0.09230418503284454, "rewards/margins": 0.2263633906841278, "rewards/rejected": -0.31866756081581116, "sft_loss": 0.9230417013168335, "step": 7125 }, { "epoch": 0.55, "grad_norm": 12.016209602355957, "learning_rate": 4.199891681197585e-06, "logits/chosen": -1.3144750595092773, "logits/rejected": -1.002539873123169, "logps/chosen": -1.2599903345108032, "logps/rejected": -3.335179090499878, "loss": 1.2785, "odds_ratio_loss": 0.18462809920310974, "rewards/accuracies": 1.0, "rewards/chosen": -0.12599903345108032, "rewards/margins": 0.20751889050006866, "rewards/rejected": -0.3335179388523102, "sft_loss": 1.2599903345108032, "step": 7130 }, { "epoch": 0.56, "grad_norm": 6.1627631187438965, "learning_rate": 4.193814068125854e-06, "logits/chosen": -1.3824846744537354, "logits/rejected": -0.680531919002533, "logps/chosen": -0.9973615407943726, "logps/rejected": -3.085211992263794, "loss": 1.0166, "odds_ratio_loss": 0.19264943897724152, "rewards/accuracies": 1.0, "rewards/chosen": -0.09973615407943726, "rewards/margins": 0.2087850272655487, "rewards/rejected": -0.30852121114730835, "sft_loss": 0.9973615407943726, "step": 7135 }, { "epoch": 0.56, "grad_norm": 8.950169563293457, "learning_rate": 4.187737677737448e-06, "logits/chosen": -1.4213409423828125, "logits/rejected": -1.0563170909881592, "logps/chosen": -1.1497620344161987, "logps/rejected": -5.6467132568359375, "loss": 1.1613, "odds_ratio_loss": 0.11577478796243668, "rewards/accuracies": 1.0, "rewards/chosen": -0.11497620493173599, "rewards/margins": 0.44969505071640015, "rewards/rejected": -0.5646712779998779, "sft_loss": 1.1497620344161987, "step": 7140 }, { "epoch": 0.56, "grad_norm": 5.027875900268555, "learning_rate": 4.181662519247983e-06, "logits/chosen": -1.4923940896987915, "logits/rejected": -0.8564295768737793, "logps/chosen": -1.015838861465454, "logps/rejected": -4.046143531799316, "loss": 1.0314, "odds_ratio_loss": 0.1554424911737442, "rewards/accuracies": 1.0, "rewards/chosen": -0.10158388316631317, "rewards/margins": 0.30303049087524414, "rewards/rejected": -0.4046143591403961, "sft_loss": 1.015838861465454, "step": 7145 }, { "epoch": 0.56, "grad_norm": 7.206550121307373, "learning_rate": 4.175588601871206e-06, "logits/chosen": -1.4180933237075806, "logits/rejected": -0.9299715757369995, "logps/chosen": -0.8494499921798706, "logps/rejected": -3.278174877166748, "loss": 0.8656, "odds_ratio_loss": 0.16184845566749573, "rewards/accuracies": 1.0, "rewards/chosen": -0.08494500815868378, "rewards/margins": 0.24287250638008118, "rewards/rejected": -0.32781749963760376, "sft_loss": 0.8494499921798706, "step": 7150 }, { "epoch": 0.56, "grad_norm": 5.67272424697876, "learning_rate": 4.169515934818987e-06, "logits/chosen": -1.3000648021697998, "logits/rejected": -0.8862468600273132, "logps/chosen": -1.3208826780319214, "logps/rejected": -2.5383241176605225, "loss": 1.3589, "odds_ratio_loss": 0.3805355131626129, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13208827376365662, "rewards/margins": 0.12174415588378906, "rewards/rejected": -0.2538324296474457, "sft_loss": 1.3208826780319214, "step": 7155 }, { "epoch": 0.56, "grad_norm": 14.509788513183594, "learning_rate": 4.163444527301296e-06, "logits/chosen": -1.4848687648773193, "logits/rejected": -1.325732946395874, "logps/chosen": -0.8232647180557251, "logps/rejected": -5.763245105743408, "loss": 0.8311, "odds_ratio_loss": 0.07884080708026886, "rewards/accuracies": 1.0, "rewards/chosen": -0.08232647180557251, "rewards/margins": 0.49399805068969727, "rewards/rejected": -0.5763245224952698, "sft_loss": 0.8232647180557251, "step": 7160 }, { "epoch": 0.56, "grad_norm": 11.16965389251709, "learning_rate": 4.157374388526189e-06, "logits/chosen": -1.4290255308151245, "logits/rejected": -1.3512481451034546, "logps/chosen": -0.9287931323051453, "logps/rejected": -3.3987338542938232, "loss": 0.9486, "odds_ratio_loss": 0.19854740798473358, "rewards/accuracies": 1.0, "rewards/chosen": -0.09287931025028229, "rewards/margins": 0.24699406325817108, "rewards/rejected": -0.33987337350845337, "sft_loss": 0.9287931323051453, "step": 7165 }, { "epoch": 0.56, "grad_norm": 28.541109085083008, "learning_rate": 4.151305527699808e-06, "logits/chosen": -1.3716208934783936, "logits/rejected": -1.2792634963989258, "logps/chosen": -0.7543950080871582, "logps/rejected": -6.4099860191345215, "loss": 0.7629, "odds_ratio_loss": 0.08536773175001144, "rewards/accuracies": 1.0, "rewards/chosen": -0.07543949782848358, "rewards/margins": 0.5655592083930969, "rewards/rejected": -0.6409986615180969, "sft_loss": 0.7543950080871582, "step": 7170 }, { "epoch": 0.56, "grad_norm": 9.904860496520996, "learning_rate": 4.1452379540263495e-06, "logits/chosen": -1.4210548400878906, "logits/rejected": -1.423855185508728, "logps/chosen": -2.2208778858184814, "logps/rejected": -5.382429599761963, "loss": 2.279, "odds_ratio_loss": 0.5807275176048279, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2220877707004547, "rewards/margins": 0.31615522503852844, "rewards/rejected": -0.5382429957389832, "sft_loss": 2.2208778858184814, "step": 7175 }, { "epoch": 0.56, "grad_norm": 5.127261161804199, "learning_rate": 4.139171676708057e-06, "logits/chosen": -1.3772279024124146, "logits/rejected": -0.9157557487487793, "logps/chosen": -0.9644227027893066, "logps/rejected": -4.304810523986816, "loss": 1.0178, "odds_ratio_loss": 0.5333045125007629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09644227474927902, "rewards/margins": 0.33403879404067993, "rewards/rejected": -0.43048110604286194, "sft_loss": 0.9644227027893066, "step": 7180 }, { "epoch": 0.56, "grad_norm": 14.579924583435059, "learning_rate": 4.1331067049452134e-06, "logits/chosen": -1.3941256999969482, "logits/rejected": -1.0850193500518799, "logps/chosen": -0.9973868131637573, "logps/rejected": -6.488180637359619, "loss": 1.0413, "odds_ratio_loss": 0.4392642080783844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09973867982625961, "rewards/margins": 0.549079418182373, "rewards/rejected": -0.6488181352615356, "sft_loss": 0.9973868131637573, "step": 7185 }, { "epoch": 0.56, "grad_norm": 12.829072952270508, "learning_rate": 4.127043047936116e-06, "logits/chosen": -1.446582555770874, "logits/rejected": -1.1274049282073975, "logps/chosen": -1.1398777961730957, "logps/rejected": -5.959358215332031, "loss": 1.1571, "odds_ratio_loss": 0.17211410403251648, "rewards/accuracies": 1.0, "rewards/chosen": -0.11398778855800629, "rewards/margins": 0.48194804787635803, "rewards/rejected": -0.5959358215332031, "sft_loss": 1.1398777961730957, "step": 7190 }, { "epoch": 0.56, "grad_norm": 17.90237808227539, "learning_rate": 4.120980714877072e-06, "logits/chosen": -1.4279298782348633, "logits/rejected": -1.0952799320220947, "logps/chosen": -0.9773966073989868, "logps/rejected": -12.761029243469238, "loss": 0.9786, "odds_ratio_loss": 0.012265295721590519, "rewards/accuracies": 1.0, "rewards/chosen": -0.09773966670036316, "rewards/margins": 1.1783632040023804, "rewards/rejected": -1.2761030197143555, "sft_loss": 0.9773966073989868, "step": 7195 }, { "epoch": 0.56, "grad_norm": 34.895118713378906, "learning_rate": 4.114919714962376e-06, "logits/chosen": -1.4511187076568604, "logits/rejected": -1.0730235576629639, "logps/chosen": -1.06996750831604, "logps/rejected": -10.693025588989258, "loss": 1.0754, "odds_ratio_loss": 0.05451556295156479, "rewards/accuracies": 1.0, "rewards/chosen": -0.10699673742055893, "rewards/margins": 0.9623057246208191, "rewards/rejected": -1.0693025588989258, "sft_loss": 1.06996750831604, "step": 7200 }, { "epoch": 0.56, "grad_norm": 24.5500431060791, "learning_rate": 4.108860057384309e-06, "logits/chosen": -1.476646065711975, "logits/rejected": -1.196157455444336, "logps/chosen": -1.0712162256240845, "logps/rejected": -8.418623924255371, "loss": 1.0888, "odds_ratio_loss": 0.17610427737236023, "rewards/accuracies": 1.0, "rewards/chosen": -0.10712162405252457, "rewards/margins": 0.7347409129142761, "rewards/rejected": -0.8418625593185425, "sft_loss": 1.0712162256240845, "step": 7205 }, { "epoch": 0.56, "grad_norm": 11.665763854980469, "learning_rate": 4.1028017513331084e-06, "logits/chosen": -1.2933409214019775, "logits/rejected": -1.0372425317764282, "logps/chosen": -1.2550567388534546, "logps/rejected": -4.989316463470459, "loss": 1.3087, "odds_ratio_loss": 0.536535918712616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12550568580627441, "rewards/margins": 0.3734259605407715, "rewards/rejected": -0.4989316463470459, "sft_loss": 1.2550567388534546, "step": 7210 }, { "epoch": 0.56, "grad_norm": 186.90316772460938, "learning_rate": 4.096744805996964e-06, "logits/chosen": -1.380571722984314, "logits/rejected": -1.3022321462631226, "logps/chosen": -1.0640513896942139, "logps/rejected": -10.813997268676758, "loss": 1.1025, "odds_ratio_loss": 0.3841148018836975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10640515387058258, "rewards/margins": 0.9749946594238281, "rewards/rejected": -1.08139967918396, "sft_loss": 1.0640513896942139, "step": 7215 }, { "epoch": 0.56, "grad_norm": 14.99019718170166, "learning_rate": 4.090689230562003e-06, "logits/chosen": -1.0091339349746704, "logits/rejected": -1.4177144765853882, "logps/chosen": -1.391606092453003, "logps/rejected": -7.845396995544434, "loss": 1.3975, "odds_ratio_loss": 0.05848420783877373, "rewards/accuracies": 1.0, "rewards/chosen": -0.13916060328483582, "rewards/margins": 0.6453791856765747, "rewards/rejected": -0.7845398187637329, "sft_loss": 1.391606092453003, "step": 7220 }, { "epoch": 0.56, "grad_norm": 9.070110321044922, "learning_rate": 4.0846350342122746e-06, "logits/chosen": -1.4411298036575317, "logits/rejected": -1.2582954168319702, "logps/chosen": -0.8484258651733398, "logps/rejected": -8.242280960083008, "loss": 0.8629, "odds_ratio_loss": 0.14485874772071838, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08484258502721786, "rewards/margins": 0.7393856644630432, "rewards/rejected": -0.8242281675338745, "sft_loss": 0.8484258651733398, "step": 7225 }, { "epoch": 0.56, "grad_norm": 31.72794532775879, "learning_rate": 4.078582226129735e-06, "logits/chosen": -1.2538648843765259, "logits/rejected": -1.2442595958709717, "logps/chosen": -0.754325270652771, "logps/rejected": -8.127649307250977, "loss": 0.7984, "odds_ratio_loss": 0.4406220018863678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07543252408504486, "rewards/margins": 0.737332284450531, "rewards/rejected": -0.8127648234367371, "sft_loss": 0.754325270652771, "step": 7230 }, { "epoch": 0.56, "grad_norm": 8.865771293640137, "learning_rate": 4.0725308154942395e-06, "logits/chosen": -1.4666943550109863, "logits/rejected": -1.2816884517669678, "logps/chosen": -1.100095272064209, "logps/rejected": -2.0770721435546875, "loss": 1.1549, "odds_ratio_loss": 0.5476905107498169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11000952869653702, "rewards/margins": 0.09769769012928009, "rewards/rejected": -0.2077072113752365, "sft_loss": 1.100095272064209, "step": 7235 }, { "epoch": 0.56, "grad_norm": 21.18962860107422, "learning_rate": 4.066480811483518e-06, "logits/chosen": -1.3888037204742432, "logits/rejected": -0.7071020007133484, "logps/chosen": -0.9945389032363892, "logps/rejected": -5.384260177612305, "loss": 1.0017, "odds_ratio_loss": 0.07145805656909943, "rewards/accuracies": 1.0, "rewards/chosen": -0.0994538888335228, "rewards/margins": 0.4389721751213074, "rewards/rejected": -0.5384260416030884, "sft_loss": 0.9945389032363892, "step": 7240 }, { "epoch": 0.56, "grad_norm": 335.705810546875, "learning_rate": 4.060432223273169e-06, "logits/chosen": -1.4586912393569946, "logits/rejected": -1.1600415706634521, "logps/chosen": -1.4400997161865234, "logps/rejected": -2.9663405418395996, "loss": 1.4646, "odds_ratio_loss": 0.2453519105911255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14400997757911682, "rewards/margins": 0.15262408554553986, "rewards/rejected": -0.2966340482234955, "sft_loss": 1.4400997161865234, "step": 7245 }, { "epoch": 0.56, "grad_norm": 11.154559135437012, "learning_rate": 4.0543850600366444e-06, "logits/chosen": -1.4864270687103271, "logits/rejected": -1.2301304340362549, "logps/chosen": -1.3381130695343018, "logps/rejected": -2.762622833251953, "loss": 1.3802, "odds_ratio_loss": 0.42071300745010376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13381129503250122, "rewards/margins": 0.1424509882926941, "rewards/rejected": -0.2762622833251953, "sft_loss": 1.3381130695343018, "step": 7250 }, { "epoch": 0.56, "grad_norm": 38.43938446044922, "learning_rate": 4.048339330945238e-06, "logits/chosen": -1.1480789184570312, "logits/rejected": -0.9946290254592896, "logps/chosen": -0.8657535314559937, "logps/rejected": -6.532303810119629, "loss": 0.8688, "odds_ratio_loss": 0.03087017312645912, "rewards/accuracies": 1.0, "rewards/chosen": -0.08657535910606384, "rewards/margins": 0.5666549801826477, "rewards/rejected": -0.6532303690910339, "sft_loss": 0.8657535314559937, "step": 7255 }, { "epoch": 0.56, "grad_norm": 5.126986980438232, "learning_rate": 4.042295045168064e-06, "logits/chosen": -1.4064311981201172, "logits/rejected": -0.7540382146835327, "logps/chosen": -1.1285769939422607, "logps/rejected": -5.043487548828125, "loss": 1.1424, "odds_ratio_loss": 0.1382966786623001, "rewards/accuracies": 1.0, "rewards/chosen": -0.11285771429538727, "rewards/margins": 0.3914910852909088, "rewards/rejected": -0.5043487548828125, "sft_loss": 1.1285769939422607, "step": 7260 }, { "epoch": 0.57, "grad_norm": 8.36670970916748, "learning_rate": 4.036252211872047e-06, "logits/chosen": -1.245692253112793, "logits/rejected": -1.0072195529937744, "logps/chosen": -0.7486428022384644, "logps/rejected": -5.857022285461426, "loss": 0.7628, "odds_ratio_loss": 0.14205805957317352, "rewards/accuracies": 1.0, "rewards/chosen": -0.07486427575349808, "rewards/margins": 0.5108379125595093, "rewards/rejected": -0.5857021808624268, "sft_loss": 0.7486428022384644, "step": 7265 }, { "epoch": 0.57, "grad_norm": 1.4005019664764404, "learning_rate": 4.030210840221915e-06, "logits/chosen": -1.2849702835083008, "logits/rejected": -1.1393325328826904, "logps/chosen": -0.7389216423034668, "logps/rejected": -4.627842426300049, "loss": 0.7584, "odds_ratio_loss": 0.19480088353157043, "rewards/accuracies": 1.0, "rewards/chosen": -0.07389216125011444, "rewards/margins": 0.3888920843601227, "rewards/rejected": -0.4627842307090759, "sft_loss": 0.7389216423034668, "step": 7270 }, { "epoch": 0.57, "grad_norm": 14.450980186462402, "learning_rate": 4.024170939380172e-06, "logits/chosen": -1.3889250755310059, "logits/rejected": -1.1132410764694214, "logps/chosen": -1.0050677061080933, "logps/rejected": -5.148253440856934, "loss": 1.0419, "odds_ratio_loss": 0.36850666999816895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10050676763057709, "rewards/margins": 0.4143185615539551, "rewards/rejected": -0.5148253440856934, "sft_loss": 1.0050677061080933, "step": 7275 }, { "epoch": 0.57, "grad_norm": 5.981418609619141, "learning_rate": 4.018132518507095e-06, "logits/chosen": -1.227460265159607, "logits/rejected": -1.2487907409667969, "logps/chosen": -0.8153915405273438, "logps/rejected": -3.5961098670959473, "loss": 0.8424, "odds_ratio_loss": 0.26995450258255005, "rewards/accuracies": 1.0, "rewards/chosen": -0.08153915405273438, "rewards/margins": 0.2780718207359314, "rewards/rejected": -0.35961097478866577, "sft_loss": 0.8153915405273438, "step": 7280 }, { "epoch": 0.57, "grad_norm": 88.4582748413086, "learning_rate": 4.012095586760718e-06, "logits/chosen": -1.1984305381774902, "logits/rejected": -1.5410726070404053, "logps/chosen": -1.7070376873016357, "logps/rejected": -4.591531276702881, "loss": 1.745, "odds_ratio_loss": 0.3800917863845825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17070379853248596, "rewards/margins": 0.28844937682151794, "rewards/rejected": -0.4591531753540039, "sft_loss": 1.7070376873016357, "step": 7285 }, { "epoch": 0.57, "grad_norm": 22.212352752685547, "learning_rate": 4.006060153296812e-06, "logits/chosen": -1.34489107131958, "logits/rejected": -1.3902560472488403, "logps/chosen": -0.8336170315742493, "logps/rejected": -2.8464300632476807, "loss": 0.8576, "odds_ratio_loss": 0.23946678638458252, "rewards/accuracies": 1.0, "rewards/chosen": -0.08336170762777328, "rewards/margins": 0.20128127932548523, "rewards/rejected": -0.2846429944038391, "sft_loss": 0.8336170315742493, "step": 7290 }, { "epoch": 0.57, "grad_norm": 16.92748260498047, "learning_rate": 4.000026227268878e-06, "logits/chosen": -1.4852524995803833, "logits/rejected": -1.3302969932556152, "logps/chosen": -1.6487728357315063, "logps/rejected": -8.657622337341309, "loss": 1.7169, "odds_ratio_loss": 0.6810190081596375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1648772954940796, "rewards/margins": 0.7008849382400513, "rewards/rejected": -0.8657622337341309, "sft_loss": 1.6487728357315063, "step": 7295 }, { "epoch": 0.57, "grad_norm": 40.37065887451172, "learning_rate": 3.993993817828134e-06, "logits/chosen": -1.383131742477417, "logits/rejected": -1.620224952697754, "logps/chosen": -1.0177613496780396, "logps/rejected": -8.058442115783691, "loss": 1.0251, "odds_ratio_loss": 0.07358211278915405, "rewards/accuracies": 1.0, "rewards/chosen": -0.1017761379480362, "rewards/margins": 0.7040680646896362, "rewards/rejected": -0.8058441281318665, "sft_loss": 1.0177613496780396, "step": 7300 }, { "epoch": 0.57, "grad_norm": 9.612100601196289, "learning_rate": 3.9879629341234925e-06, "logits/chosen": -1.328662633895874, "logits/rejected": -0.8497580289840698, "logps/chosen": -0.9461971521377563, "logps/rejected": -2.255105495452881, "loss": 0.9915, "odds_ratio_loss": 0.4530237317085266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09461971372365952, "rewards/margins": 0.1308908313512802, "rewards/rejected": -0.22551055252552032, "sft_loss": 0.9461971521377563, "step": 7305 }, { "epoch": 0.57, "grad_norm": 8.014002799987793, "learning_rate": 3.981933585301555e-06, "logits/chosen": -1.294582486152649, "logits/rejected": -1.406262755393982, "logps/chosen": -0.8181141018867493, "logps/rejected": -8.624747276306152, "loss": 0.8296, "odds_ratio_loss": 0.11477307975292206, "rewards/accuracies": 1.0, "rewards/chosen": -0.08181141316890717, "rewards/margins": 0.7806633710861206, "rewards/rejected": -0.862474799156189, "sft_loss": 0.8181141018867493, "step": 7310 }, { "epoch": 0.57, "grad_norm": 16.23911476135254, "learning_rate": 3.975905780506591e-06, "logits/chosen": -1.2872745990753174, "logits/rejected": -1.3416489362716675, "logps/chosen": -0.9090186357498169, "logps/rejected": -5.8794755935668945, "loss": 0.9113, "odds_ratio_loss": 0.022481894120573997, "rewards/accuracies": 1.0, "rewards/chosen": -0.09090186655521393, "rewards/margins": 0.4970456659793854, "rewards/rejected": -0.5879475474357605, "sft_loss": 0.9090186357498169, "step": 7315 }, { "epoch": 0.57, "grad_norm": 8.851611137390137, "learning_rate": 3.9698795288805375e-06, "logits/chosen": -1.352212905883789, "logits/rejected": -0.933361828327179, "logps/chosen": -0.7633185386657715, "logps/rejected": -4.345231056213379, "loss": 0.7798, "odds_ratio_loss": 0.1644791066646576, "rewards/accuracies": 1.0, "rewards/chosen": -0.07633186131715775, "rewards/margins": 0.35819125175476074, "rewards/rejected": -0.43452316522598267, "sft_loss": 0.7633185386657715, "step": 7320 }, { "epoch": 0.57, "grad_norm": 7.002867698669434, "learning_rate": 3.963854839562968e-06, "logits/chosen": -1.4375827312469482, "logits/rejected": -0.7585693001747131, "logps/chosen": -0.97300785779953, "logps/rejected": -3.8505489826202393, "loss": 0.9821, "odds_ratio_loss": 0.09092002362012863, "rewards/accuracies": 1.0, "rewards/chosen": -0.09730079770088196, "rewards/margins": 0.2877541184425354, "rewards/rejected": -0.38505488634109497, "sft_loss": 0.97300785779953, "step": 7325 }, { "epoch": 0.57, "grad_norm": 5.4109978675842285, "learning_rate": 3.957831721691086e-06, "logits/chosen": -1.3101989030838013, "logits/rejected": -1.083234190940857, "logps/chosen": -0.9228937029838562, "logps/rejected": -4.4622979164123535, "loss": 0.9336, "odds_ratio_loss": 0.10741086304187775, "rewards/accuracies": 1.0, "rewards/chosen": -0.09228937327861786, "rewards/margins": 0.3539404273033142, "rewards/rejected": -0.4462297856807709, "sft_loss": 0.9228937029838562, "step": 7330 }, { "epoch": 0.57, "grad_norm": 27.10301399230957, "learning_rate": 3.95181018439972e-06, "logits/chosen": -1.1312278509140015, "logits/rejected": -1.573578119277954, "logps/chosen": -1.2438294887542725, "logps/rejected": -6.356667995452881, "loss": 1.2918, "odds_ratio_loss": 0.47944560647010803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12438295781612396, "rewards/margins": 0.5112838745117188, "rewards/rejected": -0.6356668472290039, "sft_loss": 1.2438294887542725, "step": 7335 }, { "epoch": 0.57, "grad_norm": 16.846214294433594, "learning_rate": 3.945790236821292e-06, "logits/chosen": -1.4615051746368408, "logits/rejected": -1.215092420578003, "logps/chosen": -0.9354572296142578, "logps/rejected": -4.896553993225098, "loss": 0.9407, "odds_ratio_loss": 0.05228148028254509, "rewards/accuracies": 1.0, "rewards/chosen": -0.09354571998119354, "rewards/margins": 0.3961096405982971, "rewards/rejected": -0.48965540528297424, "sft_loss": 0.9354572296142578, "step": 7340 }, { "epoch": 0.57, "grad_norm": 9.336442947387695, "learning_rate": 3.939771888085815e-06, "logits/chosen": -1.3849732875823975, "logits/rejected": -1.2363064289093018, "logps/chosen": -0.8775486946105957, "logps/rejected": -3.843815326690674, "loss": 0.9062, "odds_ratio_loss": 0.286162793636322, "rewards/accuracies": 1.0, "rewards/chosen": -0.08775487542152405, "rewards/margins": 0.2966266870498657, "rewards/rejected": -0.3843815326690674, "sft_loss": 0.8775486946105957, "step": 7345 }, { "epoch": 0.57, "grad_norm": 12.346092224121094, "learning_rate": 3.933755147320884e-06, "logits/chosen": -1.4507148265838623, "logits/rejected": -1.133741855621338, "logps/chosen": -1.1160967350006104, "logps/rejected": -5.653210639953613, "loss": 1.1234, "odds_ratio_loss": 0.07335402816534042, "rewards/accuracies": 1.0, "rewards/chosen": -0.11160967499017715, "rewards/margins": 0.4537113308906555, "rewards/rejected": -0.5653210878372192, "sft_loss": 1.1160967350006104, "step": 7350 }, { "epoch": 0.57, "grad_norm": 9.299036979675293, "learning_rate": 3.927740023651648e-06, "logits/chosen": -1.4600021839141846, "logits/rejected": -0.7748432755470276, "logps/chosen": -0.8878051042556763, "logps/rejected": -2.7120416164398193, "loss": 0.9038, "odds_ratio_loss": 0.15970009565353394, "rewards/accuracies": 1.0, "rewards/chosen": -0.08878050744533539, "rewards/margins": 0.1824236661195755, "rewards/rejected": -0.2712041735649109, "sft_loss": 0.8878051042556763, "step": 7355 }, { "epoch": 0.57, "grad_norm": 29.598922729492188, "learning_rate": 3.921726526200803e-06, "logits/chosen": -1.5297552347183228, "logits/rejected": -1.322796106338501, "logps/chosen": -0.7642945051193237, "logps/rejected": -3.3941707611083984, "loss": 0.8392, "odds_ratio_loss": 0.7493532299995422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07642945647239685, "rewards/margins": 0.2629876136779785, "rewards/rejected": -0.33941707015037537, "sft_loss": 0.7642945051193237, "step": 7360 }, { "epoch": 0.57, "grad_norm": 6.58057165145874, "learning_rate": 3.915714664088586e-06, "logits/chosen": -1.465050220489502, "logits/rejected": -0.8858221769332886, "logps/chosen": -1.0814603567123413, "logps/rejected": -8.027473449707031, "loss": 1.0989, "odds_ratio_loss": 0.17474313080310822, "rewards/accuracies": 1.0, "rewards/chosen": -0.10814603418111801, "rewards/margins": 0.6946013569831848, "rewards/rejected": -0.8027472496032715, "sft_loss": 1.0814603567123413, "step": 7365 }, { "epoch": 0.57, "grad_norm": 47.6906852722168, "learning_rate": 3.909704446432748e-06, "logits/chosen": -1.4229741096496582, "logits/rejected": -1.2047765254974365, "logps/chosen": -1.368956208229065, "logps/rejected": -4.672844409942627, "loss": 1.387, "odds_ratio_loss": 0.1804140955209732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13689562678337097, "rewards/margins": 0.33038878440856934, "rewards/rejected": -0.4672844409942627, "sft_loss": 1.368956208229065, "step": 7370 }, { "epoch": 0.57, "grad_norm": 13.896078109741211, "learning_rate": 3.903695882348545e-06, "logits/chosen": -1.5089856386184692, "logits/rejected": -1.2921054363250732, "logps/chosen": -1.1158466339111328, "logps/rejected": -2.5853195190429688, "loss": 1.1437, "odds_ratio_loss": 0.27874505519866943, "rewards/accuracies": 1.0, "rewards/chosen": -0.11158466339111328, "rewards/margins": 0.14694729447364807, "rewards/rejected": -0.25853198766708374, "sft_loss": 1.1158466339111328, "step": 7375 }, { "epoch": 0.57, "grad_norm": 6.536021709442139, "learning_rate": 3.897688980948729e-06, "logits/chosen": -1.3587908744812012, "logits/rejected": -1.3771960735321045, "logps/chosen": -1.0086991786956787, "logps/rejected": -5.609964847564697, "loss": 1.0279, "odds_ratio_loss": 0.19150960445404053, "rewards/accuracies": 1.0, "rewards/chosen": -0.10086993128061295, "rewards/margins": 0.4601265490055084, "rewards/rejected": -0.5609964728355408, "sft_loss": 1.0086991786956787, "step": 7380 }, { "epoch": 0.57, "grad_norm": 4.9608283042907715, "learning_rate": 3.891683751343528e-06, "logits/chosen": -1.3274409770965576, "logits/rejected": -0.616326630115509, "logps/chosen": -1.0203479528427124, "logps/rejected": -10.478364944458008, "loss": 1.0275, "odds_ratio_loss": 0.07164148986339569, "rewards/accuracies": 1.0, "rewards/chosen": -0.1020348072052002, "rewards/margins": 0.945801854133606, "rewards/rejected": -1.0478365421295166, "sft_loss": 1.0203479528427124, "step": 7385 }, { "epoch": 0.57, "grad_norm": 20.92188835144043, "learning_rate": 3.8856802026406355e-06, "logits/chosen": -1.3875770568847656, "logits/rejected": -1.0865342617034912, "logps/chosen": -0.9741819500923157, "logps/rejected": -10.687253952026367, "loss": 0.9878, "odds_ratio_loss": 0.1360916644334793, "rewards/accuracies": 1.0, "rewards/chosen": -0.09741818159818649, "rewards/margins": 0.971307098865509, "rewards/rejected": -1.068725347518921, "sft_loss": 0.9741819500923157, "step": 7390 }, { "epoch": 0.58, "grad_norm": 22.333799362182617, "learning_rate": 3.879678343945193e-06, "logits/chosen": -1.153346300125122, "logits/rejected": -1.1226739883422852, "logps/chosen": -3.0002946853637695, "logps/rejected": -3.063096284866333, "loss": 3.1454, "odds_ratio_loss": 1.450598955154419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3000294864177704, "rewards/margins": 0.006280136294662952, "rewards/rejected": -0.30630961060523987, "sft_loss": 3.0002946853637695, "step": 7395 }, { "epoch": 0.58, "grad_norm": 7.445085048675537, "learning_rate": 3.873678184359787e-06, "logits/chosen": -1.3794811964035034, "logits/rejected": -1.1986474990844727, "logps/chosen": -0.830730140209198, "logps/rejected": -10.583611488342285, "loss": 0.8329, "odds_ratio_loss": 0.021612081676721573, "rewards/accuracies": 1.0, "rewards/chosen": -0.08307301253080368, "rewards/margins": 0.9752882122993469, "rewards/rejected": -1.058361291885376, "sft_loss": 0.830730140209198, "step": 7400 }, { "epoch": 0.58, "grad_norm": 17.45865249633789, "learning_rate": 3.867679732984417e-06, "logits/chosen": -1.4199925661087036, "logits/rejected": -1.3598154783248901, "logps/chosen": -1.0561368465423584, "logps/rejected": -4.694639682769775, "loss": 1.1353, "odds_ratio_loss": 0.7915878295898438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10561369359493256, "rewards/margins": 0.36385026574134827, "rewards/rejected": -0.469463974237442, "sft_loss": 1.0561368465423584, "step": 7405 }, { "epoch": 0.58, "grad_norm": 7.087828159332275, "learning_rate": 3.861682998916495e-06, "logits/chosen": -1.4668176174163818, "logits/rejected": -1.0923296213150024, "logps/chosen": -1.2441160678863525, "logps/rejected": -5.291169166564941, "loss": 1.2933, "odds_ratio_loss": 0.4921974539756775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12441160529851913, "rewards/margins": 0.4047052264213562, "rewards/rejected": -0.5291168689727783, "sft_loss": 1.2441160678863525, "step": 7410 }, { "epoch": 0.58, "grad_norm": 20.64565658569336, "learning_rate": 3.855687991250833e-06, "logits/chosen": -1.3258016109466553, "logits/rejected": -0.9141836166381836, "logps/chosen": -1.0131573677062988, "logps/rejected": -4.818573951721191, "loss": 1.0382, "odds_ratio_loss": 0.250355064868927, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10131573677062988, "rewards/margins": 0.38054168224334717, "rewards/rejected": -0.48185744881629944, "sft_loss": 1.0131573677062988, "step": 7415 }, { "epoch": 0.58, "grad_norm": 31.21662712097168, "learning_rate": 3.84969471907962e-06, "logits/chosen": -1.4796231985092163, "logits/rejected": -1.3987324237823486, "logps/chosen": -1.1544537544250488, "logps/rejected": -7.364099025726318, "loss": 1.2128, "odds_ratio_loss": 0.5836321115493774, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11544537544250488, "rewards/margins": 0.6209646463394165, "rewards/rejected": -0.7364099621772766, "sft_loss": 1.1544537544250488, "step": 7420 }, { "epoch": 0.58, "grad_norm": 12.404463768005371, "learning_rate": 3.843703191492412e-06, "logits/chosen": -1.3646366596221924, "logits/rejected": -0.7645974159240723, "logps/chosen": -0.9818285703659058, "logps/rejected": -2.8497519493103027, "loss": 1.0267, "odds_ratio_loss": 0.44823575019836426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09818285703659058, "rewards/margins": 0.18679234385490417, "rewards/rejected": -0.28497520089149475, "sft_loss": 0.9818285703659058, "step": 7425 }, { "epoch": 0.58, "grad_norm": 10.508298873901367, "learning_rate": 3.837713417576125e-06, "logits/chosen": -1.3647921085357666, "logits/rejected": -0.8251982927322388, "logps/chosen": -1.1779931783676147, "logps/rejected": -9.686933517456055, "loss": 1.1988, "odds_ratio_loss": 0.20841288566589355, "rewards/accuracies": 1.0, "rewards/chosen": -0.11779932677745819, "rewards/margins": 0.8508940935134888, "rewards/rejected": -0.9686933755874634, "sft_loss": 1.1779931783676147, "step": 7430 }, { "epoch": 0.58, "grad_norm": 105.03116607666016, "learning_rate": 3.831725406415011e-06, "logits/chosen": -1.3835184574127197, "logits/rejected": -1.4062840938568115, "logps/chosen": -1.071958303451538, "logps/rejected": -2.3874545097351074, "loss": 1.1013, "odds_ratio_loss": 0.29344767332077026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10719583183526993, "rewards/margins": 0.1315496265888214, "rewards/rejected": -0.23874545097351074, "sft_loss": 1.071958303451538, "step": 7435 }, { "epoch": 0.58, "grad_norm": 13.58053207397461, "learning_rate": 3.825739167090648e-06, "logits/chosen": -1.4159271717071533, "logits/rejected": -0.8211624026298523, "logps/chosen": -1.19142484664917, "logps/rejected": -3.6063523292541504, "loss": 1.2506, "odds_ratio_loss": 0.5918790102005005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11914249509572983, "rewards/margins": 0.24149274826049805, "rewards/rejected": -0.36063528060913086, "sft_loss": 1.19142484664917, "step": 7440 }, { "epoch": 0.58, "grad_norm": 28.174692153930664, "learning_rate": 3.819754708681925e-06, "logits/chosen": -1.5469629764556885, "logits/rejected": -1.4011662006378174, "logps/chosen": -1.069461703300476, "logps/rejected": -7.015917778015137, "loss": 1.0704, "odds_ratio_loss": 0.009362577460706234, "rewards/accuracies": 1.0, "rewards/chosen": -0.1069461852312088, "rewards/margins": 0.594645619392395, "rewards/rejected": -0.7015917897224426, "sft_loss": 1.069461703300476, "step": 7445 }, { "epoch": 0.58, "grad_norm": 4.565928936004639, "learning_rate": 3.813772040265039e-06, "logits/chosen": -1.329349398612976, "logits/rejected": -1.0876749753952026, "logps/chosen": -0.9946697354316711, "logps/rejected": -4.11696720123291, "loss": 1.0389, "odds_ratio_loss": 0.4418897032737732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.099466972053051, "rewards/margins": 0.31222978234291077, "rewards/rejected": -0.41169673204421997, "sft_loss": 0.9946697354316711, "step": 7450 }, { "epoch": 0.58, "grad_norm": 16.70859146118164, "learning_rate": 3.8077911709134625e-06, "logits/chosen": -1.4293928146362305, "logits/rejected": -1.2371962070465088, "logps/chosen": -0.7824566960334778, "logps/rejected": -3.111362934112549, "loss": 0.7949, "odds_ratio_loss": 0.12397966533899307, "rewards/accuracies": 1.0, "rewards/chosen": -0.07824566960334778, "rewards/margins": 0.23289060592651367, "rewards/rejected": -0.31113627552986145, "sft_loss": 0.7824566960334778, "step": 7455 }, { "epoch": 0.58, "grad_norm": 5.1738200187683105, "learning_rate": 3.8018121096979432e-06, "logits/chosen": -1.3276052474975586, "logits/rejected": -1.1583489179611206, "logps/chosen": -0.8614734411239624, "logps/rejected": -13.287282943725586, "loss": 0.8701, "odds_ratio_loss": 0.08607557415962219, "rewards/accuracies": 1.0, "rewards/chosen": -0.08614735305309296, "rewards/margins": 1.2425811290740967, "rewards/rejected": -1.328728437423706, "sft_loss": 0.8614734411239624, "step": 7460 }, { "epoch": 0.58, "grad_norm": 8.078058242797852, "learning_rate": 3.7958348656864883e-06, "logits/chosen": -1.433990240097046, "logits/rejected": -0.9150580167770386, "logps/chosen": -0.9640260934829712, "logps/rejected": -3.0823981761932373, "loss": 0.9896, "odds_ratio_loss": 0.2554711699485779, "rewards/accuracies": 1.0, "rewards/chosen": -0.0964026004076004, "rewards/margins": 0.21183724701404572, "rewards/rejected": -0.3082398474216461, "sft_loss": 0.9640260934829712, "step": 7465 }, { "epoch": 0.58, "grad_norm": 93.4781494140625, "learning_rate": 3.7898594479443467e-06, "logits/chosen": -1.2909369468688965, "logits/rejected": -1.0158801078796387, "logps/chosen": -1.3513132333755493, "logps/rejected": -6.237979888916016, "loss": 1.3717, "odds_ratio_loss": 0.2035665214061737, "rewards/accuracies": 1.0, "rewards/chosen": -0.1351313292980194, "rewards/margins": 0.48866668343544006, "rewards/rejected": -0.6237980127334595, "sft_loss": 1.3513132333755493, "step": 7470 }, { "epoch": 0.58, "grad_norm": 7.653111457824707, "learning_rate": 3.7838858655339956e-06, "logits/chosen": -1.4438180923461914, "logits/rejected": -1.0234767198562622, "logps/chosen": -0.9316795468330383, "logps/rejected": -6.642031669616699, "loss": 0.9507, "odds_ratio_loss": 0.19040152430534363, "rewards/accuracies": 1.0, "rewards/chosen": -0.09316794574260712, "rewards/margins": 0.5710352659225464, "rewards/rejected": -0.6642031669616699, "sft_loss": 0.9316795468330383, "step": 7475 }, { "epoch": 0.58, "grad_norm": 24.714967727661133, "learning_rate": 3.777914127515135e-06, "logits/chosen": -1.0196001529693604, "logits/rejected": -1.8837283849716187, "logps/chosen": -1.2547192573547363, "logps/rejected": -16.984920501708984, "loss": 1.2547, "odds_ratio_loss": 0.00013136306370142847, "rewards/accuracies": 1.0, "rewards/chosen": -0.12547191977500916, "rewards/margins": 1.573020339012146, "rewards/rejected": -1.6984922885894775, "sft_loss": 1.2547192573547363, "step": 7480 }, { "epoch": 0.58, "grad_norm": 18.038217544555664, "learning_rate": 3.7719442429446624e-06, "logits/chosen": -1.3554340600967407, "logits/rejected": -1.111193299293518, "logps/chosen": -1.7390228509902954, "logps/rejected": -7.009710788726807, "loss": 1.7415, "odds_ratio_loss": 0.024865852668881416, "rewards/accuracies": 1.0, "rewards/chosen": -0.17390227317810059, "rewards/margins": 0.5270687937736511, "rewards/rejected": -0.7009710669517517, "sft_loss": 1.7390228509902954, "step": 7485 }, { "epoch": 0.58, "grad_norm": 6.1074676513671875, "learning_rate": 3.7659762208766653e-06, "logits/chosen": -1.3575494289398193, "logits/rejected": -0.9257568120956421, "logps/chosen": -1.1369520425796509, "logps/rejected": -8.370031356811523, "loss": 1.1644, "odds_ratio_loss": 0.27419567108154297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11369520425796509, "rewards/margins": 0.7233079075813293, "rewards/rejected": -0.8370031118392944, "sft_loss": 1.1369520425796509, "step": 7490 }, { "epoch": 0.58, "grad_norm": 17.318450927734375, "learning_rate": 3.760010070362406e-06, "logits/chosen": -1.277895212173462, "logits/rejected": -1.1367418766021729, "logps/chosen": -0.588483989238739, "logps/rejected": -6.119777202606201, "loss": 0.5932, "odds_ratio_loss": 0.04762459173798561, "rewards/accuracies": 1.0, "rewards/chosen": -0.05884840339422226, "rewards/margins": 0.5531293749809265, "rewards/rejected": -0.6119778156280518, "sft_loss": 0.588483989238739, "step": 7495 }, { "epoch": 0.58, "grad_norm": 9.198938369750977, "learning_rate": 3.754045800450311e-06, "logits/chosen": -1.3233236074447632, "logits/rejected": -1.4493837356567383, "logps/chosen": -1.0046014785766602, "logps/rejected": -7.714364528656006, "loss": 1.0113, "odds_ratio_loss": 0.06727956980466843, "rewards/accuracies": 1.0, "rewards/chosen": -0.10046014934778214, "rewards/margins": 0.6709764003753662, "rewards/rejected": -0.7714365124702454, "sft_loss": 1.0046014785766602, "step": 7500 }, { "epoch": 0.58, "grad_norm": 9.631113052368164, "learning_rate": 3.7480834201859527e-06, "logits/chosen": -1.3423171043395996, "logits/rejected": -0.9379371404647827, "logps/chosen": -0.8321939706802368, "logps/rejected": -4.562464237213135, "loss": 0.8415, "odds_ratio_loss": 0.09319966286420822, "rewards/accuracies": 1.0, "rewards/chosen": -0.08321939408779144, "rewards/margins": 0.3730270266532898, "rewards/rejected": -0.45624643564224243, "sft_loss": 0.8321939706802368, "step": 7505 }, { "epoch": 0.58, "grad_norm": 7.952220916748047, "learning_rate": 3.7421229386120352e-06, "logits/chosen": -1.4574992656707764, "logits/rejected": -1.5705512762069702, "logps/chosen": -1.4259287118911743, "logps/rejected": -13.200403213500977, "loss": 1.4282, "odds_ratio_loss": 0.022520998492836952, "rewards/accuracies": 1.0, "rewards/chosen": -0.1425928771495819, "rewards/margins": 1.1774473190307617, "rewards/rejected": -1.3200403451919556, "sft_loss": 1.4259287118911743, "step": 7510 }, { "epoch": 0.58, "grad_norm": 4.627284049987793, "learning_rate": 3.7361643647683887e-06, "logits/chosen": -1.4829736948013306, "logits/rejected": -1.054227590560913, "logps/chosen": -0.9570187330245972, "logps/rejected": -5.006585121154785, "loss": 0.9731, "odds_ratio_loss": 0.16131040453910828, "rewards/accuracies": 1.0, "rewards/chosen": -0.09570188075304031, "rewards/margins": 0.4049566388130188, "rewards/rejected": -0.5006585121154785, "sft_loss": 0.9570187330245972, "step": 7515 }, { "epoch": 0.58, "grad_norm": 13.184547424316406, "learning_rate": 3.7302077076919463e-06, "logits/chosen": -1.3098251819610596, "logits/rejected": -1.133541226387024, "logps/chosen": -1.29055917263031, "logps/rejected": -8.865058898925781, "loss": 1.3587, "odds_ratio_loss": 0.6810811758041382, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.129055917263031, "rewards/margins": 0.7574499845504761, "rewards/rejected": -0.8865059018135071, "sft_loss": 1.29055917263031, "step": 7520 }, { "epoch": 0.59, "grad_norm": 20.698753356933594, "learning_rate": 3.7242529764167336e-06, "logits/chosen": -1.2861082553863525, "logits/rejected": -1.3361252546310425, "logps/chosen": -1.0132089853286743, "logps/rejected": -2.560932159423828, "loss": 1.0499, "odds_ratio_loss": 0.36650410294532776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10132090002298355, "rewards/margins": 0.1547723114490509, "rewards/rejected": -0.25609320402145386, "sft_loss": 1.0132089853286743, "step": 7525 }, { "epoch": 0.59, "grad_norm": 20.249622344970703, "learning_rate": 3.7183001799738583e-06, "logits/chosen": -1.3709007501602173, "logits/rejected": -1.1397546529769897, "logps/chosen": -0.8171932101249695, "logps/rejected": -14.227322578430176, "loss": 0.8206, "odds_ratio_loss": 0.03364390507340431, "rewards/accuracies": 1.0, "rewards/chosen": -0.08171931654214859, "rewards/margins": 1.341012954711914, "rewards/rejected": -1.4227323532104492, "sft_loss": 0.8171932101249695, "step": 7530 }, { "epoch": 0.59, "grad_norm": 11.157368659973145, "learning_rate": 3.7123493273914913e-06, "logits/chosen": -1.4680503606796265, "logits/rejected": -1.0822898149490356, "logps/chosen": -0.9170717000961304, "logps/rejected": -6.117377758026123, "loss": 0.9235, "odds_ratio_loss": 0.0642508715391159, "rewards/accuracies": 1.0, "rewards/chosen": -0.09170717746019363, "rewards/margins": 0.520030677318573, "rewards/rejected": -0.611737847328186, "sft_loss": 0.9170717000961304, "step": 7535 }, { "epoch": 0.59, "grad_norm": 21.787038803100586, "learning_rate": 3.706400427694853e-06, "logits/chosen": -1.509522795677185, "logits/rejected": -1.2455151081085205, "logps/chosen": -0.729500949382782, "logps/rejected": -4.698983669281006, "loss": 0.7769, "odds_ratio_loss": 0.47362834215164185, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0729500949382782, "rewards/margins": 0.39694833755493164, "rewards/rejected": -0.4698984622955322, "sft_loss": 0.729500949382782, "step": 7540 }, { "epoch": 0.59, "grad_norm": 7.119065284729004, "learning_rate": 3.70045348990621e-06, "logits/chosen": -1.234168291091919, "logits/rejected": -1.2230815887451172, "logps/chosen": -0.9674831628799438, "logps/rejected": -6.17057991027832, "loss": 0.9699, "odds_ratio_loss": 0.023831719532608986, "rewards/accuracies": 1.0, "rewards/chosen": -0.09674831479787827, "rewards/margins": 0.5203096866607666, "rewards/rejected": -0.6170579791069031, "sft_loss": 0.9674831628799438, "step": 7545 }, { "epoch": 0.59, "grad_norm": 5.420483112335205, "learning_rate": 3.694508523044847e-06, "logits/chosen": -1.41762375831604, "logits/rejected": -1.1230498552322388, "logps/chosen": -0.95631343126297, "logps/rejected": -2.7387397289276123, "loss": 0.9797, "odds_ratio_loss": 0.23361878097057343, "rewards/accuracies": 1.0, "rewards/chosen": -0.09563133865594864, "rewards/margins": 0.17824262380599976, "rewards/rejected": -0.2738739848136902, "sft_loss": 0.95631343126297, "step": 7550 }, { "epoch": 0.59, "grad_norm": 16.290616989135742, "learning_rate": 3.68856553612706e-06, "logits/chosen": -1.213830590248108, "logits/rejected": -0.8929702043533325, "logps/chosen": -0.9933937191963196, "logps/rejected": -7.666130065917969, "loss": 1.0055, "odds_ratio_loss": 0.12060995399951935, "rewards/accuracies": 1.0, "rewards/chosen": -0.09933938086032867, "rewards/margins": 0.6672737002372742, "rewards/rejected": -0.7666130661964417, "sft_loss": 0.9933937191963196, "step": 7555 }, { "epoch": 0.59, "grad_norm": 14.791046142578125, "learning_rate": 3.682624538166143e-06, "logits/chosen": -1.2562824487686157, "logits/rejected": -1.0913159847259521, "logps/chosen": -1.080479383468628, "logps/rejected": -4.734437465667725, "loss": 1.0864, "odds_ratio_loss": 0.059390682727098465, "rewards/accuracies": 1.0, "rewards/chosen": -0.10804794728755951, "rewards/margins": 0.36539584398269653, "rewards/rejected": -0.47344380617141724, "sft_loss": 1.080479383468628, "step": 7560 }, { "epoch": 0.59, "grad_norm": 367.5003356933594, "learning_rate": 3.6766855381723756e-06, "logits/chosen": -1.426210641860962, "logits/rejected": -1.2520123720169067, "logps/chosen": -1.1438326835632324, "logps/rejected": -3.119784116744995, "loss": 1.1587, "odds_ratio_loss": 0.1487005054950714, "rewards/accuracies": 1.0, "rewards/chosen": -0.1143832802772522, "rewards/margins": 0.19759513437747955, "rewards/rejected": -0.31197839975357056, "sft_loss": 1.1438326835632324, "step": 7565 }, { "epoch": 0.59, "grad_norm": 16.09073257446289, "learning_rate": 3.6707485451530035e-06, "logits/chosen": -1.3527429103851318, "logits/rejected": -0.7134038209915161, "logps/chosen": -0.8533148765563965, "logps/rejected": -2.4193506240844727, "loss": 0.87, "odds_ratio_loss": 0.16703931987285614, "rewards/accuracies": 1.0, "rewards/chosen": -0.085331492125988, "rewards/margins": 0.1566035896539688, "rewards/rejected": -0.24193505942821503, "sft_loss": 0.8533148765563965, "step": 7570 }, { "epoch": 0.59, "grad_norm": 5.578355312347412, "learning_rate": 3.66481356811223e-06, "logits/chosen": -1.459567666053772, "logits/rejected": -0.7954663038253784, "logps/chosen": -0.8611226081848145, "logps/rejected": -5.201925754547119, "loss": 0.8792, "odds_ratio_loss": 0.18051791191101074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08611226081848145, "rewards/margins": 0.4340802729129791, "rewards/rejected": -0.520192563533783, "sft_loss": 0.8611226081848145, "step": 7575 }, { "epoch": 0.59, "grad_norm": 21.39191436767578, "learning_rate": 3.658880616051204e-06, "logits/chosen": -1.3536994457244873, "logits/rejected": -1.4201124906539917, "logps/chosen": -1.0324753522872925, "logps/rejected": -2.731003522872925, "loss": 1.0648, "odds_ratio_loss": 0.32371786236763, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10324753820896149, "rewards/margins": 0.16985280811786652, "rewards/rejected": -0.273100346326828, "sft_loss": 1.0324753522872925, "step": 7580 }, { "epoch": 0.59, "grad_norm": 4.845981121063232, "learning_rate": 3.652949697967998e-06, "logits/chosen": -1.3896377086639404, "logits/rejected": -0.6858119368553162, "logps/chosen": -0.9702582359313965, "logps/rejected": -11.030991554260254, "loss": 0.993, "odds_ratio_loss": 0.22788207232952118, "rewards/accuracies": 1.0, "rewards/chosen": -0.09702581912279129, "rewards/margins": 1.0060734748840332, "rewards/rejected": -1.1030992269515991, "sft_loss": 0.9702582359313965, "step": 7585 }, { "epoch": 0.59, "grad_norm": 8.864776611328125, "learning_rate": 3.6470208228576017e-06, "logits/chosen": -1.3436756134033203, "logits/rejected": -1.6786384582519531, "logps/chosen": -0.9208037257194519, "logps/rejected": -13.882339477539062, "loss": 0.9308, "odds_ratio_loss": 0.09978379309177399, "rewards/accuracies": 1.0, "rewards/chosen": -0.09208037704229355, "rewards/margins": 1.2961535453796387, "rewards/rejected": -1.3882339000701904, "sft_loss": 0.9208037257194519, "step": 7590 }, { "epoch": 0.59, "grad_norm": 9.687556266784668, "learning_rate": 3.6410939997119097e-06, "logits/chosen": -1.390998125076294, "logits/rejected": -1.4484277963638306, "logps/chosen": -1.0292510986328125, "logps/rejected": -4.382474422454834, "loss": 1.0741, "odds_ratio_loss": 0.44898781180381775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10292510688304901, "rewards/margins": 0.3353223204612732, "rewards/rejected": -0.4382474422454834, "sft_loss": 1.0292510986328125, "step": 7595 }, { "epoch": 0.59, "grad_norm": 39.966758728027344, "learning_rate": 3.6351692375197018e-06, "logits/chosen": -1.3556313514709473, "logits/rejected": -1.0924326181411743, "logps/chosen": -1.3269069194793701, "logps/rejected": -8.789861679077148, "loss": 1.3478, "odds_ratio_loss": 0.2092055082321167, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1326906979084015, "rewards/margins": 0.7462955713272095, "rewards/rejected": -0.8789861798286438, "sft_loss": 1.3269069194793701, "step": 7600 }, { "epoch": 0.59, "grad_norm": 14.269389152526855, "learning_rate": 3.629246545266629e-06, "logits/chosen": -1.324946641921997, "logits/rejected": -1.335137128829956, "logps/chosen": -1.1242374181747437, "logps/rejected": -7.613337516784668, "loss": 1.1367, "odds_ratio_loss": 0.1243775486946106, "rewards/accuracies": 1.0, "rewards/chosen": -0.11242374032735825, "rewards/margins": 0.6489100456237793, "rewards/rejected": -0.7613337635993958, "sft_loss": 1.1242374181747437, "step": 7605 }, { "epoch": 0.59, "grad_norm": 7.829433917999268, "learning_rate": 3.6233259319352098e-06, "logits/chosen": -1.3544420003890991, "logits/rejected": -1.10740327835083, "logps/chosen": -1.101051926612854, "logps/rejected": -2.454871416091919, "loss": 1.1316, "odds_ratio_loss": 0.30578336119651794, "rewards/accuracies": 1.0, "rewards/chosen": -0.11010519415140152, "rewards/margins": 0.13538196682929993, "rewards/rejected": -0.24548716843128204, "sft_loss": 1.101051926612854, "step": 7610 }, { "epoch": 0.59, "grad_norm": 108.93167114257812, "learning_rate": 3.6174074065048035e-06, "logits/chosen": -1.2685353755950928, "logits/rejected": -1.3089756965637207, "logps/chosen": -0.8494631052017212, "logps/rejected": -5.920241832733154, "loss": 0.8577, "odds_ratio_loss": 0.08219350874423981, "rewards/accuracies": 1.0, "rewards/chosen": -0.08494630455970764, "rewards/margins": 0.5070779323577881, "rewards/rejected": -0.5920242071151733, "sft_loss": 0.8494631052017212, "step": 7615 }, { "epoch": 0.59, "grad_norm": 24.65989875793457, "learning_rate": 3.611490977951606e-06, "logits/chosen": -1.4084243774414062, "logits/rejected": -1.1269807815551758, "logps/chosen": -1.041349172592163, "logps/rejected": -4.547548294067383, "loss": 1.0507, "odds_ratio_loss": 0.09329565614461899, "rewards/accuracies": 1.0, "rewards/chosen": -0.10413491725921631, "rewards/margins": 0.35061994194984436, "rewards/rejected": -0.45475488901138306, "sft_loss": 1.041349172592163, "step": 7620 }, { "epoch": 0.59, "grad_norm": 6.330051898956299, "learning_rate": 3.6055766552486304e-06, "logits/chosen": -1.4215527772903442, "logits/rejected": -1.018003225326538, "logps/chosen": -1.4540941715240479, "logps/rejected": -3.3146910667419434, "loss": 1.4778, "odds_ratio_loss": 0.23683035373687744, "rewards/accuracies": 1.0, "rewards/chosen": -0.14540942013263702, "rewards/margins": 0.18605968356132507, "rewards/rejected": -0.3314690589904785, "sft_loss": 1.4540941715240479, "step": 7625 }, { "epoch": 0.59, "grad_norm": 10.327467918395996, "learning_rate": 3.5996644473657026e-06, "logits/chosen": -1.3508819341659546, "logits/rejected": -1.580324411392212, "logps/chosen": -0.6978863477706909, "logps/rejected": -7.110561370849609, "loss": 0.7048, "odds_ratio_loss": 0.06865421682596207, "rewards/accuracies": 1.0, "rewards/chosen": -0.06978863477706909, "rewards/margins": 0.6412675380706787, "rewards/rejected": -0.7110562324523926, "sft_loss": 0.6978863477706909, "step": 7630 }, { "epoch": 0.59, "grad_norm": 29.400272369384766, "learning_rate": 3.593754363269434e-06, "logits/chosen": -1.2756131887435913, "logits/rejected": -1.0476934909820557, "logps/chosen": -0.9183648824691772, "logps/rejected": -1.5128974914550781, "loss": 0.9769, "odds_ratio_loss": 0.5853749513626099, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09183648973703384, "rewards/margins": 0.05945326015353203, "rewards/rejected": -0.15128974616527557, "sft_loss": 0.9183648824691772, "step": 7635 }, { "epoch": 0.59, "grad_norm": 17.112735748291016, "learning_rate": 3.587846411923215e-06, "logits/chosen": -1.3515738248825073, "logits/rejected": -1.3550692796707153, "logps/chosen": -1.1381123065948486, "logps/rejected": -5.934088230133057, "loss": 1.1593, "odds_ratio_loss": 0.21174244582653046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11381123960018158, "rewards/margins": 0.47959762811660767, "rewards/rejected": -0.5934088230133057, "sft_loss": 1.1381123065948486, "step": 7640 }, { "epoch": 0.59, "grad_norm": 15.542469024658203, "learning_rate": 3.581940602287208e-06, "logits/chosen": -1.2181787490844727, "logits/rejected": -1.3325717449188232, "logps/chosen": -0.8493123054504395, "logps/rejected": -5.115633487701416, "loss": 0.8657, "odds_ratio_loss": 0.16385197639465332, "rewards/accuracies": 1.0, "rewards/chosen": -0.08493123948574066, "rewards/margins": 0.4266320765018463, "rewards/rejected": -0.5115633606910706, "sft_loss": 0.8493123054504395, "step": 7645 }, { "epoch": 0.6, "grad_norm": 6.590150356292725, "learning_rate": 3.576036943318322e-06, "logits/chosen": -1.3251354694366455, "logits/rejected": -0.8361200094223022, "logps/chosen": -1.0244470834732056, "logps/rejected": -3.8385987281799316, "loss": 1.0434, "odds_ratio_loss": 0.1891406774520874, "rewards/accuracies": 1.0, "rewards/chosen": -0.10244472324848175, "rewards/margins": 0.2814151644706726, "rewards/rejected": -0.38385987281799316, "sft_loss": 1.0244470834732056, "step": 7650 }, { "epoch": 0.6, "grad_norm": 21.922080993652344, "learning_rate": 3.570135443970203e-06, "logits/chosen": -1.3503806591033936, "logits/rejected": -1.3192524909973145, "logps/chosen": -0.9770252108573914, "logps/rejected": -4.227506160736084, "loss": 0.9929, "odds_ratio_loss": 0.15905950963497162, "rewards/accuracies": 1.0, "rewards/chosen": -0.0977025181055069, "rewards/margins": 0.3250480592250824, "rewards/rejected": -0.4227506220340729, "sft_loss": 0.9770252108573914, "step": 7655 }, { "epoch": 0.6, "grad_norm": 11.335700988769531, "learning_rate": 3.5642361131932274e-06, "logits/chosen": -1.413520097732544, "logits/rejected": -1.1838710308074951, "logps/chosen": -1.6299006938934326, "logps/rejected": -4.449075222015381, "loss": 1.7312, "odds_ratio_loss": 1.0133378505706787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16299009323120117, "rewards/margins": 0.2819174826145172, "rewards/rejected": -0.4449075162410736, "sft_loss": 1.6299006938934326, "step": 7660 }, { "epoch": 0.6, "grad_norm": 33.036643981933594, "learning_rate": 3.5583389599344775e-06, "logits/chosen": -1.4875004291534424, "logits/rejected": -1.414227843284607, "logps/chosen": -0.8248567581176758, "logps/rejected": -17.03693199157715, "loss": 0.8278, "odds_ratio_loss": 0.029040660709142685, "rewards/accuracies": 1.0, "rewards/chosen": -0.08248567581176758, "rewards/margins": 1.6212074756622314, "rewards/rejected": -1.703693151473999, "sft_loss": 0.8248567581176758, "step": 7665 }, { "epoch": 0.6, "grad_norm": 21.289030075073242, "learning_rate": 3.552443993137735e-06, "logits/chosen": -1.4467661380767822, "logits/rejected": -0.9300572276115417, "logps/chosen": -1.0593516826629639, "logps/rejected": -2.2387285232543945, "loss": 1.0911, "odds_ratio_loss": 0.31711629033088684, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10593517869710922, "rewards/margins": 0.11793768405914307, "rewards/rejected": -0.2238728553056717, "sft_loss": 1.0593516826629639, "step": 7670 }, { "epoch": 0.6, "grad_norm": 4.732230186462402, "learning_rate": 3.5465512217434663e-06, "logits/chosen": -1.2538764476776123, "logits/rejected": -1.163688063621521, "logps/chosen": -1.445475697517395, "logps/rejected": -8.06837272644043, "loss": 1.4526, "odds_ratio_loss": 0.07167679071426392, "rewards/accuracies": 1.0, "rewards/chosen": -0.14454758167266846, "rewards/margins": 0.6622897386550903, "rewards/rejected": -0.8068373799324036, "sft_loss": 1.445475697517395, "step": 7675 }, { "epoch": 0.6, "grad_norm": 5.590085506439209, "learning_rate": 3.5406606546888072e-06, "logits/chosen": -1.4303420782089233, "logits/rejected": -1.1193432807922363, "logps/chosen": -1.1342713832855225, "logps/rejected": -5.6936726570129395, "loss": 1.1835, "odds_ratio_loss": 0.49192532896995544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11342713981866837, "rewards/margins": 0.4559400975704193, "rewards/rejected": -0.5693672895431519, "sft_loss": 1.1342713832855225, "step": 7680 }, { "epoch": 0.6, "grad_norm": 5.165656089782715, "learning_rate": 3.5347723009075496e-06, "logits/chosen": -1.3710882663726807, "logits/rejected": -1.0793540477752686, "logps/chosen": -0.9359593391418457, "logps/rejected": -7.214070796966553, "loss": 0.9424, "odds_ratio_loss": 0.06422661244869232, "rewards/accuracies": 1.0, "rewards/chosen": -0.09359593689441681, "rewards/margins": 0.627811074256897, "rewards/rejected": -0.7214070558547974, "sft_loss": 0.9359593391418457, "step": 7685 }, { "epoch": 0.6, "grad_norm": 12.285545349121094, "learning_rate": 3.52888616933013e-06, "logits/chosen": -1.3175373077392578, "logits/rejected": -1.4724985361099243, "logps/chosen": -1.6350700855255127, "logps/rejected": -7.500203609466553, "loss": 1.6442, "odds_ratio_loss": 0.09161634743213654, "rewards/accuracies": 1.0, "rewards/chosen": -0.16350701451301575, "rewards/margins": 0.586513340473175, "rewards/rejected": -0.7500203251838684, "sft_loss": 1.6350700855255127, "step": 7690 }, { "epoch": 0.6, "grad_norm": 6.217746257781982, "learning_rate": 3.523002268883615e-06, "logits/chosen": -1.2274795770645142, "logits/rejected": -1.316551923751831, "logps/chosen": -0.7977105975151062, "logps/rejected": -9.286924362182617, "loss": 0.816, "odds_ratio_loss": 0.1833195835351944, "rewards/accuracies": 1.0, "rewards/chosen": -0.07977106422185898, "rewards/margins": 0.8489214181900024, "rewards/rejected": -0.9286924600601196, "sft_loss": 0.7977105975151062, "step": 7695 }, { "epoch": 0.6, "grad_norm": 9.02701187133789, "learning_rate": 3.5171206084916865e-06, "logits/chosen": -1.4551115036010742, "logits/rejected": -0.822216808795929, "logps/chosen": -1.205783486366272, "logps/rejected": -16.62922477722168, "loss": 1.206, "odds_ratio_loss": 0.002027118345722556, "rewards/accuracies": 1.0, "rewards/chosen": -0.1205783486366272, "rewards/margins": 1.542344331741333, "rewards/rejected": -1.6629226207733154, "sft_loss": 1.205783486366272, "step": 7700 }, { "epoch": 0.6, "grad_norm": 25.063343048095703, "learning_rate": 3.5112411970746263e-06, "logits/chosen": -1.3958866596221924, "logits/rejected": -0.9426444172859192, "logps/chosen": -0.8656539916992188, "logps/rejected": -8.09262466430664, "loss": 0.8748, "odds_ratio_loss": 0.0913071259856224, "rewards/accuracies": 1.0, "rewards/chosen": -0.08656540513038635, "rewards/margins": 0.7226970791816711, "rewards/rejected": -0.8092624545097351, "sft_loss": 0.8656539916992188, "step": 7705 }, { "epoch": 0.6, "grad_norm": 10.038653373718262, "learning_rate": 3.5053640435493136e-06, "logits/chosen": -1.1906545162200928, "logits/rejected": -1.2683629989624023, "logps/chosen": -0.6818944215774536, "logps/rejected": -5.234747886657715, "loss": 0.7035, "odds_ratio_loss": 0.21561065316200256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06818944960832596, "rewards/margins": 0.45528537034988403, "rewards/rejected": -0.5234748125076294, "sft_loss": 0.6818944215774536, "step": 7710 }, { "epoch": 0.6, "grad_norm": 6.408538818359375, "learning_rate": 3.4994891568291955e-06, "logits/chosen": -1.3831173181533813, "logits/rejected": -0.8486045002937317, "logps/chosen": -0.8704649209976196, "logps/rejected": -2.675227403640747, "loss": 0.9098, "odds_ratio_loss": 0.39366015791893005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08704648911952972, "rewards/margins": 0.18047623336315155, "rewards/rejected": -0.26752275228500366, "sft_loss": 0.8704649209976196, "step": 7715 }, { "epoch": 0.6, "grad_norm": 58.75336456298828, "learning_rate": 3.4936165458242817e-06, "logits/chosen": -1.4805707931518555, "logits/rejected": -1.1970739364624023, "logps/chosen": -0.9983084797859192, "logps/rejected": -4.621636867523193, "loss": 1.0295, "odds_ratio_loss": 0.31158146262168884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09983084350824356, "rewards/margins": 0.36233288049697876, "rewards/rejected": -0.4621637463569641, "sft_loss": 0.9983084797859192, "step": 7720 }, { "epoch": 0.6, "grad_norm": 4.402565956115723, "learning_rate": 3.487746219441135e-06, "logits/chosen": -1.3285706043243408, "logits/rejected": -0.7722166180610657, "logps/chosen": -1.158427357673645, "logps/rejected": -6.5065107345581055, "loss": 1.1778, "odds_ratio_loss": 0.19363968074321747, "rewards/accuracies": 1.0, "rewards/chosen": -0.11584273725748062, "rewards/margins": 0.534808337688446, "rewards/rejected": -0.6506510376930237, "sft_loss": 1.158427357673645, "step": 7725 }, { "epoch": 0.6, "grad_norm": 13.3212308883667, "learning_rate": 3.48187818658285e-06, "logits/chosen": -1.465038537979126, "logits/rejected": -0.6518747806549072, "logps/chosen": -1.0956566333770752, "logps/rejected": -5.187521934509277, "loss": 1.1181, "odds_ratio_loss": 0.2242608368396759, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10956566035747528, "rewards/margins": 0.4091865122318268, "rewards/rejected": -0.5187522172927856, "sft_loss": 1.0956566333770752, "step": 7730 }, { "epoch": 0.6, "grad_norm": 4.018584251403809, "learning_rate": 3.476012456149043e-06, "logits/chosen": -1.2446355819702148, "logits/rejected": -1.1161531209945679, "logps/chosen": -0.7302519083023071, "logps/rejected": -2.7706727981567383, "loss": 0.7551, "odds_ratio_loss": 0.2483852356672287, "rewards/accuracies": 1.0, "rewards/chosen": -0.0730251893401146, "rewards/margins": 0.20404212176799774, "rewards/rejected": -0.27706727385520935, "sft_loss": 0.7302519083023071, "step": 7735 }, { "epoch": 0.6, "grad_norm": 4.515651702880859, "learning_rate": 3.4701490370358375e-06, "logits/chosen": -1.3448576927185059, "logits/rejected": -1.0877039432525635, "logps/chosen": -1.065352201461792, "logps/rejected": -7.31790828704834, "loss": 1.0974, "odds_ratio_loss": 0.3203319013118744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10653523355722427, "rewards/margins": 0.6252555847167969, "rewards/rejected": -0.7317909002304077, "sft_loss": 1.065352201461792, "step": 7740 }, { "epoch": 0.6, "grad_norm": 7.58742094039917, "learning_rate": 3.464287938135857e-06, "logits/chosen": -1.4285423755645752, "logits/rejected": -0.9537609815597534, "logps/chosen": -0.7986385226249695, "logps/rejected": -4.994203090667725, "loss": 0.8187, "odds_ratio_loss": 0.2002413272857666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07986384630203247, "rewards/margins": 0.4195564389228821, "rewards/rejected": -0.49942031502723694, "sft_loss": 0.7986385226249695, "step": 7745 }, { "epoch": 0.6, "grad_norm": 15.741750717163086, "learning_rate": 3.4584291683382e-06, "logits/chosen": -1.4255239963531494, "logits/rejected": -0.8259667158126831, "logps/chosen": -0.9038424491882324, "logps/rejected": -9.146833419799805, "loss": 0.9213, "odds_ratio_loss": 0.17470580339431763, "rewards/accuracies": 1.0, "rewards/chosen": -0.09038425981998444, "rewards/margins": 0.824299156665802, "rewards/rejected": -0.91468346118927, "sft_loss": 0.9038424491882324, "step": 7750 }, { "epoch": 0.6, "grad_norm": 39.928260803222656, "learning_rate": 3.452572736528433e-06, "logits/chosen": -1.365174651145935, "logits/rejected": -1.3115659952163696, "logps/chosen": -1.3342235088348389, "logps/rejected": -2.6797633171081543, "loss": 1.3923, "odds_ratio_loss": 0.5805569887161255, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1334223449230194, "rewards/margins": 0.13455398380756378, "rewards/rejected": -0.2679763436317444, "sft_loss": 1.3342235088348389, "step": 7755 }, { "epoch": 0.6, "grad_norm": 7.70234489440918, "learning_rate": 3.4467186515885816e-06, "logits/chosen": -1.4198806285858154, "logits/rejected": -0.9018239974975586, "logps/chosen": -1.1378166675567627, "logps/rejected": -9.194976806640625, "loss": 1.1409, "odds_ratio_loss": 0.03051009215414524, "rewards/accuracies": 1.0, "rewards/chosen": -0.1137816533446312, "rewards/margins": 0.8057159185409546, "rewards/rejected": -0.9194976687431335, "sft_loss": 1.1378166675567627, "step": 7760 }, { "epoch": 0.6, "grad_norm": 8.322214126586914, "learning_rate": 3.440866922397107e-06, "logits/chosen": -1.3290245532989502, "logits/rejected": -1.185469150543213, "logps/chosen": -1.0787431001663208, "logps/rejected": -3.751833438873291, "loss": 1.1054, "odds_ratio_loss": 0.266160786151886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1078743115067482, "rewards/margins": 0.2673090696334839, "rewards/rejected": -0.3751833736896515, "sft_loss": 1.0787431001663208, "step": 7765 }, { "epoch": 0.6, "grad_norm": 22.85993003845215, "learning_rate": 3.435017557828898e-06, "logits/chosen": -1.079116702079773, "logits/rejected": -0.8420137166976929, "logps/chosen": -1.093390941619873, "logps/rejected": -5.883313179016113, "loss": 1.1123, "odds_ratio_loss": 0.18955589830875397, "rewards/accuracies": 1.0, "rewards/chosen": -0.10933909565210342, "rewards/margins": 0.4789922833442688, "rewards/rejected": -0.5883313417434692, "sft_loss": 1.093390941619873, "step": 7770 }, { "epoch": 0.6, "grad_norm": 3.7217321395874023, "learning_rate": 3.4291705667552623e-06, "logits/chosen": -1.2628605365753174, "logits/rejected": -0.8282767534255981, "logps/chosen": -0.9780300259590149, "logps/rejected": -7.870736122131348, "loss": 0.9806, "odds_ratio_loss": 0.026035413146018982, "rewards/accuracies": 1.0, "rewards/chosen": -0.0978030115365982, "rewards/margins": 0.689270555973053, "rewards/rejected": -0.78707355260849, "sft_loss": 0.9780300259590149, "step": 7775 }, { "epoch": 0.61, "grad_norm": 22.497316360473633, "learning_rate": 3.423325958043903e-06, "logits/chosen": -0.9871518015861511, "logits/rejected": -1.1091338396072388, "logps/chosen": -1.0703319311141968, "logps/rejected": -2.1713783740997314, "loss": 1.107, "odds_ratio_loss": 0.3669655919075012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10703320801258087, "rewards/margins": 0.11010462045669556, "rewards/rejected": -0.21713781356811523, "sft_loss": 1.0703319311141968, "step": 7780 }, { "epoch": 0.61, "grad_norm": 6.523499011993408, "learning_rate": 3.417483740558909e-06, "logits/chosen": -1.3533470630645752, "logits/rejected": -0.9151613116264343, "logps/chosen": -0.9925488233566284, "logps/rejected": -6.206988334655762, "loss": 0.9971, "odds_ratio_loss": 0.045540980994701385, "rewards/accuracies": 1.0, "rewards/chosen": -0.09925489127635956, "rewards/margins": 0.5214439630508423, "rewards/rejected": -0.6206988096237183, "sft_loss": 0.9925488233566284, "step": 7785 }, { "epoch": 0.61, "grad_norm": 5.865688800811768, "learning_rate": 3.411643923160748e-06, "logits/chosen": -1.4720577001571655, "logits/rejected": -1.1578564643859863, "logps/chosen": -0.9088132977485657, "logps/rejected": -3.383127212524414, "loss": 0.9215, "odds_ratio_loss": 0.12700459361076355, "rewards/accuracies": 1.0, "rewards/chosen": -0.09088132530450821, "rewards/margins": 0.24743135273456573, "rewards/rejected": -0.33831268548965454, "sft_loss": 0.9088132977485657, "step": 7790 }, { "epoch": 0.61, "grad_norm": 5.861291885375977, "learning_rate": 3.4058065147062423e-06, "logits/chosen": -1.3404837846755981, "logits/rejected": -0.8289557695388794, "logps/chosen": -1.0182613134384155, "logps/rejected": -7.828372955322266, "loss": 1.0499, "odds_ratio_loss": 0.3160038888454437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10182613134384155, "rewards/margins": 0.6810111403465271, "rewards/rejected": -0.7828372716903687, "sft_loss": 1.0182613134384155, "step": 7795 }, { "epoch": 0.61, "grad_norm": 12.210633277893066, "learning_rate": 3.3999715240485643e-06, "logits/chosen": -1.4205188751220703, "logits/rejected": -1.0572283267974854, "logps/chosen": -0.8838884234428406, "logps/rejected": -20.02041244506836, "loss": 0.8892, "odds_ratio_loss": 0.05316939204931259, "rewards/accuracies": 1.0, "rewards/chosen": -0.0883888453245163, "rewards/margins": 1.9136524200439453, "rewards/rejected": -2.0020413398742676, "sft_loss": 0.8838884234428406, "step": 7800 }, { "epoch": 0.61, "grad_norm": 4.870990753173828, "learning_rate": 3.3941389600372166e-06, "logits/chosen": -1.3752845525741577, "logits/rejected": -1.2575510740280151, "logps/chosen": -0.6893249750137329, "logps/rejected": -13.15601634979248, "loss": 0.6917, "odds_ratio_loss": 0.024077700451016426, "rewards/accuracies": 1.0, "rewards/chosen": -0.06893249601125717, "rewards/margins": 1.2466691732406616, "rewards/rejected": -1.3156015872955322, "sft_loss": 0.6893249750137329, "step": 7805 }, { "epoch": 0.61, "grad_norm": 27.354835510253906, "learning_rate": 3.3883088315180252e-06, "logits/chosen": -1.4232900142669678, "logits/rejected": -0.8507736325263977, "logps/chosen": -1.111433506011963, "logps/rejected": -5.32407283782959, "loss": 1.1536, "odds_ratio_loss": 0.4220332205295563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11114335060119629, "rewards/margins": 0.4212639331817627, "rewards/rejected": -0.532407283782959, "sft_loss": 1.111433506011963, "step": 7810 }, { "epoch": 0.61, "grad_norm": 228.3631134033203, "learning_rate": 3.3824811473331187e-06, "logits/chosen": -1.4606685638427734, "logits/rejected": -1.24955153465271, "logps/chosen": -1.389299988746643, "logps/rejected": -14.165156364440918, "loss": 1.3971, "odds_ratio_loss": 0.07790975272655487, "rewards/accuracies": 1.0, "rewards/chosen": -0.13892999291419983, "rewards/margins": 1.2775856256484985, "rewards/rejected": -1.4165157079696655, "sft_loss": 1.389299988746643, "step": 7815 }, { "epoch": 0.61, "grad_norm": 23.588632583618164, "learning_rate": 3.3766559163209187e-06, "logits/chosen": -1.5015602111816406, "logits/rejected": -1.1102159023284912, "logps/chosen": -0.867430567741394, "logps/rejected": -4.193625450134277, "loss": 0.9145, "odds_ratio_loss": 0.47047194838523865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08674306422472, "rewards/margins": 0.33261948823928833, "rewards/rejected": -0.41936248540878296, "sft_loss": 0.867430567741394, "step": 7820 }, { "epoch": 0.61, "grad_norm": 6.6012983322143555, "learning_rate": 3.3708331473161314e-06, "logits/chosen": -1.3114466667175293, "logits/rejected": -1.0500717163085938, "logps/chosen": -1.4407585859298706, "logps/rejected": -3.9645659923553467, "loss": 1.4913, "odds_ratio_loss": 0.5049613118171692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14407587051391602, "rewards/margins": 0.25238072872161865, "rewards/rejected": -0.39645659923553467, "sft_loss": 1.4407585859298706, "step": 7825 }, { "epoch": 0.61, "grad_norm": 14.379029273986816, "learning_rate": 3.3650128491497235e-06, "logits/chosen": -1.4079885482788086, "logits/rejected": -1.43578040599823, "logps/chosen": -1.104499101638794, "logps/rejected": -11.191978454589844, "loss": 1.1123, "odds_ratio_loss": 0.07822314649820328, "rewards/accuracies": 1.0, "rewards/chosen": -0.1104498952627182, "rewards/margins": 1.008747935295105, "rewards/rejected": -1.1191978454589844, "sft_loss": 1.104499101638794, "step": 7830 }, { "epoch": 0.61, "grad_norm": 71.34578704833984, "learning_rate": 3.3591950306489144e-06, "logits/chosen": -1.327294945716858, "logits/rejected": -1.5994123220443726, "logps/chosen": -0.9668186902999878, "logps/rejected": -12.173527717590332, "loss": 0.9724, "odds_ratio_loss": 0.055557817220687866, "rewards/accuracies": 1.0, "rewards/chosen": -0.0966818779706955, "rewards/margins": 1.1206709146499634, "rewards/rejected": -1.2173527479171753, "sft_loss": 0.9668186902999878, "step": 7835 }, { "epoch": 0.61, "grad_norm": 157.09326171875, "learning_rate": 3.353379700637167e-06, "logits/chosen": -1.5097315311431885, "logits/rejected": -1.1535099744796753, "logps/chosen": -1.4491995573043823, "logps/rejected": -6.186688423156738, "loss": 1.4735, "odds_ratio_loss": 0.2425946295261383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14491994678974152, "rewards/margins": 0.47374892234802246, "rewards/rejected": -0.6186689138412476, "sft_loss": 1.4491995573043823, "step": 7840 }, { "epoch": 0.61, "grad_norm": 7.054977893829346, "learning_rate": 3.3475668679341678e-06, "logits/chosen": -1.1684037446975708, "logits/rejected": -1.116753101348877, "logps/chosen": -1.122152328491211, "logps/rejected": -12.81079387664795, "loss": 1.1305, "odds_ratio_loss": 0.08336828649044037, "rewards/accuracies": 1.0, "rewards/chosen": -0.11221524327993393, "rewards/margins": 1.1688642501831055, "rewards/rejected": -1.2810795307159424, "sft_loss": 1.122152328491211, "step": 7845 }, { "epoch": 0.61, "grad_norm": 52.86894226074219, "learning_rate": 3.341756541355811e-06, "logits/chosen": -1.309032678604126, "logits/rejected": -1.1107581853866577, "logps/chosen": -1.2603918313980103, "logps/rejected": -10.970057487487793, "loss": 1.2608, "odds_ratio_loss": 0.003665325464680791, "rewards/accuracies": 1.0, "rewards/chosen": -0.12603919208049774, "rewards/margins": 0.970966637134552, "rewards/rejected": -1.0970057249069214, "sft_loss": 1.2603918313980103, "step": 7850 }, { "epoch": 0.61, "grad_norm": 231.02935791015625, "learning_rate": 3.3359487297142014e-06, "logits/chosen": -1.4242833852767944, "logits/rejected": -0.9572874903678894, "logps/chosen": -1.0314557552337646, "logps/rejected": -6.515725612640381, "loss": 1.0588, "odds_ratio_loss": 0.27349480986595154, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10314557701349258, "rewards/margins": 0.5484269857406616, "rewards/rejected": -0.6515725255012512, "sft_loss": 1.0314557552337646, "step": 7855 }, { "epoch": 0.61, "grad_norm": 12.31454849243164, "learning_rate": 3.330143441817618e-06, "logits/chosen": -1.397132158279419, "logits/rejected": -1.1553051471710205, "logps/chosen": -1.918043851852417, "logps/rejected": -10.284468650817871, "loss": 2.0299, "odds_ratio_loss": 1.1188890933990479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19180439412593842, "rewards/margins": 0.8366424441337585, "rewards/rejected": -1.028446912765503, "sft_loss": 1.918043851852417, "step": 7860 }, { "epoch": 0.61, "grad_norm": 36.400726318359375, "learning_rate": 3.3243406864705193e-06, "logits/chosen": -1.3096789121627808, "logits/rejected": -1.1991100311279297, "logps/chosen": -0.822433590888977, "logps/rejected": -8.101971626281738, "loss": 0.8241, "odds_ratio_loss": 0.01621662639081478, "rewards/accuracies": 1.0, "rewards/chosen": -0.08224336802959442, "rewards/margins": 0.7279537916183472, "rewards/rejected": -0.8101971745491028, "sft_loss": 0.822433590888977, "step": 7865 }, { "epoch": 0.61, "grad_norm": 48.41091537475586, "learning_rate": 3.318540472473518e-06, "logits/chosen": -1.3551379442214966, "logits/rejected": -1.2822260856628418, "logps/chosen": -1.39948308467865, "logps/rejected": -5.699338912963867, "loss": 1.4311, "odds_ratio_loss": 0.31617268919944763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13994832336902618, "rewards/margins": 0.42998558282852173, "rewards/rejected": -0.5699338912963867, "sft_loss": 1.39948308467865, "step": 7870 }, { "epoch": 0.61, "grad_norm": 6.471083164215088, "learning_rate": 3.312742808623378e-06, "logits/chosen": -1.265817403793335, "logits/rejected": -0.9677835702896118, "logps/chosen": -0.7799075841903687, "logps/rejected": -8.898427963256836, "loss": 0.8006, "odds_ratio_loss": 0.20699651539325714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07799076288938522, "rewards/margins": 0.8118520975112915, "rewards/rejected": -0.8898428678512573, "sft_loss": 0.7799075841903687, "step": 7875 }, { "epoch": 0.61, "grad_norm": 16.40986442565918, "learning_rate": 3.306947703712991e-06, "logits/chosen": -1.280167579650879, "logits/rejected": -0.9758197665214539, "logps/chosen": -0.8813843727111816, "logps/rejected": -6.394897937774658, "loss": 0.8844, "odds_ratio_loss": 0.030038166791200638, "rewards/accuracies": 1.0, "rewards/chosen": -0.08813843131065369, "rewards/margins": 0.5513514280319214, "rewards/rejected": -0.6394897699356079, "sft_loss": 0.8813843727111816, "step": 7880 }, { "epoch": 0.61, "grad_norm": 15.605619430541992, "learning_rate": 3.301155166531368e-06, "logits/chosen": -1.4267349243164062, "logits/rejected": -1.3646270036697388, "logps/chosen": -1.072645664215088, "logps/rejected": -7.3083038330078125, "loss": 1.1466, "odds_ratio_loss": 0.7396418452262878, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10726456344127655, "rewards/margins": 0.6235657930374146, "rewards/rejected": -0.7308304309844971, "sft_loss": 1.072645664215088, "step": 7885 }, { "epoch": 0.61, "grad_norm": 7.9199981689453125, "learning_rate": 3.29536520586363e-06, "logits/chosen": -1.3843305110931396, "logits/rejected": -0.7178108096122742, "logps/chosen": -1.484226107597351, "logps/rejected": -17.763320922851562, "loss": 1.4988, "odds_ratio_loss": 0.14559972286224365, "rewards/accuracies": 1.0, "rewards/chosen": -0.14842261373996735, "rewards/margins": 1.6279094219207764, "rewards/rejected": -1.776332139968872, "sft_loss": 1.484226107597351, "step": 7890 }, { "epoch": 0.61, "grad_norm": 115.85680389404297, "learning_rate": 3.2895778304909865e-06, "logits/chosen": -1.3865851163864136, "logits/rejected": -0.9108538627624512, "logps/chosen": -0.9720360040664673, "logps/rejected": -9.35875415802002, "loss": 0.9808, "odds_ratio_loss": 0.08749934285879135, "rewards/accuracies": 1.0, "rewards/chosen": -0.09720361232757568, "rewards/margins": 0.8386718034744263, "rewards/rejected": -0.935875415802002, "sft_loss": 0.9720360040664673, "step": 7895 }, { "epoch": 0.61, "grad_norm": 6.711180686950684, "learning_rate": 3.2837930491907255e-06, "logits/chosen": -1.41152024269104, "logits/rejected": -0.9695557355880737, "logps/chosen": -1.0238697528839111, "logps/rejected": -5.494019508361816, "loss": 1.044, "odds_ratio_loss": 0.20168164372444153, "rewards/accuracies": 1.0, "rewards/chosen": -0.102386973798275, "rewards/margins": 0.4470149874687195, "rewards/rejected": -0.5494019389152527, "sft_loss": 1.0238697528839111, "step": 7900 }, { "epoch": 0.61, "grad_norm": 16.122793197631836, "learning_rate": 3.278010870736205e-06, "logits/chosen": -1.319061279296875, "logits/rejected": -1.18667733669281, "logps/chosen": -1.2115809917449951, "logps/rejected": -2.368157386779785, "loss": 1.2463, "odds_ratio_loss": 0.34768372774124146, "rewards/accuracies": 1.0, "rewards/chosen": -0.12115810066461563, "rewards/margins": 0.11565764993429184, "rewards/rejected": -0.23681576550006866, "sft_loss": 1.2115809917449951, "step": 7905 }, { "epoch": 0.62, "grad_norm": 10.517181396484375, "learning_rate": 3.2722313038968312e-06, "logits/chosen": -1.5105429887771606, "logits/rejected": -1.256507396697998, "logps/chosen": -0.960302472114563, "logps/rejected": -5.137580871582031, "loss": 0.9908, "odds_ratio_loss": 0.30541473627090454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09603025019168854, "rewards/margins": 0.4177277684211731, "rewards/rejected": -0.5137580633163452, "sft_loss": 0.960302472114563, "step": 7910 }, { "epoch": 0.62, "grad_norm": 6.5809502601623535, "learning_rate": 3.2664543574380493e-06, "logits/chosen": -1.3719203472137451, "logits/rejected": -1.3545509576797485, "logps/chosen": -0.741712212562561, "logps/rejected": -5.936234474182129, "loss": 0.7698, "odds_ratio_loss": 0.2805303633213043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07417122274637222, "rewards/margins": 0.5194522142410278, "rewards/rejected": -0.5936234593391418, "sft_loss": 0.741712212562561, "step": 7915 }, { "epoch": 0.62, "grad_norm": 5.230719566345215, "learning_rate": 3.260680040121336e-06, "logits/chosen": -1.4700971841812134, "logits/rejected": -0.9396473169326782, "logps/chosen": -1.037864327430725, "logps/rejected": -2.747180461883545, "loss": 1.0811, "odds_ratio_loss": 0.43252936005592346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10378643125295639, "rewards/margins": 0.1709316372871399, "rewards/rejected": -0.2747180759906769, "sft_loss": 1.037864327430725, "step": 7920 }, { "epoch": 0.62, "grad_norm": 5.8517866134643555, "learning_rate": 3.2549083607041743e-06, "logits/chosen": -1.2298557758331299, "logits/rejected": -1.1291000843048096, "logps/chosen": -0.8646572232246399, "logps/rejected": -5.618631839752197, "loss": 0.9033, "odds_ratio_loss": 0.38687339425086975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08646572381258011, "rewards/margins": 0.4753974378108978, "rewards/rejected": -0.561863124370575, "sft_loss": 0.8646572232246399, "step": 7925 }, { "epoch": 0.62, "grad_norm": 214.61062622070312, "learning_rate": 3.249139327940049e-06, "logits/chosen": -1.2199280261993408, "logits/rejected": -1.067989706993103, "logps/chosen": -1.4785853624343872, "logps/rejected": -3.4969024658203125, "loss": 1.5057, "odds_ratio_loss": 0.2707682251930237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14785853028297424, "rewards/margins": 0.20183169841766357, "rewards/rejected": -0.3496902585029602, "sft_loss": 1.4785853624343872, "step": 7930 }, { "epoch": 0.62, "grad_norm": 8.33104133605957, "learning_rate": 3.2433729505784283e-06, "logits/chosen": -1.3628246784210205, "logits/rejected": -1.022838830947876, "logps/chosen": -0.7507215738296509, "logps/rejected": -3.201002597808838, "loss": 0.7839, "odds_ratio_loss": 0.33168500661849976, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07507215440273285, "rewards/margins": 0.24502809345722198, "rewards/rejected": -0.32010024785995483, "sft_loss": 0.7507215738296509, "step": 7935 }, { "epoch": 0.62, "grad_norm": 5.3491363525390625, "learning_rate": 3.2376092373647604e-06, "logits/chosen": -1.3530322313308716, "logits/rejected": -0.5111430287361145, "logps/chosen": -0.979842483997345, "logps/rejected": -7.742257595062256, "loss": 1.0023, "odds_ratio_loss": 0.2243957221508026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09798424690961838, "rewards/margins": 0.6762415766716003, "rewards/rejected": -0.7742258310317993, "sft_loss": 0.979842483997345, "step": 7940 }, { "epoch": 0.62, "grad_norm": 16.427143096923828, "learning_rate": 3.231848197040446e-06, "logits/chosen": -1.4268476963043213, "logits/rejected": -1.4312551021575928, "logps/chosen": -0.7136311531066895, "logps/rejected": -4.48763370513916, "loss": 0.7225, "odds_ratio_loss": 0.08858003467321396, "rewards/accuracies": 1.0, "rewards/chosen": -0.07136311382055283, "rewards/margins": 0.377400279045105, "rewards/rejected": -0.4487634301185608, "sft_loss": 0.7136311531066895, "step": 7945 }, { "epoch": 0.62, "grad_norm": 10.100354194641113, "learning_rate": 3.226089838342833e-06, "logits/chosen": -1.4485676288604736, "logits/rejected": -1.1894410848617554, "logps/chosen": -0.9481072425842285, "logps/rejected": -9.236673355102539, "loss": 0.9591, "odds_ratio_loss": 0.11030948162078857, "rewards/accuracies": 1.0, "rewards/chosen": -0.09481072425842285, "rewards/margins": 0.8288565874099731, "rewards/rejected": -0.923667311668396, "sft_loss": 0.9481072425842285, "step": 7950 }, { "epoch": 0.62, "grad_norm": 30.110383987426758, "learning_rate": 3.220334170005206e-06, "logits/chosen": -1.4325348138809204, "logits/rejected": -1.4311264753341675, "logps/chosen": -1.306010127067566, "logps/rejected": -3.6128711700439453, "loss": 1.3562, "odds_ratio_loss": 0.5021435022354126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13060101866722107, "rewards/margins": 0.23068614304065704, "rewards/rejected": -0.3612871766090393, "sft_loss": 1.306010127067566, "step": 7955 }, { "epoch": 0.62, "grad_norm": 7.615264892578125, "learning_rate": 3.214581200756765e-06, "logits/chosen": -1.4307386875152588, "logits/rejected": -1.1426076889038086, "logps/chosen": -0.6602781414985657, "logps/rejected": -5.599646091461182, "loss": 0.6888, "odds_ratio_loss": 0.2852723300457001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06602781265974045, "rewards/margins": 0.49393683671951294, "rewards/rejected": -0.559964656829834, "sft_loss": 0.6602781414985657, "step": 7960 }, { "epoch": 0.62, "grad_norm": 8.364228248596191, "learning_rate": 3.208830939322617e-06, "logits/chosen": -1.3908792734146118, "logits/rejected": -1.1232960224151611, "logps/chosen": -0.7094835042953491, "logps/rejected": -10.671346664428711, "loss": 0.7219, "odds_ratio_loss": 0.1239551529288292, "rewards/accuracies": 1.0, "rewards/chosen": -0.07094834744930267, "rewards/margins": 0.996186375617981, "rewards/rejected": -1.0671346187591553, "sft_loss": 0.7094835042953491, "step": 7965 }, { "epoch": 0.62, "grad_norm": 21.444847106933594, "learning_rate": 3.203083394423766e-06, "logits/chosen": -1.3634107112884521, "logits/rejected": -1.0703684091567993, "logps/chosen": -1.527782678604126, "logps/rejected": -2.9710288047790527, "loss": 1.5714, "odds_ratio_loss": 0.43602442741394043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1527782678604126, "rewards/margins": 0.1443246304988861, "rewards/rejected": -0.2971028685569763, "sft_loss": 1.527782678604126, "step": 7970 }, { "epoch": 0.62, "grad_norm": 9.083678245544434, "learning_rate": 3.197338574777094e-06, "logits/chosen": -1.4616883993148804, "logits/rejected": -0.8809460401535034, "logps/chosen": -1.1917169094085693, "logps/rejected": -8.423823356628418, "loss": 1.1971, "odds_ratio_loss": 0.05426154285669327, "rewards/accuracies": 1.0, "rewards/chosen": -0.11917171627283096, "rewards/margins": 0.7232107520103455, "rewards/rejected": -0.8423824310302734, "sft_loss": 1.1917169094085693, "step": 7975 }, { "epoch": 0.62, "grad_norm": 5.379354953765869, "learning_rate": 3.191596489095348e-06, "logits/chosen": -1.2065356969833374, "logits/rejected": -1.0963044166564941, "logps/chosen": -1.290281057357788, "logps/rejected": -7.7128400802612305, "loss": 1.3069, "odds_ratio_loss": 0.16593563556671143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1290281116962433, "rewards/margins": 0.6422559022903442, "rewards/rejected": -0.7712840437889099, "sft_loss": 1.290281057357788, "step": 7980 }, { "epoch": 0.62, "grad_norm": 12.42475414276123, "learning_rate": 3.1858571460871284e-06, "logits/chosen": -1.3918287754058838, "logits/rejected": -1.3030837774276733, "logps/chosen": -0.8681005239486694, "logps/rejected": -8.609440803527832, "loss": 0.8756, "odds_ratio_loss": 0.07548637688159943, "rewards/accuracies": 1.0, "rewards/chosen": -0.08681005239486694, "rewards/margins": 0.7741340398788452, "rewards/rejected": -0.8609440922737122, "sft_loss": 0.8681005239486694, "step": 7985 }, { "epoch": 0.62, "grad_norm": 15.7763032913208, "learning_rate": 3.1801205544568816e-06, "logits/chosen": -1.2245934009552002, "logits/rejected": -1.5365649461746216, "logps/chosen": -1.184861421585083, "logps/rejected": -6.469397068023682, "loss": 1.2113, "odds_ratio_loss": 0.26478278636932373, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11848614364862442, "rewards/margins": 0.528453528881073, "rewards/rejected": -0.646939754486084, "sft_loss": 1.184861421585083, "step": 7990 }, { "epoch": 0.62, "grad_norm": 5.70435094833374, "learning_rate": 3.1743867229048734e-06, "logits/chosen": -1.4829033613204956, "logits/rejected": -0.8051961660385132, "logps/chosen": -1.914612054824829, "logps/rejected": -3.3208343982696533, "loss": 2.0397, "odds_ratio_loss": 1.2508898973464966, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1914612054824829, "rewards/margins": 0.14062225818634033, "rewards/rejected": -0.33208346366882324, "sft_loss": 1.914612054824829, "step": 7995 }, { "epoch": 0.62, "grad_norm": 8.394847869873047, "learning_rate": 3.168655660127188e-06, "logits/chosen": -1.3032631874084473, "logits/rejected": -1.1171185970306396, "logps/chosen": -1.410510778427124, "logps/rejected": -6.261443138122559, "loss": 1.4635, "odds_ratio_loss": 0.5297115445137024, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14105108380317688, "rewards/margins": 0.4850931763648987, "rewards/rejected": -0.6261442303657532, "sft_loss": 1.410510778427124, "step": 8000 }, { "epoch": 0.62, "grad_norm": 8.586276054382324, "learning_rate": 3.162927374815712e-06, "logits/chosen": -1.455913782119751, "logits/rejected": -1.1645771265029907, "logps/chosen": -1.0394628047943115, "logps/rejected": -3.195157766342163, "loss": 1.0581, "odds_ratio_loss": 0.1859007179737091, "rewards/accuracies": 1.0, "rewards/chosen": -0.10394628345966339, "rewards/margins": 0.21556949615478516, "rewards/rejected": -0.31951576471328735, "sft_loss": 1.0394628047943115, "step": 8005 }, { "epoch": 0.62, "grad_norm": 14.099467277526855, "learning_rate": 3.157201875658116e-06, "logits/chosen": -1.3971182107925415, "logits/rejected": -0.8789178133010864, "logps/chosen": -1.0584465265274048, "logps/rejected": -5.310263156890869, "loss": 1.0896, "odds_ratio_loss": 0.31155240535736084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1058446541428566, "rewards/margins": 0.42518168687820435, "rewards/rejected": -0.531026303768158, "sft_loss": 1.0584465265274048, "step": 8010 }, { "epoch": 0.62, "grad_norm": 72.75445556640625, "learning_rate": 3.1514791713378443e-06, "logits/chosen": -1.4388090372085571, "logits/rejected": -0.8740331530570984, "logps/chosen": -0.9836138486862183, "logps/rejected": -3.6888020038604736, "loss": 1.0094, "odds_ratio_loss": 0.2580524981021881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09836138039827347, "rewards/margins": 0.27051883935928345, "rewards/rejected": -0.3688802123069763, "sft_loss": 0.9836138486862183, "step": 8015 }, { "epoch": 0.62, "grad_norm": 9.547337532043457, "learning_rate": 3.1457592705341088e-06, "logits/chosen": -1.4236394166946411, "logits/rejected": -0.8941730260848999, "logps/chosen": -1.2104017734527588, "logps/rejected": -4.095586776733398, "loss": 1.2312, "odds_ratio_loss": 0.20825231075286865, "rewards/accuracies": 1.0, "rewards/chosen": -0.12104018032550812, "rewards/margins": 0.2885185182094574, "rewards/rejected": -0.4095586836338043, "sft_loss": 1.2104017734527588, "step": 8020 }, { "epoch": 0.62, "grad_norm": 5.413477420806885, "learning_rate": 3.140042181921863e-06, "logits/chosen": -1.3875473737716675, "logits/rejected": -1.175796389579773, "logps/chosen": -1.0537807941436768, "logps/rejected": -10.514541625976562, "loss": 1.0618, "odds_ratio_loss": 0.08046818524599075, "rewards/accuracies": 1.0, "rewards/chosen": -0.10537807643413544, "rewards/margins": 0.9460762143135071, "rewards/rejected": -1.0514543056488037, "sft_loss": 1.0537807941436768, "step": 8025 }, { "epoch": 0.62, "grad_norm": 57.867000579833984, "learning_rate": 3.1343279141717957e-06, "logits/chosen": -1.2222039699554443, "logits/rejected": -1.3033952713012695, "logps/chosen": -1.596881628036499, "logps/rejected": -6.566547393798828, "loss": 1.6783, "odds_ratio_loss": 0.8144750595092773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15968818962574005, "rewards/margins": 0.4969666004180908, "rewards/rejected": -0.6566547155380249, "sft_loss": 1.596881628036499, "step": 8030 }, { "epoch": 0.63, "grad_norm": 9.617599487304688, "learning_rate": 3.1286164759503245e-06, "logits/chosen": -1.3700895309448242, "logits/rejected": -1.3072712421417236, "logps/chosen": -1.059555172920227, "logps/rejected": -6.857283115386963, "loss": 1.0932, "odds_ratio_loss": 0.3364062011241913, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10595552623271942, "rewards/margins": 0.5797727704048157, "rewards/rejected": -0.6857283711433411, "sft_loss": 1.059555172920227, "step": 8035 }, { "epoch": 0.63, "grad_norm": 9.33051586151123, "learning_rate": 3.122907875919567e-06, "logits/chosen": -1.3103891611099243, "logits/rejected": -1.573803186416626, "logps/chosen": -1.0470023155212402, "logps/rejected": -11.741785049438477, "loss": 1.0516, "odds_ratio_loss": 0.04594879597425461, "rewards/accuracies": 1.0, "rewards/chosen": -0.1047002449631691, "rewards/margins": 1.0694782733917236, "rewards/rejected": -1.1741784811019897, "sft_loss": 1.0470023155212402, "step": 8040 }, { "epoch": 0.63, "grad_norm": 8.176512718200684, "learning_rate": 3.11720212273734e-06, "logits/chosen": -1.3815648555755615, "logits/rejected": -0.7566605806350708, "logps/chosen": -1.0081384181976318, "logps/rejected": -7.439121246337891, "loss": 1.0207, "odds_ratio_loss": 0.125940203666687, "rewards/accuracies": 1.0, "rewards/chosen": -0.1008138507604599, "rewards/margins": 0.6430982351303101, "rewards/rejected": -0.7439121007919312, "sft_loss": 1.0081384181976318, "step": 8045 }, { "epoch": 0.63, "grad_norm": 52.099769592285156, "learning_rate": 3.1114992250571415e-06, "logits/chosen": -1.2227838039398193, "logits/rejected": -0.7810730934143066, "logps/chosen": -1.0308719873428345, "logps/rejected": -8.28508186340332, "loss": 1.0421, "odds_ratio_loss": 0.11267662048339844, "rewards/accuracies": 1.0, "rewards/chosen": -0.10308720171451569, "rewards/margins": 0.7254210114479065, "rewards/rejected": -0.828508198261261, "sft_loss": 1.0308719873428345, "step": 8050 }, { "epoch": 0.63, "grad_norm": 10.428442001342773, "learning_rate": 3.105799191528144e-06, "logits/chosen": -1.2264387607574463, "logits/rejected": -1.103314995765686, "logps/chosen": -0.7794146537780762, "logps/rejected": -5.476459980010986, "loss": 0.8488, "odds_ratio_loss": 0.6937298774719238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07794146984815598, "rewards/margins": 0.4697045385837555, "rewards/rejected": -0.5476459264755249, "sft_loss": 0.7794146537780762, "step": 8055 }, { "epoch": 0.63, "grad_norm": 14.65079116821289, "learning_rate": 3.1001020307951684e-06, "logits/chosen": -1.411501169204712, "logits/rejected": -1.2931959629058838, "logps/chosen": -1.372267484664917, "logps/rejected": -9.434979438781738, "loss": 1.381, "odds_ratio_loss": 0.08757570385932922, "rewards/accuracies": 1.0, "rewards/chosen": -0.13722673058509827, "rewards/margins": 0.8062711954116821, "rewards/rejected": -0.9434979557991028, "sft_loss": 1.372267484664917, "step": 8060 }, { "epoch": 0.63, "grad_norm": 8.443821907043457, "learning_rate": 3.0944077514986837e-06, "logits/chosen": -1.2568533420562744, "logits/rejected": -1.489222526550293, "logps/chosen": -0.9871482849121094, "logps/rejected": -9.15410327911377, "loss": 0.9907, "odds_ratio_loss": 0.0351276621222496, "rewards/accuracies": 1.0, "rewards/chosen": -0.09871482849121094, "rewards/margins": 0.8166955709457397, "rewards/rejected": -0.9154103994369507, "sft_loss": 0.9871482849121094, "step": 8065 }, { "epoch": 0.63, "grad_norm": 6.821056842803955, "learning_rate": 3.0887163622747873e-06, "logits/chosen": -1.3466415405273438, "logits/rejected": -1.1124385595321655, "logps/chosen": -1.423811674118042, "logps/rejected": -5.253532886505127, "loss": 1.4455, "odds_ratio_loss": 0.21701247990131378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14238117635250092, "rewards/margins": 0.3829721510410309, "rewards/rejected": -0.5253533124923706, "sft_loss": 1.423811674118042, "step": 8070 }, { "epoch": 0.63, "grad_norm": 12.706870079040527, "learning_rate": 3.083027871755194e-06, "logits/chosen": -1.3704214096069336, "logits/rejected": -0.7175769805908203, "logps/chosen": -1.1178873777389526, "logps/rejected": -6.501991271972656, "loss": 1.1327, "odds_ratio_loss": 0.14839690923690796, "rewards/accuracies": 1.0, "rewards/chosen": -0.11178873479366302, "rewards/margins": 0.5384103059768677, "rewards/rejected": -0.6501990556716919, "sft_loss": 1.1178873777389526, "step": 8075 }, { "epoch": 0.63, "grad_norm": 15.04248332977295, "learning_rate": 3.07734228856722e-06, "logits/chosen": -1.2834182977676392, "logits/rejected": -1.0528788566589355, "logps/chosen": -1.3128502368927002, "logps/rejected": -3.2048892974853516, "loss": 1.3444, "odds_ratio_loss": 0.31525570154190063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13128502666950226, "rewards/margins": 0.1892039030790329, "rewards/rejected": -0.32048892974853516, "sft_loss": 1.3128502368927002, "step": 8080 }, { "epoch": 0.63, "grad_norm": 5.70639181137085, "learning_rate": 3.071659621333777e-06, "logits/chosen": -1.3128175735473633, "logits/rejected": -0.9505087733268738, "logps/chosen": -1.1137568950653076, "logps/rejected": -4.190314292907715, "loss": 1.147, "odds_ratio_loss": 0.33274489641189575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11137568950653076, "rewards/margins": 0.3076557219028473, "rewards/rejected": -0.41903138160705566, "sft_loss": 1.1137568950653076, "step": 8085 }, { "epoch": 0.63, "grad_norm": 6.7080607414245605, "learning_rate": 3.0659798786733497e-06, "logits/chosen": -1.3358865976333618, "logits/rejected": -1.1260464191436768, "logps/chosen": -0.951252281665802, "logps/rejected": -3.221665143966675, "loss": 0.9724, "odds_ratio_loss": 0.21111583709716797, "rewards/accuracies": 1.0, "rewards/chosen": -0.0951252207159996, "rewards/margins": 0.22704128921031952, "rewards/rejected": -0.3221665322780609, "sft_loss": 0.951252281665802, "step": 8090 }, { "epoch": 0.63, "grad_norm": 30.165071487426758, "learning_rate": 3.0603030691999885e-06, "logits/chosen": -1.3660657405853271, "logits/rejected": -1.3970402479171753, "logps/chosen": -0.8544301986694336, "logps/rejected": -3.3595690727233887, "loss": 0.8623, "odds_ratio_loss": 0.07849614322185516, "rewards/accuracies": 1.0, "rewards/chosen": -0.08544301986694336, "rewards/margins": 0.2505139112472534, "rewards/rejected": -0.3359569311141968, "sft_loss": 0.8544301986694336, "step": 8095 }, { "epoch": 0.63, "grad_norm": 7.015862464904785, "learning_rate": 3.054629201523297e-06, "logits/chosen": -1.2655341625213623, "logits/rejected": -0.8376556634902954, "logps/chosen": -1.1147531270980835, "logps/rejected": -4.9825758934021, "loss": 1.1226, "odds_ratio_loss": 0.07891669124364853, "rewards/accuracies": 1.0, "rewards/chosen": -0.11147532612085342, "rewards/margins": 0.3867822587490082, "rewards/rejected": -0.498257577419281, "sft_loss": 1.1147531270980835, "step": 8100 }, { "epoch": 0.63, "grad_norm": 97.99761962890625, "learning_rate": 3.0489582842484155e-06, "logits/chosen": -1.1555439233779907, "logits/rejected": -0.9075528383255005, "logps/chosen": -1.1690194606781006, "logps/rejected": -5.805201530456543, "loss": 1.1894, "odds_ratio_loss": 0.20332148671150208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1169019490480423, "rewards/margins": 0.46361827850341797, "rewards/rejected": -0.5805202126502991, "sft_loss": 1.1690194606781006, "step": 8105 }, { "epoch": 0.63, "grad_norm": 7.906203746795654, "learning_rate": 3.0432903259760103e-06, "logits/chosen": -1.3602170944213867, "logits/rejected": -0.9214040637016296, "logps/chosen": -1.0084471702575684, "logps/rejected": -8.591350555419922, "loss": 1.0197, "odds_ratio_loss": 0.11236198246479034, "rewards/accuracies": 1.0, "rewards/chosen": -0.10084471851587296, "rewards/margins": 0.7582904100418091, "rewards/rejected": -0.8591351509094238, "sft_loss": 1.0084471702575684, "step": 8110 }, { "epoch": 0.63, "grad_norm": 115.37716674804688, "learning_rate": 3.0376253353022565e-06, "logits/chosen": -1.4454014301300049, "logits/rejected": -1.1584488153457642, "logps/chosen": -1.3010876178741455, "logps/rejected": -4.5600905418396, "loss": 1.3204, "odds_ratio_loss": 0.19311176240444183, "rewards/accuracies": 1.0, "rewards/chosen": -0.1301087737083435, "rewards/margins": 0.3259003162384033, "rewards/rejected": -0.4560090899467468, "sft_loss": 1.3010876178741455, "step": 8115 }, { "epoch": 0.63, "grad_norm": 266.7450256347656, "learning_rate": 3.031963320818837e-06, "logits/chosen": -1.3959786891937256, "logits/rejected": -1.0264484882354736, "logps/chosen": -1.1273565292358398, "logps/rejected": -7.558053016662598, "loss": 1.1292, "odds_ratio_loss": 0.018850315362215042, "rewards/accuracies": 1.0, "rewards/chosen": -0.11273566633462906, "rewards/margins": 0.6430696249008179, "rewards/rejected": -0.7558053135871887, "sft_loss": 1.1273565292358398, "step": 8120 }, { "epoch": 0.63, "grad_norm": 8.022828102111816, "learning_rate": 3.026304291112914e-06, "logits/chosen": -1.48981773853302, "logits/rejected": -1.5464773178100586, "logps/chosen": -0.7510837316513062, "logps/rejected": -8.108277320861816, "loss": 0.7574, "odds_ratio_loss": 0.06287384778261185, "rewards/accuracies": 1.0, "rewards/chosen": -0.07510837912559509, "rewards/margins": 0.7357193827629089, "rewards/rejected": -0.8108277320861816, "sft_loss": 0.7510837316513062, "step": 8125 }, { "epoch": 0.63, "grad_norm": 174.53338623046875, "learning_rate": 3.020648254767121e-06, "logits/chosen": -1.2474849224090576, "logits/rejected": -1.1400511264801025, "logps/chosen": -1.2461121082305908, "logps/rejected": -13.192431449890137, "loss": 1.2663, "odds_ratio_loss": 0.20185616612434387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12461121380329132, "rewards/margins": 1.1946319341659546, "rewards/rejected": -1.3192431926727295, "sft_loss": 1.2461121082305908, "step": 8130 }, { "epoch": 0.63, "grad_norm": 15.472785949707031, "learning_rate": 3.01499522035956e-06, "logits/chosen": -1.3132785558700562, "logits/rejected": -1.2090857028961182, "logps/chosen": -1.164563775062561, "logps/rejected": -8.720190048217773, "loss": 1.1775, "odds_ratio_loss": 0.12893368303775787, "rewards/accuracies": 1.0, "rewards/chosen": -0.11645637452602386, "rewards/margins": 0.7555626630783081, "rewards/rejected": -0.8720189929008484, "sft_loss": 1.164563775062561, "step": 8135 }, { "epoch": 0.63, "grad_norm": 11.70803165435791, "learning_rate": 3.009345196463773e-06, "logits/chosen": -1.291359782218933, "logits/rejected": -0.9156149625778198, "logps/chosen": -0.9766289591789246, "logps/rejected": -5.966506004333496, "loss": 0.9985, "odds_ratio_loss": 0.2186693698167801, "rewards/accuracies": 1.0, "rewards/chosen": -0.09766290336847305, "rewards/margins": 0.49898767471313477, "rewards/rejected": -0.5966506004333496, "sft_loss": 0.9766289591789246, "step": 8140 }, { "epoch": 0.63, "grad_norm": 8.548707962036133, "learning_rate": 3.0036981916487366e-06, "logits/chosen": -1.440680742263794, "logits/rejected": -1.3260940313339233, "logps/chosen": -0.929384708404541, "logps/rejected": -15.503623962402344, "loss": 0.935, "odds_ratio_loss": 0.055864494293928146, "rewards/accuracies": 1.0, "rewards/chosen": -0.09293847531080246, "rewards/margins": 1.4574239253997803, "rewards/rejected": -1.550362467765808, "sft_loss": 0.929384708404541, "step": 8145 }, { "epoch": 0.63, "grad_norm": 12.262228012084961, "learning_rate": 2.9980542144788564e-06, "logits/chosen": -1.4146068096160889, "logits/rejected": -0.9239387512207031, "logps/chosen": -0.9395908117294312, "logps/rejected": -4.714138984680176, "loss": 0.9445, "odds_ratio_loss": 0.04877752065658569, "rewards/accuracies": 1.0, "rewards/chosen": -0.09395908564329147, "rewards/margins": 0.37745481729507446, "rewards/rejected": -0.4714139401912689, "sft_loss": 0.9395908117294312, "step": 8150 }, { "epoch": 0.63, "grad_norm": 23.141952514648438, "learning_rate": 2.9924132735139357e-06, "logits/chosen": -1.4843209981918335, "logits/rejected": -1.102158784866333, "logps/chosen": -0.9500603675842285, "logps/rejected": -4.377799987792969, "loss": 0.9585, "odds_ratio_loss": 0.08394896239042282, "rewards/accuracies": 1.0, "rewards/chosen": -0.09500603377819061, "rewards/margins": 0.342773973941803, "rewards/rejected": -0.4377799928188324, "sft_loss": 0.9500603675842285, "step": 8155 }, { "epoch": 0.63, "grad_norm": 9.641731262207031, "learning_rate": 2.9867753773091766e-06, "logits/chosen": -1.342581033706665, "logits/rejected": -1.1286296844482422, "logps/chosen": -0.7172707915306091, "logps/rejected": -10.408893585205078, "loss": 0.7257, "odds_ratio_loss": 0.08466891944408417, "rewards/accuracies": 1.0, "rewards/chosen": -0.07172708213329315, "rewards/margins": 0.9691622853279114, "rewards/rejected": -1.0408892631530762, "sft_loss": 0.7172707915306091, "step": 8160 }, { "epoch": 0.64, "grad_norm": 7.909165859222412, "learning_rate": 2.9811405344151702e-06, "logits/chosen": -1.615762710571289, "logits/rejected": -1.6490952968597412, "logps/chosen": -0.8346630930900574, "logps/rejected": -12.535600662231445, "loss": 0.8405, "odds_ratio_loss": 0.05827382951974869, "rewards/accuracies": 1.0, "rewards/chosen": -0.0834663137793541, "rewards/margins": 1.1700937747955322, "rewards/rejected": -1.2535600662231445, "sft_loss": 0.8346630930900574, "step": 8165 }, { "epoch": 0.64, "grad_norm": 47.357765197753906, "learning_rate": 2.975508753377866e-06, "logits/chosen": -1.412506341934204, "logits/rejected": -1.051579236984253, "logps/chosen": -1.016878604888916, "logps/rejected": -4.126791954040527, "loss": 1.0491, "odds_ratio_loss": 0.3220589756965637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10168787091970444, "rewards/margins": 0.3109913766384125, "rewards/rejected": -0.4126792550086975, "sft_loss": 1.016878604888916, "step": 8170 }, { "epoch": 0.64, "grad_norm": 8.349559783935547, "learning_rate": 2.9698800427385775e-06, "logits/chosen": -1.2682634592056274, "logits/rejected": -1.3927457332611084, "logps/chosen": -0.7896240949630737, "logps/rejected": -8.555567741394043, "loss": 0.7956, "odds_ratio_loss": 0.059539467096328735, "rewards/accuracies": 1.0, "rewards/chosen": -0.07896241545677185, "rewards/margins": 0.7765944600105286, "rewards/rejected": -0.855556845664978, "sft_loss": 0.7896240949630737, "step": 8175 }, { "epoch": 0.64, "grad_norm": 9.098254203796387, "learning_rate": 2.964254411033957e-06, "logits/chosen": -1.5638911724090576, "logits/rejected": -1.143739104270935, "logps/chosen": -0.834095299243927, "logps/rejected": -4.712614059448242, "loss": 0.8587, "odds_ratio_loss": 0.24565191566944122, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08340953290462494, "rewards/margins": 0.38785186409950256, "rewards/rejected": -0.4712614119052887, "sft_loss": 0.834095299243927, "step": 8180 }, { "epoch": 0.64, "grad_norm": 423.8319396972656, "learning_rate": 2.9586318667959917e-06, "logits/chosen": -1.2335931062698364, "logits/rejected": -1.4510310888290405, "logps/chosen": -1.3426623344421387, "logps/rejected": -13.651086807250977, "loss": 1.3431, "odds_ratio_loss": 0.004761195741593838, "rewards/accuracies": 1.0, "rewards/chosen": -0.13426624238491058, "rewards/margins": 1.2308424711227417, "rewards/rejected": -1.3651087284088135, "sft_loss": 1.3426623344421387, "step": 8185 }, { "epoch": 0.64, "grad_norm": 57.52622604370117, "learning_rate": 2.9530124185519824e-06, "logits/chosen": -1.390239953994751, "logits/rejected": -0.8772698640823364, "logps/chosen": -1.1922757625579834, "logps/rejected": -13.38306713104248, "loss": 1.2134, "odds_ratio_loss": 0.2110356092453003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1192275881767273, "rewards/margins": 1.2190791368484497, "rewards/rejected": -1.3383066654205322, "sft_loss": 1.1922757625579834, "step": 8190 }, { "epoch": 0.64, "grad_norm": 5.419862270355225, "learning_rate": 2.9473960748245344e-06, "logits/chosen": -1.2417716979980469, "logits/rejected": -1.1405150890350342, "logps/chosen": -0.9968358278274536, "logps/rejected": -12.170660972595215, "loss": 1.0025, "odds_ratio_loss": 0.056391291320323944, "rewards/accuracies": 1.0, "rewards/chosen": -0.09968358278274536, "rewards/margins": 1.117382526397705, "rewards/rejected": -1.2170660495758057, "sft_loss": 0.9968358278274536, "step": 8195 }, { "epoch": 0.64, "grad_norm": 15.264373779296875, "learning_rate": 2.9417828441315493e-06, "logits/chosen": -1.4618908166885376, "logits/rejected": -1.2241955995559692, "logps/chosen": -1.154833436012268, "logps/rejected": -5.7115254402160645, "loss": 1.192, "odds_ratio_loss": 0.37174850702285767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1154833436012268, "rewards/margins": 0.45566922426223755, "rewards/rejected": -0.5711525678634644, "sft_loss": 1.154833436012268, "step": 8200 }, { "epoch": 0.64, "grad_norm": 6.008475303649902, "learning_rate": 2.9361727349862025e-06, "logits/chosen": -1.318110704421997, "logits/rejected": -0.7864333987236023, "logps/chosen": -0.9257491230964661, "logps/rejected": -9.724273681640625, "loss": 0.9338, "odds_ratio_loss": 0.08093155920505524, "rewards/accuracies": 1.0, "rewards/chosen": -0.09257491677999496, "rewards/margins": 0.8798524737358093, "rewards/rejected": -0.9724273681640625, "sft_loss": 0.9257491230964661, "step": 8205 }, { "epoch": 0.64, "grad_norm": 5.947242259979248, "learning_rate": 2.930565755896936e-06, "logits/chosen": -1.3611711263656616, "logits/rejected": -0.9525600671768188, "logps/chosen": -1.174300193786621, "logps/rejected": -10.041463851928711, "loss": 1.1828, "odds_ratio_loss": 0.08481469750404358, "rewards/accuracies": 1.0, "rewards/chosen": -0.11743001639842987, "rewards/margins": 0.8867164850234985, "rewards/rejected": -1.0041465759277344, "sft_loss": 1.174300193786621, "step": 8210 }, { "epoch": 0.64, "grad_norm": 11.08918285369873, "learning_rate": 2.9249619153674475e-06, "logits/chosen": -1.2925969362258911, "logits/rejected": -1.3019688129425049, "logps/chosen": -0.9571875333786011, "logps/rejected": -1.910681128501892, "loss": 0.9907, "odds_ratio_loss": 0.33518046140670776, "rewards/accuracies": 1.0, "rewards/chosen": -0.09571875631809235, "rewards/margins": 0.09534934908151627, "rewards/rejected": -0.1910681277513504, "sft_loss": 0.9571875333786011, "step": 8215 }, { "epoch": 0.64, "grad_norm": 4.53801965713501, "learning_rate": 2.919361221896671e-06, "logits/chosen": -1.3792939186096191, "logits/rejected": -0.9149982333183289, "logps/chosen": -1.1741918325424194, "logps/rejected": -8.287775039672852, "loss": 1.2071, "odds_ratio_loss": 0.3294121325016022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11741918325424194, "rewards/margins": 0.711358368396759, "rewards/rejected": -0.8287774920463562, "sft_loss": 1.1741918325424194, "step": 8220 }, { "epoch": 0.64, "grad_norm": 9.603503227233887, "learning_rate": 2.913763683978768e-06, "logits/chosen": -1.4316697120666504, "logits/rejected": -0.9715169668197632, "logps/chosen": -1.007556438446045, "logps/rejected": -3.0127406120300293, "loss": 1.019, "odds_ratio_loss": 0.1143096312880516, "rewards/accuracies": 1.0, "rewards/chosen": -0.10075564682483673, "rewards/margins": 0.2005184143781662, "rewards/rejected": -0.30127406120300293, "sft_loss": 1.007556438446045, "step": 8225 }, { "epoch": 0.64, "grad_norm": 47.24553680419922, "learning_rate": 2.9081693101031193e-06, "logits/chosen": -1.402808666229248, "logits/rejected": -0.8077438473701477, "logps/chosen": -1.1371103525161743, "logps/rejected": -5.126477241516113, "loss": 1.1479, "odds_ratio_loss": 0.10753113031387329, "rewards/accuracies": 1.0, "rewards/chosen": -0.11371102184057236, "rewards/margins": 0.3989366888999939, "rewards/rejected": -0.5126477479934692, "sft_loss": 1.1371103525161743, "step": 8230 }, { "epoch": 0.64, "grad_norm": 70.96250915527344, "learning_rate": 2.9025781087543004e-06, "logits/chosen": -1.37126886844635, "logits/rejected": -0.8173860311508179, "logps/chosen": -1.2457082271575928, "logps/rejected": -5.8854546546936035, "loss": 1.25, "odds_ratio_loss": 0.04337408393621445, "rewards/accuracies": 1.0, "rewards/chosen": -0.124570831656456, "rewards/margins": 0.46397465467453003, "rewards/rejected": -0.5885455012321472, "sft_loss": 1.2457082271575928, "step": 8235 }, { "epoch": 0.64, "grad_norm": 14.82772445678711, "learning_rate": 2.8969900884120794e-06, "logits/chosen": -1.3153047561645508, "logits/rejected": -1.1230190992355347, "logps/chosen": -0.9597269892692566, "logps/rejected": -7.153435707092285, "loss": 0.984, "odds_ratio_loss": 0.24292488396167755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0959727019071579, "rewards/margins": 0.6193707585334778, "rewards/rejected": -0.7153435349464417, "sft_loss": 0.9597269892692566, "step": 8240 }, { "epoch": 0.64, "grad_norm": 31.036779403686523, "learning_rate": 2.891405257551395e-06, "logits/chosen": -1.388417363166809, "logits/rejected": -1.0832921266555786, "logps/chosen": -1.0220638513565063, "logps/rejected": -4.166182994842529, "loss": 1.0276, "odds_ratio_loss": 0.05575251579284668, "rewards/accuracies": 1.0, "rewards/chosen": -0.10220638662576675, "rewards/margins": 0.3144119381904602, "rewards/rejected": -0.41661834716796875, "sft_loss": 1.0220638513565063, "step": 8245 }, { "epoch": 0.64, "grad_norm": 3.953531265258789, "learning_rate": 2.8858236246423577e-06, "logits/chosen": -1.4015694856643677, "logits/rejected": -0.9179224967956543, "logps/chosen": -0.8062652349472046, "logps/rejected": -9.229033470153809, "loss": 0.8104, "odds_ratio_loss": 0.04182159900665283, "rewards/accuracies": 1.0, "rewards/chosen": -0.08062653243541718, "rewards/margins": 0.8422768712043762, "rewards/rejected": -0.9229034185409546, "sft_loss": 0.8062652349472046, "step": 8250 }, { "epoch": 0.64, "grad_norm": 29.65342903137207, "learning_rate": 2.8802451981502215e-06, "logits/chosen": -1.290475606918335, "logits/rejected": -1.1075242757797241, "logps/chosen": -0.8379208445549011, "logps/rejected": -1.7874600887298584, "loss": 0.9005, "odds_ratio_loss": 0.6255687475204468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08379209041595459, "rewards/margins": 0.09495393186807632, "rewards/rejected": -0.1787460297346115, "sft_loss": 0.8379208445549011, "step": 8255 }, { "epoch": 0.64, "grad_norm": 7.534077167510986, "learning_rate": 2.8746699865353735e-06, "logits/chosen": -1.4055616855621338, "logits/rejected": -0.6370494961738586, "logps/chosen": -0.9448844194412231, "logps/rejected": -3.3409061431884766, "loss": 0.9652, "odds_ratio_loss": 0.20320256054401398, "rewards/accuracies": 1.0, "rewards/chosen": -0.09448844194412231, "rewards/margins": 0.23960216343402863, "rewards/rejected": -0.33409062027931213, "sft_loss": 0.9448844194412231, "step": 8260 }, { "epoch": 0.64, "grad_norm": 2.534180164337158, "learning_rate": 2.869097998253335e-06, "logits/chosen": -1.3750858306884766, "logits/rejected": -1.4756567478179932, "logps/chosen": -0.6586253643035889, "logps/rejected": -3.427091121673584, "loss": 0.6749, "odds_ratio_loss": 0.16277530789375305, "rewards/accuracies": 1.0, "rewards/chosen": -0.06586253643035889, "rewards/margins": 0.27684658765792847, "rewards/rejected": -0.34270912408828735, "sft_loss": 0.6586253643035889, "step": 8265 }, { "epoch": 0.64, "grad_norm": 11.438091278076172, "learning_rate": 2.8635292417547316e-06, "logits/chosen": -1.2504771947860718, "logits/rejected": -1.1825075149536133, "logps/chosen": -0.7640448212623596, "logps/rejected": -2.9363181591033936, "loss": 0.7794, "odds_ratio_loss": 0.15384219586849213, "rewards/accuracies": 1.0, "rewards/chosen": -0.07640448957681656, "rewards/margins": 0.21722733974456787, "rewards/rejected": -0.2936318516731262, "sft_loss": 0.7640448212623596, "step": 8270 }, { "epoch": 0.64, "grad_norm": 7.1938157081604, "learning_rate": 2.857963725485289e-06, "logits/chosen": -1.3243801593780518, "logits/rejected": -0.9105132222175598, "logps/chosen": -1.1438543796539307, "logps/rejected": -12.201528549194336, "loss": 1.1494, "odds_ratio_loss": 0.055164773017168045, "rewards/accuracies": 1.0, "rewards/chosen": -0.1143854483962059, "rewards/margins": 1.1057674884796143, "rewards/rejected": -1.2201528549194336, "sft_loss": 1.1438543796539307, "step": 8275 }, { "epoch": 0.64, "grad_norm": 93.03870391845703, "learning_rate": 2.8524014578858212e-06, "logits/chosen": -1.358870267868042, "logits/rejected": -1.174807071685791, "logps/chosen": -1.59250009059906, "logps/rejected": -4.908066272735596, "loss": 1.6391, "odds_ratio_loss": 0.4656241536140442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15925002098083496, "rewards/margins": 0.33155661821365356, "rewards/rejected": -0.4908066391944885, "sft_loss": 1.59250009059906, "step": 8280 }, { "epoch": 0.64, "grad_norm": 7.601529598236084, "learning_rate": 2.846842447392212e-06, "logits/chosen": -1.2130687236785889, "logits/rejected": -0.8104062080383301, "logps/chosen": -1.2862504720687866, "logps/rejected": -5.174140453338623, "loss": 1.3071, "odds_ratio_loss": 0.20863895118236542, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1286250501871109, "rewards/margins": 0.38878899812698364, "rewards/rejected": -0.5174140334129333, "sft_loss": 1.2862504720687866, "step": 8285 }, { "epoch": 0.64, "grad_norm": 9.283108711242676, "learning_rate": 2.841286702435408e-06, "logits/chosen": -1.1821047067642212, "logits/rejected": -0.8123686909675598, "logps/chosen": -1.0744366645812988, "logps/rejected": -2.654536724090576, "loss": 1.1324, "odds_ratio_loss": 0.5792650580406189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.107443667948246, "rewards/margins": 0.15800999104976654, "rewards/rejected": -0.26545366644859314, "sft_loss": 1.0744366645812988, "step": 8290 }, { "epoch": 0.65, "grad_norm": 66.2201919555664, "learning_rate": 2.835734231441398e-06, "logits/chosen": -1.4393401145935059, "logits/rejected": -1.3402307033538818, "logps/chosen": -1.0411721467971802, "logps/rejected": -6.103694438934326, "loss": 1.046, "odds_ratio_loss": 0.04821588844060898, "rewards/accuracies": 1.0, "rewards/chosen": -0.10411719977855682, "rewards/margins": 0.5062522292137146, "rewards/rejected": -0.6103695034980774, "sft_loss": 1.0411721467971802, "step": 8295 }, { "epoch": 0.65, "grad_norm": 12.559268951416016, "learning_rate": 2.830185042831214e-06, "logits/chosen": -1.3654600381851196, "logits/rejected": -0.9841594696044922, "logps/chosen": -0.9705682992935181, "logps/rejected": -2.2234740257263184, "loss": 0.9994, "odds_ratio_loss": 0.2879412770271301, "rewards/accuracies": 1.0, "rewards/chosen": -0.09705682843923569, "rewards/margins": 0.12529058754444122, "rewards/rejected": -0.2223474234342575, "sft_loss": 0.9705682992935181, "step": 8300 }, { "epoch": 0.65, "grad_norm": 10.072586059570312, "learning_rate": 2.824639145020903e-06, "logits/chosen": -1.1897691488265991, "logits/rejected": -1.0593864917755127, "logps/chosen": -0.8479114770889282, "logps/rejected": -2.3160197734832764, "loss": 0.8767, "odds_ratio_loss": 0.28744903206825256, "rewards/accuracies": 1.0, "rewards/chosen": -0.0847911462187767, "rewards/margins": 0.14681082963943481, "rewards/rejected": -0.23160198330879211, "sft_loss": 0.8479114770889282, "step": 8305 }, { "epoch": 0.65, "grad_norm": 32.56950378417969, "learning_rate": 2.8190965464215236e-06, "logits/chosen": -1.2548611164093018, "logits/rejected": -1.3413254022598267, "logps/chosen": -0.9717265963554382, "logps/rejected": -10.914091110229492, "loss": 0.9799, "odds_ratio_loss": 0.0821453332901001, "rewards/accuracies": 1.0, "rewards/chosen": -0.09717267006635666, "rewards/margins": 0.9942364692687988, "rewards/rejected": -1.0914090871810913, "sft_loss": 0.9717265963554382, "step": 8310 }, { "epoch": 0.65, "grad_norm": 4.923733234405518, "learning_rate": 2.8135572554391287e-06, "logits/chosen": -1.285402536392212, "logits/rejected": -0.6866047978401184, "logps/chosen": -0.892236590385437, "logps/rejected": -6.376672267913818, "loss": 0.9094, "odds_ratio_loss": 0.17121195793151855, "rewards/accuracies": 1.0, "rewards/chosen": -0.08922366797924042, "rewards/margins": 0.548443615436554, "rewards/rejected": -0.6376672983169556, "sft_loss": 0.892236590385437, "step": 8315 }, { "epoch": 0.65, "grad_norm": 11.997573852539062, "learning_rate": 2.8080212804747587e-06, "logits/chosen": -1.3283909559249878, "logits/rejected": -1.0296344757080078, "logps/chosen": -0.7988203763961792, "logps/rejected": -10.331366539001465, "loss": 0.8117, "odds_ratio_loss": 0.12899550795555115, "rewards/accuracies": 1.0, "rewards/chosen": -0.07988204061985016, "rewards/margins": 0.9532546997070312, "rewards/rejected": -1.0331367254257202, "sft_loss": 0.7988203763961792, "step": 8320 }, { "epoch": 0.65, "grad_norm": 5.004197120666504, "learning_rate": 2.802488629924419e-06, "logits/chosen": -1.3715083599090576, "logits/rejected": -1.0818204879760742, "logps/chosen": -1.191572666168213, "logps/rejected": -2.8655409812927246, "loss": 1.2159, "odds_ratio_loss": 0.24345561861991882, "rewards/accuracies": 1.0, "rewards/chosen": -0.11915726959705353, "rewards/margins": 0.16739685833454132, "rewards/rejected": -0.28655415773391724, "sft_loss": 1.191572666168213, "step": 8325 }, { "epoch": 0.65, "grad_norm": 4.403100490570068, "learning_rate": 2.7969593121790804e-06, "logits/chosen": -1.1697700023651123, "logits/rejected": -0.8697908520698547, "logps/chosen": -0.9482523798942566, "logps/rejected": -17.272789001464844, "loss": 0.9486, "odds_ratio_loss": 0.003111905185505748, "rewards/accuracies": 1.0, "rewards/chosen": -0.09482523798942566, "rewards/margins": 1.6324536800384521, "rewards/rejected": -1.7272790670394897, "sft_loss": 0.9482523798942566, "step": 8330 }, { "epoch": 0.65, "grad_norm": 21.261398315429688, "learning_rate": 2.7914333356246546e-06, "logits/chosen": -1.2155040502548218, "logits/rejected": -1.5798513889312744, "logps/chosen": -1.0860846042633057, "logps/rejected": -12.345155715942383, "loss": 1.1098, "odds_ratio_loss": 0.23688821494579315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10860846191644669, "rewards/margins": 1.1259071826934814, "rewards/rejected": -1.23451566696167, "sft_loss": 1.0860846042633057, "step": 8335 }, { "epoch": 0.65, "grad_norm": 16.56570053100586, "learning_rate": 2.7859107086419834e-06, "logits/chosen": -1.4490469694137573, "logits/rejected": -1.29300057888031, "logps/chosen": -1.2202339172363281, "logps/rejected": -7.131228446960449, "loss": 1.2332, "odds_ratio_loss": 0.1300738900899887, "rewards/accuracies": 1.0, "rewards/chosen": -0.12202338874340057, "rewards/margins": 0.5910994410514832, "rewards/rejected": -0.7131228446960449, "sft_loss": 1.2202339172363281, "step": 8340 }, { "epoch": 0.65, "grad_norm": 6.498528003692627, "learning_rate": 2.7803914396068365e-06, "logits/chosen": -1.3390071392059326, "logits/rejected": -0.6799716353416443, "logps/chosen": -0.9280698895454407, "logps/rejected": -5.283810615539551, "loss": 0.9328, "odds_ratio_loss": 0.047305118292570114, "rewards/accuracies": 1.0, "rewards/chosen": -0.09280698001384735, "rewards/margins": 0.4355740547180176, "rewards/rejected": -0.5283809900283813, "sft_loss": 0.9280698895454407, "step": 8345 }, { "epoch": 0.65, "grad_norm": 5.988420486450195, "learning_rate": 2.774875536889884e-06, "logits/chosen": -1.4749171733856201, "logits/rejected": -1.2055251598358154, "logps/chosen": -1.0997960567474365, "logps/rejected": -5.799482822418213, "loss": 1.1034, "odds_ratio_loss": 0.03586059808731079, "rewards/accuracies": 1.0, "rewards/chosen": -0.10997961461544037, "rewards/margins": 0.46996861696243286, "rewards/rejected": -0.5799482464790344, "sft_loss": 1.0997960567474365, "step": 8350 }, { "epoch": 0.65, "grad_norm": 6.665837287902832, "learning_rate": 2.7693630088566927e-06, "logits/chosen": -1.4081647396087646, "logits/rejected": -1.2188732624053955, "logps/chosen": -0.9331458210945129, "logps/rejected": -5.226067543029785, "loss": 0.9604, "odds_ratio_loss": 0.2725379168987274, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09331458806991577, "rewards/margins": 0.4292922019958496, "rewards/rejected": -0.5226067900657654, "sft_loss": 0.9331458210945129, "step": 8355 }, { "epoch": 0.65, "grad_norm": 12.168383598327637, "learning_rate": 2.763853863867708e-06, "logits/chosen": -1.4207031726837158, "logits/rejected": -1.0859181880950928, "logps/chosen": -0.7639963626861572, "logps/rejected": -1.5648362636566162, "loss": 0.79, "odds_ratio_loss": 0.26012665033340454, "rewards/accuracies": 1.0, "rewards/chosen": -0.07639963924884796, "rewards/margins": 0.08008398115634918, "rewards/rejected": -0.15648362040519714, "sft_loss": 0.7639963626861572, "step": 8360 }, { "epoch": 0.65, "grad_norm": 5.338338851928711, "learning_rate": 2.758348110278254e-06, "logits/chosen": -1.3823060989379883, "logits/rejected": -1.1109060049057007, "logps/chosen": -0.8929181098937988, "logps/rejected": -10.302949905395508, "loss": 0.8986, "odds_ratio_loss": 0.05642740800976753, "rewards/accuracies": 1.0, "rewards/chosen": -0.08929181098937988, "rewards/margins": 0.9410032033920288, "rewards/rejected": -1.0302950143814087, "sft_loss": 0.8929181098937988, "step": 8365 }, { "epoch": 0.65, "grad_norm": 12.998638153076172, "learning_rate": 2.7528457564385036e-06, "logits/chosen": -1.2639782428741455, "logits/rejected": -1.2002880573272705, "logps/chosen": -1.0140728950500488, "logps/rejected": -4.678340911865234, "loss": 1.0299, "odds_ratio_loss": 0.15787038207054138, "rewards/accuracies": 1.0, "rewards/chosen": -0.10140728950500488, "rewards/margins": 0.3664267957210541, "rewards/rejected": -0.46783414483070374, "sft_loss": 1.0140728950500488, "step": 8370 }, { "epoch": 0.65, "grad_norm": 15.505352020263672, "learning_rate": 2.74734681069347e-06, "logits/chosen": -1.1000642776489258, "logits/rejected": -1.1080691814422607, "logps/chosen": -0.7868161797523499, "logps/rejected": -5.477625370025635, "loss": 0.8012, "odds_ratio_loss": 0.14340201020240784, "rewards/accuracies": 1.0, "rewards/chosen": -0.07868161052465439, "rewards/margins": 0.4690808653831482, "rewards/rejected": -0.5477625131607056, "sft_loss": 0.7868161797523499, "step": 8375 }, { "epoch": 0.65, "grad_norm": 8.352320671081543, "learning_rate": 2.7418512813830077e-06, "logits/chosen": -1.3259737491607666, "logits/rejected": -1.0996830463409424, "logps/chosen": -1.5631464719772339, "logps/rejected": -6.814671516418457, "loss": 1.6234, "odds_ratio_loss": 0.6024779081344604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1563146561384201, "rewards/margins": 0.5251525044441223, "rewards/rejected": -0.6814671754837036, "sft_loss": 1.5631464719772339, "step": 8380 }, { "epoch": 0.65, "grad_norm": 4.832741737365723, "learning_rate": 2.7363591768417825e-06, "logits/chosen": -1.333432912826538, "logits/rejected": -1.1490983963012695, "logps/chosen": -0.8162245750427246, "logps/rejected": -7.255775451660156, "loss": 0.8201, "odds_ratio_loss": 0.03829885274171829, "rewards/accuracies": 1.0, "rewards/chosen": -0.08162246644496918, "rewards/margins": 0.6439551115036011, "rewards/rejected": -0.7255775332450867, "sft_loss": 0.8162245750427246, "step": 8385 }, { "epoch": 0.65, "grad_norm": 9.65146255493164, "learning_rate": 2.730870505399267e-06, "logits/chosen": -1.4474256038665771, "logits/rejected": -1.1180304288864136, "logps/chosen": -0.7661795020103455, "logps/rejected": -4.420128345489502, "loss": 0.7841, "odds_ratio_loss": 0.17968173325061798, "rewards/accuracies": 1.0, "rewards/chosen": -0.07661795616149902, "rewards/margins": 0.3653948903083801, "rewards/rejected": -0.4420127868652344, "sft_loss": 0.7661795020103455, "step": 8390 }, { "epoch": 0.65, "grad_norm": 247.20248413085938, "learning_rate": 2.7253852753797315e-06, "logits/chosen": -1.2072155475616455, "logits/rejected": -1.2853707075119019, "logps/chosen": -1.166669249534607, "logps/rejected": -9.404703140258789, "loss": 1.1773, "odds_ratio_loss": 0.1061633825302124, "rewards/accuracies": 1.0, "rewards/chosen": -0.11666693538427353, "rewards/margins": 0.8238033056259155, "rewards/rejected": -0.9404702186584473, "sft_loss": 1.166669249534607, "step": 8395 }, { "epoch": 0.65, "grad_norm": 9.424349784851074, "learning_rate": 2.719903495102223e-06, "logits/chosen": -1.3545876741409302, "logits/rejected": -1.0364861488342285, "logps/chosen": -1.0813119411468506, "logps/rejected": -8.046969413757324, "loss": 1.095, "odds_ratio_loss": 0.13737812638282776, "rewards/accuracies": 1.0, "rewards/chosen": -0.10813118517398834, "rewards/margins": 0.6965658068656921, "rewards/rejected": -0.8046969175338745, "sft_loss": 1.0813119411468506, "step": 8400 }, { "epoch": 0.65, "grad_norm": 4.774113178253174, "learning_rate": 2.714425172880554e-06, "logits/chosen": -1.3201053142547607, "logits/rejected": -0.8986188769340515, "logps/chosen": -0.7992849946022034, "logps/rejected": -4.5283308029174805, "loss": 0.8134, "odds_ratio_loss": 0.14078517258167267, "rewards/accuracies": 1.0, "rewards/chosen": -0.07992849498987198, "rewards/margins": 0.37290462851524353, "rewards/rejected": -0.4528331160545349, "sft_loss": 0.7992849946022034, "step": 8405 }, { "epoch": 0.65, "grad_norm": 6.051595687866211, "learning_rate": 2.7089503170233e-06, "logits/chosen": -1.2442939281463623, "logits/rejected": -1.1158170700073242, "logps/chosen": -1.9196693897247314, "logps/rejected": -6.476452827453613, "loss": 1.9491, "odds_ratio_loss": 0.2945200800895691, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1919669210910797, "rewards/margins": 0.45567837357521057, "rewards/rejected": -0.6476452946662903, "sft_loss": 1.9196693897247314, "step": 8410 }, { "epoch": 0.65, "grad_norm": 5.730443000793457, "learning_rate": 2.7034789358337743e-06, "logits/chosen": -1.3807923793792725, "logits/rejected": -0.6011480689048767, "logps/chosen": -0.8076326251029968, "logps/rejected": -7.6511664390563965, "loss": 0.8141, "odds_ratio_loss": 0.06483285129070282, "rewards/accuracies": 1.0, "rewards/chosen": -0.08076325803995132, "rewards/margins": 0.6843534111976624, "rewards/rejected": -0.7651166915893555, "sft_loss": 0.8076326251029968, "step": 8415 }, { "epoch": 0.65, "grad_norm": 8.752543449401855, "learning_rate": 2.6980110376100187e-06, "logits/chosen": -1.373929738998413, "logits/rejected": -0.8665214776992798, "logps/chosen": -0.9539247751235962, "logps/rejected": -7.036995887756348, "loss": 0.9627, "odds_ratio_loss": 0.08804039657115936, "rewards/accuracies": 1.0, "rewards/chosen": -0.09539248049259186, "rewards/margins": 0.6083070635795593, "rewards/rejected": -0.7036995887756348, "sft_loss": 0.9539247751235962, "step": 8420 }, { "epoch": 0.66, "grad_norm": 13.451896667480469, "learning_rate": 2.692546630644797e-06, "logits/chosen": -1.3927185535430908, "logits/rejected": -1.1633400917053223, "logps/chosen": -0.5796695947647095, "logps/rejected": -7.085219383239746, "loss": 0.6154, "odds_ratio_loss": 0.3574976623058319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05796695873141289, "rewards/margins": 0.6505550146102905, "rewards/rejected": -0.7085219621658325, "sft_loss": 0.5796695947647095, "step": 8425 }, { "epoch": 0.66, "grad_norm": 43.32596206665039, "learning_rate": 2.6870857232255764e-06, "logits/chosen": -1.469041109085083, "logits/rejected": -0.9503668546676636, "logps/chosen": -1.0551387071609497, "logps/rejected": -6.713972568511963, "loss": 1.1806, "odds_ratio_loss": 1.255061388015747, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10551387071609497, "rewards/margins": 0.5658833384513855, "rewards/rejected": -0.6713972091674805, "sft_loss": 1.0551387071609497, "step": 8430 }, { "epoch": 0.66, "grad_norm": 4.562943458557129, "learning_rate": 2.6816283236345143e-06, "logits/chosen": -1.1838380098342896, "logits/rejected": -1.3772586584091187, "logps/chosen": -0.830003559589386, "logps/rejected": -10.05511474609375, "loss": 0.8466, "odds_ratio_loss": 0.165578693151474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08300035446882248, "rewards/margins": 0.922511100769043, "rewards/rejected": -1.0055115222930908, "sft_loss": 0.830003559589386, "step": 8435 }, { "epoch": 0.66, "grad_norm": 9.076092720031738, "learning_rate": 2.67617444014845e-06, "logits/chosen": -1.3671481609344482, "logits/rejected": -0.9834438562393188, "logps/chosen": -0.981173038482666, "logps/rejected": -4.925403594970703, "loss": 0.9975, "odds_ratio_loss": 0.1635417342185974, "rewards/accuracies": 1.0, "rewards/chosen": -0.09811730682849884, "rewards/margins": 0.39442306756973267, "rewards/rejected": -0.4925404191017151, "sft_loss": 0.981173038482666, "step": 8440 }, { "epoch": 0.66, "grad_norm": 31.632305145263672, "learning_rate": 2.6707240810388933e-06, "logits/chosen": -1.2191075086593628, "logits/rejected": -1.3692381381988525, "logps/chosen": -0.8065007925033569, "logps/rejected": -2.877467393875122, "loss": 0.8654, "odds_ratio_loss": 0.5888963341712952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08065007627010345, "rewards/margins": 0.2070966213941574, "rewards/rejected": -0.28774672746658325, "sft_loss": 0.8065007925033569, "step": 8445 }, { "epoch": 0.66, "grad_norm": 32.29814910888672, "learning_rate": 2.665277254572005e-06, "logits/chosen": -1.227386236190796, "logits/rejected": -0.7855269312858582, "logps/chosen": -1.0738370418548584, "logps/rejected": -2.9455013275146484, "loss": 1.0988, "odds_ratio_loss": 0.24924306571483612, "rewards/accuracies": 1.0, "rewards/chosen": -0.10738371312618256, "rewards/margins": 0.18716642260551453, "rewards/rejected": -0.2945501208305359, "sft_loss": 1.0738370418548584, "step": 8450 }, { "epoch": 0.66, "grad_norm": 6.155773639678955, "learning_rate": 2.659833969008585e-06, "logits/chosen": -1.3078839778900146, "logits/rejected": -0.8803592920303345, "logps/chosen": -0.9628534317016602, "logps/rejected": -6.471461296081543, "loss": 0.9819, "odds_ratio_loss": 0.19075781106948853, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09628535062074661, "rewards/margins": 0.5508608222007751, "rewards/rejected": -0.6471462249755859, "sft_loss": 0.9628534317016602, "step": 8455 }, { "epoch": 0.66, "grad_norm": 11.189478874206543, "learning_rate": 2.6543942326040728e-06, "logits/chosen": -1.4053690433502197, "logits/rejected": -0.7008453607559204, "logps/chosen": -1.1526126861572266, "logps/rejected": -7.940212249755859, "loss": 1.1566, "odds_ratio_loss": 0.039906956255435944, "rewards/accuracies": 1.0, "rewards/chosen": -0.1152612715959549, "rewards/margins": 0.6787599921226501, "rewards/rejected": -0.7940212488174438, "sft_loss": 1.1526126861572266, "step": 8460 }, { "epoch": 0.66, "grad_norm": 6.9324517250061035, "learning_rate": 2.6489580536085163e-06, "logits/chosen": -1.4052064418792725, "logits/rejected": -1.0391250848770142, "logps/chosen": -0.8362232446670532, "logps/rejected": -7.123330116271973, "loss": 0.8524, "odds_ratio_loss": 0.16156096756458282, "rewards/accuracies": 1.0, "rewards/chosen": -0.08362232148647308, "rewards/margins": 0.6287106871604919, "rewards/rejected": -0.712333083152771, "sft_loss": 0.8362232446670532, "step": 8465 }, { "epoch": 0.66, "grad_norm": 4.802675247192383, "learning_rate": 2.6435254402665695e-06, "logits/chosen": -1.352170705795288, "logits/rejected": -0.9224128723144531, "logps/chosen": -0.982093334197998, "logps/rejected": -5.067374229431152, "loss": 0.9844, "odds_ratio_loss": 0.02332504466176033, "rewards/accuracies": 1.0, "rewards/chosen": -0.09820933640003204, "rewards/margins": 0.40852808952331543, "rewards/rejected": -0.5067374110221863, "sft_loss": 0.982093334197998, "step": 8470 }, { "epoch": 0.66, "grad_norm": 14.048432350158691, "learning_rate": 2.6380964008174836e-06, "logits/chosen": -1.1690585613250732, "logits/rejected": -1.1874010562896729, "logps/chosen": -1.1043365001678467, "logps/rejected": -4.214531898498535, "loss": 1.1143, "odds_ratio_loss": 0.09924004226922989, "rewards/accuracies": 1.0, "rewards/chosen": -0.1104336529970169, "rewards/margins": 0.31101956963539124, "rewards/rejected": -0.42145317792892456, "sft_loss": 1.1043365001678467, "step": 8475 }, { "epoch": 0.66, "grad_norm": 26.32798194885254, "learning_rate": 2.632670943495086e-06, "logits/chosen": -1.2947049140930176, "logits/rejected": -0.8347790837287903, "logps/chosen": -1.1103200912475586, "logps/rejected": -4.632498741149902, "loss": 1.1331, "odds_ratio_loss": 0.22751355171203613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11103200912475586, "rewards/margins": 0.3522178828716278, "rewards/rejected": -0.4632498621940613, "sft_loss": 1.1103200912475586, "step": 8480 }, { "epoch": 0.66, "grad_norm": 9.29808235168457, "learning_rate": 2.6272490765277716e-06, "logits/chosen": -1.3132994174957275, "logits/rejected": -1.065861463546753, "logps/chosen": -1.1188445091247559, "logps/rejected": -5.130213260650635, "loss": 1.1395, "odds_ratio_loss": 0.20684120059013367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1118844598531723, "rewards/margins": 0.4011368751525879, "rewards/rejected": -0.5130213499069214, "sft_loss": 1.1188445091247559, "step": 8485 }, { "epoch": 0.66, "grad_norm": 10.75184154510498, "learning_rate": 2.621830808138485e-06, "logits/chosen": -1.3912122249603271, "logits/rejected": -1.1479923725128174, "logps/chosen": -1.0153714418411255, "logps/rejected": -7.170912742614746, "loss": 1.0245, "odds_ratio_loss": 0.0912996158003807, "rewards/accuracies": 1.0, "rewards/chosen": -0.10153714567422867, "rewards/margins": 0.6155540943145752, "rewards/rejected": -0.7170912027359009, "sft_loss": 1.0153714418411255, "step": 8490 }, { "epoch": 0.66, "grad_norm": 29.56340789794922, "learning_rate": 2.6164161465447235e-06, "logits/chosen": -1.2630358934402466, "logits/rejected": -0.7791035771369934, "logps/chosen": -1.0668691396713257, "logps/rejected": -2.717613935470581, "loss": 1.0958, "odds_ratio_loss": 0.28952598571777344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10668691247701645, "rewards/margins": 0.16507449746131897, "rewards/rejected": -0.27176138758659363, "sft_loss": 1.0668691396713257, "step": 8495 }, { "epoch": 0.66, "grad_norm": 9.063817977905273, "learning_rate": 2.611005099958508e-06, "logits/chosen": -1.3539373874664307, "logits/rejected": -0.797394871711731, "logps/chosen": -0.9918476343154907, "logps/rejected": -2.190828323364258, "loss": 1.0311, "odds_ratio_loss": 0.39265957474708557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09918475151062012, "rewards/margins": 0.11989808082580566, "rewards/rejected": -0.21908283233642578, "sft_loss": 0.9918476343154907, "step": 8500 }, { "epoch": 0.66, "grad_norm": 79.72722625732422, "learning_rate": 2.6055976765863744e-06, "logits/chosen": -1.391193151473999, "logits/rejected": -0.9862115979194641, "logps/chosen": -1.048738718032837, "logps/rejected": -6.968371391296387, "loss": 1.109, "odds_ratio_loss": 0.6028006672859192, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10487387329339981, "rewards/margins": 0.5919632315635681, "rewards/rejected": -0.6968370676040649, "sft_loss": 1.048738718032837, "step": 8505 }, { "epoch": 0.66, "grad_norm": 21.11467933654785, "learning_rate": 2.6001938846293717e-06, "logits/chosen": -1.4116090536117554, "logits/rejected": -1.1980160474777222, "logps/chosen": -1.0626676082611084, "logps/rejected": -5.518164157867432, "loss": 1.0703, "odds_ratio_loss": 0.07616396248340607, "rewards/accuracies": 1.0, "rewards/chosen": -0.10626678168773651, "rewards/margins": 0.4455496370792389, "rewards/rejected": -0.5518164038658142, "sft_loss": 1.0626676082611084, "step": 8510 }, { "epoch": 0.66, "grad_norm": 28.986602783203125, "learning_rate": 2.5947937322830346e-06, "logits/chosen": -1.4573067426681519, "logits/rejected": -0.9762972593307495, "logps/chosen": -1.0367553234100342, "logps/rejected": -5.620980739593506, "loss": 1.0498, "odds_ratio_loss": 0.12997016310691833, "rewards/accuracies": 1.0, "rewards/chosen": -0.10367554426193237, "rewards/margins": 0.45842257142066956, "rewards/rejected": -0.5620980858802795, "sft_loss": 1.0367553234100342, "step": 8515 }, { "epoch": 0.66, "grad_norm": 4.6948652267456055, "learning_rate": 2.589397227737377e-06, "logits/chosen": -1.3029415607452393, "logits/rejected": -0.7049711346626282, "logps/chosen": -1.0776302814483643, "logps/rejected": -4.658875942230225, "loss": 1.0874, "odds_ratio_loss": 0.09810274839401245, "rewards/accuracies": 1.0, "rewards/chosen": -0.10776302963495255, "rewards/margins": 0.35812458395957947, "rewards/rejected": -0.4658876359462738, "sft_loss": 1.0776302814483643, "step": 8520 }, { "epoch": 0.66, "grad_norm": 75.1124267578125, "learning_rate": 2.5840043791768876e-06, "logits/chosen": -1.0878616571426392, "logits/rejected": -1.3250956535339355, "logps/chosen": -1.1645736694335938, "logps/rejected": -5.899996280670166, "loss": 1.1723, "odds_ratio_loss": 0.07772447168827057, "rewards/accuracies": 1.0, "rewards/chosen": -0.11645736545324326, "rewards/margins": 0.4735422730445862, "rewards/rejected": -0.5899996161460876, "sft_loss": 1.1645736694335938, "step": 8525 }, { "epoch": 0.66, "grad_norm": 6.188475608825684, "learning_rate": 2.5786151947805045e-06, "logits/chosen": -1.4203815460205078, "logits/rejected": -0.9625504612922668, "logps/chosen": -0.9616947174072266, "logps/rejected": -6.568534851074219, "loss": 0.9895, "odds_ratio_loss": 0.2777538299560547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09616947174072266, "rewards/margins": 0.560684084892273, "rewards/rejected": -0.6568534970283508, "sft_loss": 0.9616947174072266, "step": 8530 }, { "epoch": 0.66, "grad_norm": 25.69866180419922, "learning_rate": 2.5732296827216086e-06, "logits/chosen": -1.266811490058899, "logits/rejected": -1.3779981136322021, "logps/chosen": -1.1356754302978516, "logps/rejected": -10.114517211914062, "loss": 1.1773, "odds_ratio_loss": 0.4164787232875824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1135675460100174, "rewards/margins": 0.8978842496871948, "rewards/rejected": -1.0114517211914062, "sft_loss": 1.1356754302978516, "step": 8535 }, { "epoch": 0.66, "grad_norm": 8.122133255004883, "learning_rate": 2.5678478511680143e-06, "logits/chosen": -1.3327136039733887, "logits/rejected": -0.841677188873291, "logps/chosen": -0.6438709497451782, "logps/rejected": -6.755641937255859, "loss": 0.6519, "odds_ratio_loss": 0.08069080114364624, "rewards/accuracies": 1.0, "rewards/chosen": -0.06438709795475006, "rewards/margins": 0.6111770868301392, "rewards/rejected": -0.675564169883728, "sft_loss": 0.6438709497451782, "step": 8540 }, { "epoch": 0.66, "grad_norm": 7.290009498596191, "learning_rate": 2.5624697082819517e-06, "logits/chosen": -1.1562501192092896, "logits/rejected": -1.0644404888153076, "logps/chosen": -0.6335257291793823, "logps/rejected": -8.344648361206055, "loss": 0.6375, "odds_ratio_loss": 0.03943305462598801, "rewards/accuracies": 1.0, "rewards/chosen": -0.06335257738828659, "rewards/margins": 0.771112322807312, "rewards/rejected": -0.8344649076461792, "sft_loss": 0.6335257291793823, "step": 8545 }, { "epoch": 0.67, "grad_norm": 10.739457130432129, "learning_rate": 2.5570952622200575e-06, "logits/chosen": -1.4692208766937256, "logits/rejected": -1.1127398014068604, "logps/chosen": -1.4549614191055298, "logps/rejected": -10.238380432128906, "loss": 1.4863, "odds_ratio_loss": 0.31321436166763306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14549614489078522, "rewards/margins": 0.8783419728279114, "rewards/rejected": -1.0238380432128906, "sft_loss": 1.4549614191055298, "step": 8550 }, { "epoch": 0.67, "grad_norm": 24.13983917236328, "learning_rate": 2.5517245211333585e-06, "logits/chosen": -1.2598793506622314, "logits/rejected": -1.0114340782165527, "logps/chosen": -1.1584948301315308, "logps/rejected": -4.865040302276611, "loss": 1.1745, "odds_ratio_loss": 0.16014239192008972, "rewards/accuracies": 1.0, "rewards/chosen": -0.11584948003292084, "rewards/margins": 0.37065452337265015, "rewards/rejected": -0.4865039885044098, "sft_loss": 1.1584948301315308, "step": 8555 }, { "epoch": 0.67, "grad_norm": 18.325902938842773, "learning_rate": 2.5463574931672714e-06, "logits/chosen": -1.2867510318756104, "logits/rejected": -1.064446210861206, "logps/chosen": -0.9027272462844849, "logps/rejected": -6.976473808288574, "loss": 0.9577, "odds_ratio_loss": 0.5501552820205688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09027273207902908, "rewards/margins": 0.6073746681213379, "rewards/rejected": -0.6976473927497864, "sft_loss": 0.9027272462844849, "step": 8560 }, { "epoch": 0.67, "grad_norm": 26.868846893310547, "learning_rate": 2.5409941864615717e-06, "logits/chosen": -1.2793635129928589, "logits/rejected": -1.092800259590149, "logps/chosen": -0.8769108057022095, "logps/rejected": -6.240958213806152, "loss": 0.8987, "odds_ratio_loss": 0.21805354952812195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08769109100103378, "rewards/margins": 0.5364047884941101, "rewards/rejected": -0.6240958571434021, "sft_loss": 0.8769108057022095, "step": 8565 }, { "epoch": 0.67, "grad_norm": 16.86786460876465, "learning_rate": 2.535634609150395e-06, "logits/chosen": -1.327836275100708, "logits/rejected": -1.0574593544006348, "logps/chosen": -1.1664834022521973, "logps/rejected": -4.941313743591309, "loss": 1.1825, "odds_ratio_loss": 0.1604529619216919, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11664833873510361, "rewards/margins": 0.3774830996990204, "rewards/rejected": -0.494131475687027, "sft_loss": 1.1664834022521973, "step": 8570 }, { "epoch": 0.67, "grad_norm": 4.323856830596924, "learning_rate": 2.5302787693622223e-06, "logits/chosen": -1.4250985383987427, "logits/rejected": -1.1745866537094116, "logps/chosen": -0.7151682376861572, "logps/rejected": -4.566773891448975, "loss": 0.7262, "odds_ratio_loss": 0.11058475822210312, "rewards/accuracies": 1.0, "rewards/chosen": -0.07151682674884796, "rewards/margins": 0.3851606249809265, "rewards/rejected": -0.4566774368286133, "sft_loss": 0.7151682376861572, "step": 8575 }, { "epoch": 0.67, "grad_norm": 4.649173259735107, "learning_rate": 2.5249266752198644e-06, "logits/chosen": -1.2564361095428467, "logits/rejected": -0.82867431640625, "logps/chosen": -0.9718457460403442, "logps/rejected": -3.516380786895752, "loss": 1.0028, "odds_ratio_loss": 0.3100363314151764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09718457609415054, "rewards/margins": 0.25445348024368286, "rewards/rejected": -0.3516380786895752, "sft_loss": 0.9718457460403442, "step": 8580 }, { "epoch": 0.67, "grad_norm": 6.03109884262085, "learning_rate": 2.519578334840449e-06, "logits/chosen": -1.2383915185928345, "logits/rejected": -0.9514943957328796, "logps/chosen": -1.175445318222046, "logps/rejected": -7.592044830322266, "loss": 1.1958, "odds_ratio_loss": 0.20304739475250244, "rewards/accuracies": 1.0, "rewards/chosen": -0.11754453182220459, "rewards/margins": 0.6416599154472351, "rewards/rejected": -0.7592045068740845, "sft_loss": 1.175445318222046, "step": 8585 }, { "epoch": 0.67, "grad_norm": 13.900643348693848, "learning_rate": 2.514233756335417e-06, "logits/chosen": -1.415035605430603, "logits/rejected": -1.2320356369018555, "logps/chosen": -0.7346863746643066, "logps/rejected": -3.777554750442505, "loss": 0.7469, "odds_ratio_loss": 0.12233449518680573, "rewards/accuracies": 1.0, "rewards/chosen": -0.0734686404466629, "rewards/margins": 0.3042868673801422, "rewards/rejected": -0.37775546312332153, "sft_loss": 0.7346863746643066, "step": 8590 }, { "epoch": 0.67, "grad_norm": 25.50760269165039, "learning_rate": 2.5088929478104993e-06, "logits/chosen": -1.3608187437057495, "logits/rejected": -0.8603233098983765, "logps/chosen": -0.862570583820343, "logps/rejected": -9.544593811035156, "loss": 0.9183, "odds_ratio_loss": 0.5568982362747192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08625705540180206, "rewards/margins": 0.8682023882865906, "rewards/rejected": -0.9544594883918762, "sft_loss": 0.862570583820343, "step": 8595 }, { "epoch": 0.67, "grad_norm": 11.82734489440918, "learning_rate": 2.503555917365711e-06, "logits/chosen": -1.062488079071045, "logits/rejected": -1.4695297479629517, "logps/chosen": -1.097212791442871, "logps/rejected": -7.532125949859619, "loss": 1.1187, "odds_ratio_loss": 0.21446946263313293, "rewards/accuracies": 1.0, "rewards/chosen": -0.10972128063440323, "rewards/margins": 0.6434913873672485, "rewards/rejected": -0.753212571144104, "sft_loss": 1.097212791442871, "step": 8600 }, { "epoch": 0.67, "grad_norm": 8.268609046936035, "learning_rate": 2.4982226730953315e-06, "logits/chosen": -1.3083078861236572, "logits/rejected": -1.256940484046936, "logps/chosen": -1.0806376934051514, "logps/rejected": -7.487344264984131, "loss": 1.0854, "odds_ratio_loss": 0.047921765595674515, "rewards/accuracies": 1.0, "rewards/chosen": -0.10806377232074738, "rewards/margins": 0.6406707167625427, "rewards/rejected": -0.7487344741821289, "sft_loss": 1.0806376934051514, "step": 8605 }, { "epoch": 0.67, "grad_norm": 85.88379669189453, "learning_rate": 2.4928932230879076e-06, "logits/chosen": -1.2781380414962769, "logits/rejected": -1.4369587898254395, "logps/chosen": -1.1298456192016602, "logps/rejected": -16.079452514648438, "loss": 1.1302, "odds_ratio_loss": 0.003542351070791483, "rewards/accuracies": 1.0, "rewards/chosen": -0.11298457533121109, "rewards/margins": 1.4949607849121094, "rewards/rejected": -1.607945203781128, "sft_loss": 1.1298456192016602, "step": 8610 }, { "epoch": 0.67, "grad_norm": 9.954533576965332, "learning_rate": 2.4875675754262265e-06, "logits/chosen": -1.3310059309005737, "logits/rejected": -1.3483989238739014, "logps/chosen": -0.8747609257698059, "logps/rejected": -7.9532470703125, "loss": 0.8769, "odds_ratio_loss": 0.02184412255883217, "rewards/accuracies": 1.0, "rewards/chosen": -0.08747608959674835, "rewards/margins": 0.7078485488891602, "rewards/rejected": -0.7953246831893921, "sft_loss": 0.8747609257698059, "step": 8615 }, { "epoch": 0.67, "grad_norm": 11.167495727539062, "learning_rate": 2.4822457381873055e-06, "logits/chosen": -1.3983945846557617, "logits/rejected": -1.107145071029663, "logps/chosen": -1.239545226097107, "logps/rejected": -6.608633518218994, "loss": 1.2722, "odds_ratio_loss": 0.3261207342147827, "rewards/accuracies": 1.0, "rewards/chosen": -0.12395453453063965, "rewards/margins": 0.5369088053703308, "rewards/rejected": -0.6608633399009705, "sft_loss": 1.239545226097107, "step": 8620 }, { "epoch": 0.67, "grad_norm": 48.37714767456055, "learning_rate": 2.476927719442391e-06, "logits/chosen": -1.4697296619415283, "logits/rejected": -1.104361891746521, "logps/chosen": -1.1724387407302856, "logps/rejected": -6.618524074554443, "loss": 1.1824, "odds_ratio_loss": 0.10003963857889175, "rewards/accuracies": 1.0, "rewards/chosen": -0.11724387109279633, "rewards/margins": 0.5446085333824158, "rewards/rejected": -0.6618523597717285, "sft_loss": 1.1724387407302856, "step": 8625 }, { "epoch": 0.67, "grad_norm": 13.710233688354492, "learning_rate": 2.471613527256932e-06, "logits/chosen": -1.3458722829818726, "logits/rejected": -1.153869867324829, "logps/chosen": -0.9881760478019714, "logps/rejected": -10.99592113494873, "loss": 0.9948, "odds_ratio_loss": 0.06657255440950394, "rewards/accuracies": 1.0, "rewards/chosen": -0.0988176092505455, "rewards/margins": 1.000774621963501, "rewards/rejected": -1.0995922088623047, "sft_loss": 0.9881760478019714, "step": 8630 }, { "epoch": 0.67, "grad_norm": 5.124767780303955, "learning_rate": 2.4663031696905732e-06, "logits/chosen": -1.375286340713501, "logits/rejected": -1.0435411930084229, "logps/chosen": -0.9623756408691406, "logps/rejected": -4.443991661071777, "loss": 0.9678, "odds_ratio_loss": 0.053923167288303375, "rewards/accuracies": 1.0, "rewards/chosen": -0.09623756259679794, "rewards/margins": 0.34816163778305054, "rewards/rejected": -0.4443992078304291, "sft_loss": 0.9623756408691406, "step": 8635 }, { "epoch": 0.67, "grad_norm": 5.593282699584961, "learning_rate": 2.4609966547971505e-06, "logits/chosen": -1.276353359222412, "logits/rejected": -0.8955556154251099, "logps/chosen": -0.9168095588684082, "logps/rejected": -4.759482383728027, "loss": 0.941, "odds_ratio_loss": 0.24143771827220917, "rewards/accuracies": 1.0, "rewards/chosen": -0.09168095886707306, "rewards/margins": 0.38426730036735535, "rewards/rejected": -0.4759482443332672, "sft_loss": 0.9168095588684082, "step": 8640 }, { "epoch": 0.67, "grad_norm": 32.27127456665039, "learning_rate": 2.4556939906246644e-06, "logits/chosen": -1.1802663803100586, "logits/rejected": -1.1810654401779175, "logps/chosen": -1.0008405447006226, "logps/rejected": -7.588818550109863, "loss": 1.0307, "odds_ratio_loss": 0.2982523441314697, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10008406639099121, "rewards/margins": 0.6587977409362793, "rewards/rejected": -0.7588818669319153, "sft_loss": 1.0008405447006226, "step": 8645 }, { "epoch": 0.67, "grad_norm": 4.079829216003418, "learning_rate": 2.4503951852152803e-06, "logits/chosen": -1.3074984550476074, "logits/rejected": -0.7468923330307007, "logps/chosen": -0.9436966180801392, "logps/rejected": -6.366265296936035, "loss": 0.9684, "odds_ratio_loss": 0.24740377068519592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09436966478824615, "rewards/margins": 0.5422569513320923, "rewards/rejected": -0.6366265416145325, "sft_loss": 0.9436966180801392, "step": 8650 }, { "epoch": 0.67, "grad_norm": 5.481407642364502, "learning_rate": 2.4451002466053075e-06, "logits/chosen": -1.199569821357727, "logits/rejected": -1.1957504749298096, "logps/chosen": -0.8332787752151489, "logps/rejected": -4.7142720222473145, "loss": 0.9124, "odds_ratio_loss": 0.7910371422767639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08332787454128265, "rewards/margins": 0.38809934258461, "rewards/rejected": -0.47142720222473145, "sft_loss": 0.8332787752151489, "step": 8655 }, { "epoch": 0.67, "grad_norm": 6.620190620422363, "learning_rate": 2.4398091828251935e-06, "logits/chosen": -0.940380871295929, "logits/rejected": -1.2794045209884644, "logps/chosen": -1.1835598945617676, "logps/rejected": -8.489274024963379, "loss": 1.1934, "odds_ratio_loss": 0.0984053835272789, "rewards/accuracies": 1.0, "rewards/chosen": -0.11835597455501556, "rewards/margins": 0.7305713891983032, "rewards/rejected": -0.84892737865448, "sft_loss": 1.1835598945617676, "step": 8660 }, { "epoch": 0.67, "grad_norm": 6.176886081695557, "learning_rate": 2.4345220018995086e-06, "logits/chosen": -1.3382463455200195, "logits/rejected": -0.9954336881637573, "logps/chosen": -1.0478094816207886, "logps/rejected": -17.834476470947266, "loss": 1.0495, "odds_ratio_loss": 0.01731877587735653, "rewards/accuracies": 1.0, "rewards/chosen": -0.10478094965219498, "rewards/margins": 1.6786667108535767, "rewards/rejected": -1.7834476232528687, "sft_loss": 1.0478094816207886, "step": 8665 }, { "epoch": 0.67, "grad_norm": 4.765834331512451, "learning_rate": 2.429238711846932e-06, "logits/chosen": -1.1585235595703125, "logits/rejected": -0.7135952711105347, "logps/chosen": -1.0281130075454712, "logps/rejected": -11.070489883422852, "loss": 1.0394, "odds_ratio_loss": 0.11250078678131104, "rewards/accuracies": 1.0, "rewards/chosen": -0.102811299264431, "rewards/margins": 1.0042376518249512, "rewards/rejected": -1.1070489883422852, "sft_loss": 1.0281130075454712, "step": 8670 }, { "epoch": 0.67, "grad_norm": 5.145506858825684, "learning_rate": 2.4239593206802465e-06, "logits/chosen": -1.3943018913269043, "logits/rejected": -0.9713207483291626, "logps/chosen": -0.9120500683784485, "logps/rejected": -5.5587615966796875, "loss": 0.9227, "odds_ratio_loss": 0.10694025456905365, "rewards/accuracies": 1.0, "rewards/chosen": -0.09120501577854156, "rewards/margins": 0.46467119455337524, "rewards/rejected": -0.5558761358261108, "sft_loss": 0.9120500683784485, "step": 8675 }, { "epoch": 0.68, "grad_norm": 4.959620952606201, "learning_rate": 2.418683836406318e-06, "logits/chosen": -1.3747514486312866, "logits/rejected": -0.5954295992851257, "logps/chosen": -1.0089941024780273, "logps/rejected": -6.331844329833984, "loss": 1.0323, "odds_ratio_loss": 0.23257341980934143, "rewards/accuracies": 1.0, "rewards/chosen": -0.10089939832687378, "rewards/margins": 0.5322850942611694, "rewards/rejected": -0.6331844329833984, "sft_loss": 1.0089941024780273, "step": 8680 }, { "epoch": 0.68, "grad_norm": 9.242082595825195, "learning_rate": 2.4134122670260875e-06, "logits/chosen": -1.2145757675170898, "logits/rejected": -1.2564046382904053, "logps/chosen": -1.1589300632476807, "logps/rejected": -8.16043472290039, "loss": 1.1596, "odds_ratio_loss": 0.007167732808738947, "rewards/accuracies": 1.0, "rewards/chosen": -0.11589299142360687, "rewards/margins": 0.7001504898071289, "rewards/rejected": -0.8160433769226074, "sft_loss": 1.1589300632476807, "step": 8685 }, { "epoch": 0.68, "grad_norm": 9.718241691589355, "learning_rate": 2.408144620534561e-06, "logits/chosen": -1.3903659582138062, "logits/rejected": -1.2220784425735474, "logps/chosen": -0.8666499853134155, "logps/rejected": -6.302515506744385, "loss": 0.8757, "odds_ratio_loss": 0.09074664860963821, "rewards/accuracies": 1.0, "rewards/chosen": -0.08666499704122543, "rewards/margins": 0.5435865521430969, "rewards/rejected": -0.6302515268325806, "sft_loss": 0.8666499853134155, "step": 8690 }, { "epoch": 0.68, "grad_norm": 21.57361602783203, "learning_rate": 2.4028809049207922e-06, "logits/chosen": -1.3529293537139893, "logits/rejected": -0.9282048940658569, "logps/chosen": -1.0757924318313599, "logps/rejected": -5.857820987701416, "loss": 1.0973, "odds_ratio_loss": 0.21466533839702606, "rewards/accuracies": 1.0, "rewards/chosen": -0.10757924616336823, "rewards/margins": 0.4782029092311859, "rewards/rejected": -0.5857821106910706, "sft_loss": 1.0757924318313599, "step": 8695 }, { "epoch": 0.68, "grad_norm": 24.09572410583496, "learning_rate": 2.3976211281678723e-06, "logits/chosen": -1.4947898387908936, "logits/rejected": -0.8406974673271179, "logps/chosen": -0.9563905596733093, "logps/rejected": -6.372321605682373, "loss": 0.9644, "odds_ratio_loss": 0.07989028841257095, "rewards/accuracies": 1.0, "rewards/chosen": -0.09563905000686646, "rewards/margins": 0.5415931940078735, "rewards/rejected": -0.63723224401474, "sft_loss": 0.9563905596733093, "step": 8700 }, { "epoch": 0.68, "grad_norm": 25.895950317382812, "learning_rate": 2.392365298252925e-06, "logits/chosen": -1.3481794595718384, "logits/rejected": -1.4586479663848877, "logps/chosen": -0.8709031343460083, "logps/rejected": -6.724035739898682, "loss": 0.9243, "odds_ratio_loss": 0.5336446762084961, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08709030598402023, "rewards/margins": 0.5853133201599121, "rewards/rejected": -0.6724035739898682, "sft_loss": 0.8709031343460083, "step": 8705 }, { "epoch": 0.68, "grad_norm": 8.630340576171875, "learning_rate": 2.3871134231470806e-06, "logits/chosen": -1.1768683195114136, "logits/rejected": -1.250208854675293, "logps/chosen": -0.9202073216438293, "logps/rejected": -4.344022750854492, "loss": 0.9729, "odds_ratio_loss": 0.526607871055603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09202073514461517, "rewards/margins": 0.3423815965652466, "rewards/rejected": -0.43440231680870056, "sft_loss": 0.9202073216438293, "step": 8710 }, { "epoch": 0.68, "grad_norm": 10.498140335083008, "learning_rate": 2.3818655108154747e-06, "logits/chosen": -1.1865965127944946, "logits/rejected": -0.629920244216919, "logps/chosen": -1.229215383529663, "logps/rejected": -3.8668923377990723, "loss": 1.2554, "odds_ratio_loss": 0.2619324326515198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12292154133319855, "rewards/margins": 0.26376765966415405, "rewards/rejected": -0.3866892457008362, "sft_loss": 1.229215383529663, "step": 8715 }, { "epoch": 0.68, "grad_norm": 4.093234062194824, "learning_rate": 2.3766215692172335e-06, "logits/chosen": -1.387241244316101, "logits/rejected": -0.8403748273849487, "logps/chosen": -1.0737946033477783, "logps/rejected": -8.066632270812988, "loss": 1.0756, "odds_ratio_loss": 0.01804409921169281, "rewards/accuracies": 1.0, "rewards/chosen": -0.10737945884466171, "rewards/margins": 0.6992837190628052, "rewards/rejected": -0.8066631555557251, "sft_loss": 1.0737946033477783, "step": 8720 }, { "epoch": 0.68, "grad_norm": 7.077796459197998, "learning_rate": 2.3713816063054594e-06, "logits/chosen": -1.3643040657043457, "logits/rejected": -1.0773613452911377, "logps/chosen": -0.9288724064826965, "logps/rejected": -14.269126892089844, "loss": 0.9366, "odds_ratio_loss": 0.07765939831733704, "rewards/accuracies": 1.0, "rewards/chosen": -0.09288725256919861, "rewards/margins": 1.334025263786316, "rewards/rejected": -1.426912546157837, "sft_loss": 0.9288724064826965, "step": 8725 }, { "epoch": 0.68, "grad_norm": 139.87179565429688, "learning_rate": 2.3661456300272218e-06, "logits/chosen": -1.4828407764434814, "logits/rejected": -1.1542161703109741, "logps/chosen": -0.9767447710037231, "logps/rejected": -13.312211990356445, "loss": 0.983, "odds_ratio_loss": 0.06288562715053558, "rewards/accuracies": 1.0, "rewards/chosen": -0.09767447412014008, "rewards/margins": 1.2335467338562012, "rewards/rejected": -1.3312212228775024, "sft_loss": 0.9767447710037231, "step": 8730 }, { "epoch": 0.68, "grad_norm": 5.913445949554443, "learning_rate": 2.3609136483235417e-06, "logits/chosen": -1.3908110857009888, "logits/rejected": -0.7219542264938354, "logps/chosen": -1.0922479629516602, "logps/rejected": -7.108962059020996, "loss": 1.0973, "odds_ratio_loss": 0.050926219671964645, "rewards/accuracies": 1.0, "rewards/chosen": -0.10922479629516602, "rewards/margins": 0.6016713976860046, "rewards/rejected": -0.7108962535858154, "sft_loss": 1.0922479629516602, "step": 8735 }, { "epoch": 0.68, "grad_norm": 7.034379959106445, "learning_rate": 2.3556856691293874e-06, "logits/chosen": -1.3366495370864868, "logits/rejected": -0.9458214640617371, "logps/chosen": -1.1878178119659424, "logps/rejected": -5.579440116882324, "loss": 1.2406, "odds_ratio_loss": 0.5279297232627869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11878176778554916, "rewards/margins": 0.4391621947288513, "rewards/rejected": -0.5579439401626587, "sft_loss": 1.1878178119659424, "step": 8740 }, { "epoch": 0.68, "grad_norm": 164.0315704345703, "learning_rate": 2.3504617003736505e-06, "logits/chosen": -1.2090203762054443, "logits/rejected": -1.0629816055297852, "logps/chosen": -1.0493745803833008, "logps/rejected": -7.64117431640625, "loss": 1.0686, "odds_ratio_loss": 0.19206681847572327, "rewards/accuracies": 1.0, "rewards/chosen": -0.10493745654821396, "rewards/margins": 0.6591799259185791, "rewards/rejected": -0.764117419719696, "sft_loss": 1.0493745803833008, "step": 8745 }, { "epoch": 0.68, "grad_norm": 777.0653686523438, "learning_rate": 2.345241749979142e-06, "logits/chosen": -1.2011092901229858, "logits/rejected": -1.496852159500122, "logps/chosen": -1.4050729274749756, "logps/rejected": -9.406866073608398, "loss": 1.4124, "odds_ratio_loss": 0.07344251871109009, "rewards/accuracies": 1.0, "rewards/chosen": -0.140507310628891, "rewards/margins": 0.8001793026924133, "rewards/rejected": -0.9406865835189819, "sft_loss": 1.4050729274749756, "step": 8750 }, { "epoch": 0.68, "grad_norm": 5.332313537597656, "learning_rate": 2.3400258258625824e-06, "logits/chosen": -1.2466320991516113, "logits/rejected": -1.1018311977386475, "logps/chosen": -0.7403644323348999, "logps/rejected": -3.1287522315979004, "loss": 0.7526, "odds_ratio_loss": 0.12221725285053253, "rewards/accuracies": 1.0, "rewards/chosen": -0.07403644919395447, "rewards/margins": 0.23883876204490662, "rewards/rejected": -0.3128752112388611, "sft_loss": 0.7403644323348999, "step": 8755 }, { "epoch": 0.68, "grad_norm": 171.93978881835938, "learning_rate": 2.3348139359345818e-06, "logits/chosen": -1.4692953824996948, "logits/rejected": -1.001387357711792, "logps/chosen": -1.5447783470153809, "logps/rejected": -5.410799026489258, "loss": 1.552, "odds_ratio_loss": 0.07230857759714127, "rewards/accuracies": 1.0, "rewards/chosen": -0.15447784960269928, "rewards/margins": 0.3866020143032074, "rewards/rejected": -0.5410798788070679, "sft_loss": 1.5447783470153809, "step": 8760 }, { "epoch": 0.68, "grad_norm": 20.867794036865234, "learning_rate": 2.3296060880996324e-06, "logits/chosen": -1.1623890399932861, "logits/rejected": -0.919702410697937, "logps/chosen": -1.1486616134643555, "logps/rejected": -2.9376235008239746, "loss": 1.1773, "odds_ratio_loss": 0.28643161058425903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11486617475748062, "rewards/margins": 0.17889617383480072, "rewards/rejected": -0.29376235604286194, "sft_loss": 1.1486616134643555, "step": 8765 }, { "epoch": 0.68, "grad_norm": 11.270215034484863, "learning_rate": 2.324402290256097e-06, "logits/chosen": -1.356595754623413, "logits/rejected": -1.6129558086395264, "logps/chosen": -0.7033634185791016, "logps/rejected": -10.820283889770508, "loss": 0.7142, "odds_ratio_loss": 0.10823347419500351, "rewards/accuracies": 1.0, "rewards/chosen": -0.07033634185791016, "rewards/margins": 1.0116920471191406, "rewards/rejected": -1.0820282697677612, "sft_loss": 0.7033634185791016, "step": 8770 }, { "epoch": 0.68, "grad_norm": 5.982351303100586, "learning_rate": 2.319202550296195e-06, "logits/chosen": -1.2674534320831299, "logits/rejected": -1.3874826431274414, "logps/chosen": -1.2085932493209839, "logps/rejected": -13.73901081085205, "loss": 1.2104, "odds_ratio_loss": 0.018456827849149704, "rewards/accuracies": 1.0, "rewards/chosen": -0.12085933983325958, "rewards/margins": 1.2530416250228882, "rewards/rejected": -1.373901128768921, "sft_loss": 1.2085932493209839, "step": 8775 }, { "epoch": 0.68, "grad_norm": 5.8140764236450195, "learning_rate": 2.3140068761059936e-06, "logits/chosen": -1.3922148942947388, "logits/rejected": -1.0940783023834229, "logps/chosen": -1.1989779472351074, "logps/rejected": -6.598939418792725, "loss": 1.2196, "odds_ratio_loss": 0.20643094182014465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11989779770374298, "rewards/margins": 0.5399961471557617, "rewards/rejected": -0.6598939299583435, "sft_loss": 1.1989779472351074, "step": 8780 }, { "epoch": 0.68, "grad_norm": 20.456661224365234, "learning_rate": 2.3088152755653893e-06, "logits/chosen": -1.314902901649475, "logits/rejected": -1.0060148239135742, "logps/chosen": -0.9692791104316711, "logps/rejected": -5.441655158996582, "loss": 1.0225, "odds_ratio_loss": 0.5325320959091187, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09692790359258652, "rewards/margins": 0.44723764061927795, "rewards/rejected": -0.5441655516624451, "sft_loss": 0.9692791104316711, "step": 8785 }, { "epoch": 0.68, "grad_norm": 5.951380729675293, "learning_rate": 2.3036277565481076e-06, "logits/chosen": -1.3810487985610962, "logits/rejected": -0.8749133348464966, "logps/chosen": -0.6693980097770691, "logps/rejected": -2.945880174636841, "loss": 0.6842, "odds_ratio_loss": 0.14806067943572998, "rewards/accuracies": 1.0, "rewards/chosen": -0.0669398084282875, "rewards/margins": 0.22764822840690613, "rewards/rejected": -0.29458802938461304, "sft_loss": 0.6693980097770691, "step": 8790 }, { "epoch": 0.68, "grad_norm": 74.74230194091797, "learning_rate": 2.2984443269216777e-06, "logits/chosen": -1.3578306436538696, "logits/rejected": -1.1013226509094238, "logps/chosen": -1.1087204217910767, "logps/rejected": -6.908092498779297, "loss": 1.1202, "odds_ratio_loss": 0.11455819755792618, "rewards/accuracies": 1.0, "rewards/chosen": -0.11087203025817871, "rewards/margins": 0.5799371600151062, "rewards/rejected": -0.6908092498779297, "sft_loss": 1.1087204217910767, "step": 8795 }, { "epoch": 0.68, "grad_norm": 126.49470520019531, "learning_rate": 2.293264994547427e-06, "logits/chosen": -1.4132869243621826, "logits/rejected": -0.7511752843856812, "logps/chosen": -0.8112546801567078, "logps/rejected": -4.038820266723633, "loss": 0.8352, "odds_ratio_loss": 0.23952248692512512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08112547546625137, "rewards/margins": 0.3227565884590149, "rewards/rejected": -0.4038820266723633, "sft_loss": 0.8112546801567078, "step": 8800 }, { "epoch": 0.68, "grad_norm": 4.296744346618652, "learning_rate": 2.288089767280474e-06, "logits/chosen": -1.481168270111084, "logits/rejected": -1.0833094120025635, "logps/chosen": -0.8455594182014465, "logps/rejected": -7.945650577545166, "loss": 0.8557, "odds_ratio_loss": 0.1012900322675705, "rewards/accuracies": 1.0, "rewards/chosen": -0.08455593883991241, "rewards/margins": 0.7100090384483337, "rewards/rejected": -0.794564962387085, "sft_loss": 0.8455594182014465, "step": 8805 }, { "epoch": 0.69, "grad_norm": 12.861416816711426, "learning_rate": 2.282918652969707e-06, "logits/chosen": -1.3677926063537598, "logits/rejected": -1.070143222808838, "logps/chosen": -0.8709946870803833, "logps/rejected": -3.9313464164733887, "loss": 0.8799, "odds_ratio_loss": 0.08856067806482315, "rewards/accuracies": 1.0, "rewards/chosen": -0.08709947764873505, "rewards/margins": 0.3060351610183716, "rewards/rejected": -0.3931346535682678, "sft_loss": 0.8709946870803833, "step": 8810 }, { "epoch": 0.69, "grad_norm": 10.0030517578125, "learning_rate": 2.2777516594577753e-06, "logits/chosen": -1.2824790477752686, "logits/rejected": -1.475572109222412, "logps/chosen": -0.984046459197998, "logps/rejected": -8.88023853302002, "loss": 0.9905, "odds_ratio_loss": 0.06427445262670517, "rewards/accuracies": 1.0, "rewards/chosen": -0.09840463846921921, "rewards/margins": 0.7896192669868469, "rewards/rejected": -0.8880239725112915, "sft_loss": 0.984046459197998, "step": 8815 }, { "epoch": 0.69, "grad_norm": 4.735327243804932, "learning_rate": 2.2725887945810835e-06, "logits/chosen": -1.349716305732727, "logits/rejected": -1.353406548500061, "logps/chosen": -0.7635191679000854, "logps/rejected": -12.640535354614258, "loss": 0.813, "odds_ratio_loss": 0.49513259530067444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07635192573070526, "rewards/margins": 1.1877018213272095, "rewards/rejected": -1.2640537023544312, "sft_loss": 0.7635191679000854, "step": 8820 }, { "epoch": 0.69, "grad_norm": 12.46005916595459, "learning_rate": 2.2674300661697705e-06, "logits/chosen": -1.369740605354309, "logits/rejected": -1.3559238910675049, "logps/chosen": -0.9420153498649597, "logps/rejected": -14.380511283874512, "loss": 0.9612, "odds_ratio_loss": 0.19223158061504364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09420153498649597, "rewards/margins": 1.343849778175354, "rewards/rejected": -1.4380512237548828, "sft_loss": 0.9420153498649597, "step": 8825 }, { "epoch": 0.69, "grad_norm": 6.363523960113525, "learning_rate": 2.2622754820477033e-06, "logits/chosen": -1.3766988515853882, "logits/rejected": -0.830722451210022, "logps/chosen": -1.0896638631820679, "logps/rejected": -12.302000045776367, "loss": 1.1067, "odds_ratio_loss": 0.17026808857917786, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1089663952589035, "rewards/margins": 1.1212337017059326, "rewards/rejected": -1.2302000522613525, "sft_loss": 1.0896638631820679, "step": 8830 }, { "epoch": 0.69, "grad_norm": 10.626047134399414, "learning_rate": 2.257125050032462e-06, "logits/chosen": -1.3120410442352295, "logits/rejected": -1.2341954708099365, "logps/chosen": -0.9303919076919556, "logps/rejected": -3.4059882164001465, "loss": 0.9528, "odds_ratio_loss": 0.22447124123573303, "rewards/accuracies": 1.0, "rewards/chosen": -0.09303919970989227, "rewards/margins": 0.24755963683128357, "rewards/rejected": -0.34059882164001465, "sft_loss": 0.9303919076919556, "step": 8835 }, { "epoch": 0.69, "grad_norm": 100.3499984741211, "learning_rate": 2.2519787779353312e-06, "logits/chosen": -1.3813316822052002, "logits/rejected": -0.935640811920166, "logps/chosen": -0.9915526509284973, "logps/rejected": -6.646824836730957, "loss": 1.0069, "odds_ratio_loss": 0.15396466851234436, "rewards/accuracies": 1.0, "rewards/chosen": -0.09915526211261749, "rewards/margins": 0.5655272006988525, "rewards/rejected": -0.6646824479103088, "sft_loss": 0.9915526509284973, "step": 8840 }, { "epoch": 0.69, "grad_norm": 3.4930734634399414, "learning_rate": 2.246836673561286e-06, "logits/chosen": -1.2621219158172607, "logits/rejected": -0.7246573567390442, "logps/chosen": -1.005814552307129, "logps/rejected": -3.994333267211914, "loss": 1.0149, "odds_ratio_loss": 0.09052891284227371, "rewards/accuracies": 1.0, "rewards/chosen": -0.10058145225048065, "rewards/margins": 0.2988519072532654, "rewards/rejected": -0.3994333744049072, "sft_loss": 1.005814552307129, "step": 8845 }, { "epoch": 0.69, "grad_norm": 6.091088771820068, "learning_rate": 2.2416987447089795e-06, "logits/chosen": -1.11075758934021, "logits/rejected": -1.1599006652832031, "logps/chosen": -0.9157182574272156, "logps/rejected": -6.711939811706543, "loss": 0.9192, "odds_ratio_loss": 0.03470756858587265, "rewards/accuracies": 1.0, "rewards/chosen": -0.09157182276248932, "rewards/margins": 0.5796222686767578, "rewards/rejected": -0.6711940765380859, "sft_loss": 0.9157182574272156, "step": 8850 }, { "epoch": 0.69, "grad_norm": 4.585426330566406, "learning_rate": 2.236564999170735e-06, "logits/chosen": -1.3810508251190186, "logits/rejected": -0.529486358165741, "logps/chosen": -0.9336601495742798, "logps/rejected": -4.21042013168335, "loss": 0.9444, "odds_ratio_loss": 0.10773054510354996, "rewards/accuracies": 1.0, "rewards/chosen": -0.09336601197719574, "rewards/margins": 0.327675998210907, "rewards/rejected": -0.4210420250892639, "sft_loss": 0.9336601495742798, "step": 8855 }, { "epoch": 0.69, "grad_norm": 5.172513961791992, "learning_rate": 2.231435444732529e-06, "logits/chosen": -1.4077708721160889, "logits/rejected": -1.3687193393707275, "logps/chosen": -1.0155298709869385, "logps/rejected": -10.070394515991211, "loss": 1.0235, "odds_ratio_loss": 0.08000832796096802, "rewards/accuracies": 1.0, "rewards/chosen": -0.10155300050973892, "rewards/margins": 0.9054864048957825, "rewards/rejected": -1.0070393085479736, "sft_loss": 1.0155298709869385, "step": 8860 }, { "epoch": 0.69, "grad_norm": 5.20650577545166, "learning_rate": 2.2263100891739804e-06, "logits/chosen": -1.3737702369689941, "logits/rejected": -1.1111339330673218, "logps/chosen": -1.0363662242889404, "logps/rejected": -10.14550495147705, "loss": 1.0399, "odds_ratio_loss": 0.03571737930178642, "rewards/accuracies": 1.0, "rewards/chosen": -0.10363663733005524, "rewards/margins": 0.9109139442443848, "rewards/rejected": -1.0145504474639893, "sft_loss": 1.0363662242889404, "step": 8865 }, { "epoch": 0.69, "grad_norm": 7.640948295593262, "learning_rate": 2.2211889402683444e-06, "logits/chosen": -1.306793212890625, "logits/rejected": -0.9922159910202026, "logps/chosen": -0.8645086288452148, "logps/rejected": -10.478594779968262, "loss": 0.8654, "odds_ratio_loss": 0.009273124858736992, "rewards/accuracies": 1.0, "rewards/chosen": -0.08645085990428925, "rewards/margins": 0.9614086151123047, "rewards/rejected": -1.0478594303131104, "sft_loss": 0.8645086288452148, "step": 8870 }, { "epoch": 0.69, "grad_norm": 5.960412502288818, "learning_rate": 2.216072005782492e-06, "logits/chosen": -1.4007996320724487, "logits/rejected": -0.977988600730896, "logps/chosen": -1.1421080827713013, "logps/rejected": -8.920265197753906, "loss": 1.1509, "odds_ratio_loss": 0.08776558190584183, "rewards/accuracies": 1.0, "rewards/chosen": -0.11421080678701401, "rewards/margins": 0.7778158187866211, "rewards/rejected": -0.8920267224311829, "sft_loss": 1.1421080827713013, "step": 8875 }, { "epoch": 0.69, "grad_norm": 9.963485717773438, "learning_rate": 2.2109592934769042e-06, "logits/chosen": -1.4444516897201538, "logits/rejected": -1.264888048171997, "logps/chosen": -1.100772500038147, "logps/rejected": -5.076528072357178, "loss": 1.1213, "odds_ratio_loss": 0.2054096907377243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11007723957300186, "rewards/margins": 0.3975755572319031, "rewards/rejected": -0.5076528787612915, "sft_loss": 1.100772500038147, "step": 8880 }, { "epoch": 0.69, "grad_norm": 22.328811645507812, "learning_rate": 2.205850811105658e-06, "logits/chosen": -1.1906664371490479, "logits/rejected": -1.420731782913208, "logps/chosen": -0.8635552525520325, "logps/rejected": -5.215235710144043, "loss": 0.8849, "odds_ratio_loss": 0.21342246234416962, "rewards/accuracies": 1.0, "rewards/chosen": -0.0863555297255516, "rewards/margins": 0.4351680874824524, "rewards/rejected": -0.5215235352516174, "sft_loss": 0.8635552525520325, "step": 8885 }, { "epoch": 0.69, "grad_norm": 13.49941349029541, "learning_rate": 2.2007465664164163e-06, "logits/chosen": -1.1897201538085938, "logits/rejected": -1.3385847806930542, "logps/chosen": -1.1155879497528076, "logps/rejected": -10.237648010253906, "loss": 1.1251, "odds_ratio_loss": 0.0953986719250679, "rewards/accuracies": 1.0, "rewards/chosen": -0.11155879497528076, "rewards/margins": 0.9122061729431152, "rewards/rejected": -1.0237648487091064, "sft_loss": 1.1155879497528076, "step": 8890 }, { "epoch": 0.69, "grad_norm": 5.225167274475098, "learning_rate": 2.1956465671504117e-06, "logits/chosen": -1.446629285812378, "logits/rejected": -1.0897634029388428, "logps/chosen": -0.8796972036361694, "logps/rejected": -6.697667598724365, "loss": 0.8802, "odds_ratio_loss": 0.005169983953237534, "rewards/accuracies": 1.0, "rewards/chosen": -0.08796972036361694, "rewards/margins": 0.5817970037460327, "rewards/rejected": -0.6697667837142944, "sft_loss": 0.8796972036361694, "step": 8895 }, { "epoch": 0.69, "grad_norm": 39.218021392822266, "learning_rate": 2.190550821042444e-06, "logits/chosen": -1.362776279449463, "logits/rejected": -1.3119524717330933, "logps/chosen": -1.0069937705993652, "logps/rejected": -8.32177448272705, "loss": 1.0326, "odds_ratio_loss": 0.25601357221603394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10069938004016876, "rewards/margins": 0.7314780950546265, "rewards/rejected": -0.8321775197982788, "sft_loss": 1.0069937705993652, "step": 8900 }, { "epoch": 0.69, "grad_norm": 4.985468864440918, "learning_rate": 2.185459335820858e-06, "logits/chosen": -1.263514757156372, "logits/rejected": -0.6906660795211792, "logps/chosen": -0.7955794334411621, "logps/rejected": -3.6059112548828125, "loss": 0.8298, "odds_ratio_loss": 0.34243783354759216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07955794036388397, "rewards/margins": 0.2810332179069519, "rewards/rejected": -0.3605911433696747, "sft_loss": 0.7955794334411621, "step": 8905 }, { "epoch": 0.69, "grad_norm": 6.7755513191223145, "learning_rate": 2.1803721192075376e-06, "logits/chosen": -1.2453514337539673, "logits/rejected": -0.7368025779724121, "logps/chosen": -0.8245415687561035, "logps/rejected": -1.625554084777832, "loss": 0.8633, "odds_ratio_loss": 0.38740354776382446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08245415985584259, "rewards/margins": 0.08010125160217285, "rewards/rejected": -0.16255542635917664, "sft_loss": 0.8245415687561035, "step": 8910 }, { "epoch": 0.69, "grad_norm": 8.046930313110352, "learning_rate": 2.1752891789178903e-06, "logits/chosen": -1.2851436138153076, "logits/rejected": -0.8310701251029968, "logps/chosen": -0.9399498105049133, "logps/rejected": -4.056199073791504, "loss": 0.9708, "odds_ratio_loss": 0.30810680985450745, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09399497509002686, "rewards/margins": 0.3116249144077301, "rewards/rejected": -0.40561985969543457, "sft_loss": 0.9399498105049133, "step": 8915 }, { "epoch": 0.69, "grad_norm": 45.22992706298828, "learning_rate": 2.170210522660844e-06, "logits/chosen": -1.1178120374679565, "logits/rejected": -1.3127976655960083, "logps/chosen": -0.791592538356781, "logps/rejected": -4.469288349151611, "loss": 0.8065, "odds_ratio_loss": 0.14901237189769745, "rewards/accuracies": 1.0, "rewards/chosen": -0.07915925979614258, "rewards/margins": 0.36776959896087646, "rewards/rejected": -0.44692888855934143, "sft_loss": 0.791592538356781, "step": 8920 }, { "epoch": 0.69, "grad_norm": 8.589193344116211, "learning_rate": 2.1651361581388244e-06, "logits/chosen": -1.2633674144744873, "logits/rejected": -0.8884477615356445, "logps/chosen": -1.0612653493881226, "logps/rejected": -6.128218173980713, "loss": 1.0918, "odds_ratio_loss": 0.3054296672344208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10612654685974121, "rewards/margins": 0.5066952705383301, "rewards/rejected": -0.6128217577934265, "sft_loss": 1.0612653493881226, "step": 8925 }, { "epoch": 0.69, "grad_norm": 22.513050079345703, "learning_rate": 2.1600660930477473e-06, "logits/chosen": -1.3976125717163086, "logits/rejected": -1.2376105785369873, "logps/chosen": -0.8673698306083679, "logps/rejected": -4.726605415344238, "loss": 0.8781, "odds_ratio_loss": 0.10777624696493149, "rewards/accuracies": 1.0, "rewards/chosen": -0.08673697710037231, "rewards/margins": 0.3859235644340515, "rewards/rejected": -0.4726606011390686, "sft_loss": 0.8673698306083679, "step": 8930 }, { "epoch": 0.7, "grad_norm": 34.55830383300781, "learning_rate": 2.1550003350770145e-06, "logits/chosen": -1.4735945463180542, "logits/rejected": -1.2802393436431885, "logps/chosen": -1.6333625316619873, "logps/rejected": -5.451124668121338, "loss": 1.6782, "odds_ratio_loss": 0.44808006286621094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16333624720573425, "rewards/margins": 0.38177618384361267, "rewards/rejected": -0.5451124906539917, "sft_loss": 1.6333625316619873, "step": 8935 }, { "epoch": 0.7, "grad_norm": 7.063345909118652, "learning_rate": 2.1499388919094878e-06, "logits/chosen": -1.357527494430542, "logits/rejected": -0.8941570520401001, "logps/chosen": -0.860805332660675, "logps/rejected": -3.371488094329834, "loss": 0.8828, "odds_ratio_loss": 0.219833642244339, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08608053624629974, "rewards/margins": 0.2510682940483093, "rewards/rejected": -0.3371488153934479, "sft_loss": 0.860805332660675, "step": 8940 }, { "epoch": 0.7, "grad_norm": 4.833588123321533, "learning_rate": 2.14488177122149e-06, "logits/chosen": -1.3718502521514893, "logits/rejected": -1.3062679767608643, "logps/chosen": -0.9539562463760376, "logps/rejected": -13.6318998336792, "loss": 0.9722, "odds_ratio_loss": 0.1820879876613617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09539561718702316, "rewards/margins": 1.2677944898605347, "rewards/rejected": -1.3631901741027832, "sft_loss": 0.9539562463760376, "step": 8945 }, { "epoch": 0.7, "grad_norm": 7.767849922180176, "learning_rate": 2.139828980682786e-06, "logits/chosen": -1.3066259622573853, "logits/rejected": -1.2431743144989014, "logps/chosen": -0.8903936147689819, "logps/rejected": -7.409182548522949, "loss": 0.893, "odds_ratio_loss": 0.025705674663186073, "rewards/accuracies": 1.0, "rewards/chosen": -0.08903936296701431, "rewards/margins": 0.6518789529800415, "rewards/rejected": -0.7409183382987976, "sft_loss": 0.8903936147689819, "step": 8950 }, { "epoch": 0.7, "grad_norm": 4.688467979431152, "learning_rate": 2.1347805279565743e-06, "logits/chosen": -1.2730211019515991, "logits/rejected": -0.8708003163337708, "logps/chosen": -1.1545875072479248, "logps/rejected": -8.899227142333984, "loss": 1.1654, "odds_ratio_loss": 0.10816816240549088, "rewards/accuracies": 1.0, "rewards/chosen": -0.11545874923467636, "rewards/margins": 0.7744640111923218, "rewards/rejected": -0.8899227976799011, "sft_loss": 1.1545875072479248, "step": 8955 }, { "epoch": 0.7, "grad_norm": 12.68004035949707, "learning_rate": 2.1297364206994727e-06, "logits/chosen": -1.3241649866104126, "logits/rejected": -0.8673623204231262, "logps/chosen": -0.922534167766571, "logps/rejected": -2.2232301235198975, "loss": 0.9554, "odds_ratio_loss": 0.3290178179740906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0922534242272377, "rewards/margins": 0.13006961345672607, "rewards/rejected": -0.22232303023338318, "sft_loss": 0.922534167766571, "step": 8960 }, { "epoch": 0.7, "grad_norm": 5.558168411254883, "learning_rate": 2.124696666561513e-06, "logits/chosen": -1.3462955951690674, "logits/rejected": -1.0334182977676392, "logps/chosen": -0.9550608396530151, "logps/rejected": -3.349431276321411, "loss": 1.0117, "odds_ratio_loss": 0.5658974647521973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09550608694553375, "rewards/margins": 0.2394370287656784, "rewards/rejected": -0.33494311571121216, "sft_loss": 0.9550608396530151, "step": 8965 }, { "epoch": 0.7, "grad_norm": 5.458000659942627, "learning_rate": 2.119661273186122e-06, "logits/chosen": -1.2887673377990723, "logits/rejected": -0.9980741739273071, "logps/chosen": -0.8225724101066589, "logps/rejected": -2.6496741771698, "loss": 0.8715, "odds_ratio_loss": 0.4890063405036926, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08225724846124649, "rewards/margins": 0.182710200548172, "rewards/rejected": -0.2649674713611603, "sft_loss": 0.8225724101066589, "step": 8970 }, { "epoch": 0.7, "grad_norm": 15.437626838684082, "learning_rate": 2.114630248210112e-06, "logits/chosen": -1.324136734008789, "logits/rejected": -1.00652277469635, "logps/chosen": -1.0153963565826416, "logps/rejected": -6.181136608123779, "loss": 1.0371, "odds_ratio_loss": 0.21681568026542664, "rewards/accuracies": 1.0, "rewards/chosen": -0.10153963416814804, "rewards/margins": 0.5165740251541138, "rewards/rejected": -0.6181136965751648, "sft_loss": 1.0153963565826416, "step": 8975 }, { "epoch": 0.7, "grad_norm": 14.838114738464355, "learning_rate": 2.10960359926367e-06, "logits/chosen": -1.1353790760040283, "logits/rejected": -0.9303848147392273, "logps/chosen": -0.9914374351501465, "logps/rejected": -6.257478713989258, "loss": 1.0012, "odds_ratio_loss": 0.09774111211299896, "rewards/accuracies": 1.0, "rewards/chosen": -0.09914375841617584, "rewards/margins": 0.5266041159629822, "rewards/rejected": -0.6257479190826416, "sft_loss": 0.9914374351501465, "step": 8980 }, { "epoch": 0.7, "grad_norm": 12.68740463256836, "learning_rate": 2.1045813339703504e-06, "logits/chosen": -1.332756757736206, "logits/rejected": -1.1559302806854248, "logps/chosen": -1.1947431564331055, "logps/rejected": -14.672497749328613, "loss": 1.1961, "odds_ratio_loss": 0.01405693031847477, "rewards/accuracies": 1.0, "rewards/chosen": -0.11947431415319443, "rewards/margins": 1.3477754592895508, "rewards/rejected": -1.4672497510910034, "sft_loss": 1.1947431564331055, "step": 8985 }, { "epoch": 0.7, "grad_norm": 12.600214004516602, "learning_rate": 2.0995634599470543e-06, "logits/chosen": -1.1652392148971558, "logits/rejected": -1.0661487579345703, "logps/chosen": -1.3858683109283447, "logps/rejected": -3.2528934478759766, "loss": 1.4568, "odds_ratio_loss": 0.7093421220779419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1385868340730667, "rewards/margins": 0.18670251965522766, "rewards/rejected": -0.3252893090248108, "sft_loss": 1.3858683109283447, "step": 8990 }, { "epoch": 0.7, "grad_norm": 13.642959594726562, "learning_rate": 2.0945499848040245e-06, "logits/chosen": -1.1991904973983765, "logits/rejected": -1.5512006282806396, "logps/chosen": -1.1417248249053955, "logps/rejected": -16.12936019897461, "loss": 1.1417, "odds_ratio_loss": 0.0002171795058529824, "rewards/accuracies": 1.0, "rewards/chosen": -0.11417248100042343, "rewards/margins": 1.4987636804580688, "rewards/rejected": -1.61293625831604, "sft_loss": 1.1417248249053955, "step": 8995 }, { "epoch": 0.7, "grad_norm": 273.96759033203125, "learning_rate": 2.0895409161448336e-06, "logits/chosen": -1.0852572917938232, "logits/rejected": -1.2790428400039673, "logps/chosen": -1.4878782033920288, "logps/rejected": -7.7668137550354, "loss": 1.5419, "odds_ratio_loss": 0.5402774214744568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14878782629966736, "rewards/margins": 0.6278935670852661, "rewards/rejected": -0.7766814231872559, "sft_loss": 1.4878782033920288, "step": 9000 }, { "epoch": 0.7, "grad_norm": 4.770481109619141, "learning_rate": 2.08453626156637e-06, "logits/chosen": -1.2997276782989502, "logits/rejected": -0.8713030815124512, "logps/chosen": -0.8640148043632507, "logps/rejected": -10.577802658081055, "loss": 0.8718, "odds_ratio_loss": 0.0777444839477539, "rewards/accuracies": 1.0, "rewards/chosen": -0.08640148490667343, "rewards/margins": 0.9713788032531738, "rewards/rejected": -1.057780385017395, "sft_loss": 0.8640148043632507, "step": 9005 }, { "epoch": 0.7, "grad_norm": 23.215740203857422, "learning_rate": 2.079536028658825e-06, "logits/chosen": -1.1639597415924072, "logits/rejected": -1.0849668979644775, "logps/chosen": -0.9988861083984375, "logps/rejected": -4.473105430603027, "loss": 1.0022, "odds_ratio_loss": 0.033389657735824585, "rewards/accuracies": 1.0, "rewards/chosen": -0.09988861531019211, "rewards/margins": 0.34742194414138794, "rewards/rejected": -0.44731053709983826, "sft_loss": 0.9988861083984375, "step": 9010 }, { "epoch": 0.7, "grad_norm": 18.671756744384766, "learning_rate": 2.074540225005691e-06, "logits/chosen": -1.3484288454055786, "logits/rejected": -0.8984023928642273, "logps/chosen": -1.5796782970428467, "logps/rejected": -6.037528991699219, "loss": 1.5958, "odds_ratio_loss": 0.16151759028434753, "rewards/accuracies": 1.0, "rewards/chosen": -0.15796785056591034, "rewards/margins": 0.4457850456237793, "rewards/rejected": -0.6037529706954956, "sft_loss": 1.5796782970428467, "step": 9015 }, { "epoch": 0.7, "grad_norm": 6.736073970794678, "learning_rate": 2.069548858183737e-06, "logits/chosen": -1.3836395740509033, "logits/rejected": -1.2181329727172852, "logps/chosen": -0.72617107629776, "logps/rejected": -9.326863288879395, "loss": 0.7917, "odds_ratio_loss": 0.654904842376709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07261711359024048, "rewards/margins": 0.8600692749023438, "rewards/rejected": -0.932686448097229, "sft_loss": 0.72617107629776, "step": 9020 }, { "epoch": 0.7, "grad_norm": 5.815211772918701, "learning_rate": 2.0645619357630037e-06, "logits/chosen": -1.3405768871307373, "logits/rejected": -1.0667366981506348, "logps/chosen": -0.8995200395584106, "logps/rejected": -10.469018936157227, "loss": 0.8996, "odds_ratio_loss": 0.0003613657027017325, "rewards/accuracies": 1.0, "rewards/chosen": -0.0899519994854927, "rewards/margins": 0.9569500088691711, "rewards/rejected": -1.0469019412994385, "sft_loss": 0.8995200395584106, "step": 9025 }, { "epoch": 0.7, "grad_norm": 51.4861946105957, "learning_rate": 2.059579465306791e-06, "logits/chosen": -1.082763910293579, "logits/rejected": -1.3052375316619873, "logps/chosen": -0.7171798944473267, "logps/rejected": -2.294360876083374, "loss": 0.7315, "odds_ratio_loss": 0.14310906827449799, "rewards/accuracies": 1.0, "rewards/chosen": -0.0717179924249649, "rewards/margins": 0.15771810710430145, "rewards/rejected": -0.22943606972694397, "sft_loss": 0.7171798944473267, "step": 9030 }, { "epoch": 0.7, "grad_norm": 23.833728790283203, "learning_rate": 2.0546014543716516e-06, "logits/chosen": -1.336284875869751, "logits/rejected": -1.3159081935882568, "logps/chosen": -0.9601768255233765, "logps/rejected": -4.6210761070251465, "loss": 0.9913, "odds_ratio_loss": 0.3113359808921814, "rewards/accuracies": 1.0, "rewards/chosen": -0.09601768851280212, "rewards/margins": 0.3660898804664612, "rewards/rejected": -0.4621075689792633, "sft_loss": 0.9601768255233765, "step": 9035 }, { "epoch": 0.7, "grad_norm": 14.797272682189941, "learning_rate": 2.0496279105073686e-06, "logits/chosen": -1.3687362670898438, "logits/rejected": -1.0789031982421875, "logps/chosen": -0.9673604965209961, "logps/rejected": -6.7939581871032715, "loss": 0.9952, "odds_ratio_loss": 0.27864471077919006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09673605114221573, "rewards/margins": 0.5826598405838013, "rewards/rejected": -0.6793957948684692, "sft_loss": 0.9673604965209961, "step": 9040 }, { "epoch": 0.7, "grad_norm": 7.334042549133301, "learning_rate": 2.0446588412569514e-06, "logits/chosen": -1.1946535110473633, "logits/rejected": -0.973946750164032, "logps/chosen": -0.8864052891731262, "logps/rejected": -4.6504411697387695, "loss": 0.8992, "odds_ratio_loss": 0.12813030183315277, "rewards/accuracies": 1.0, "rewards/chosen": -0.08864052593708038, "rewards/margins": 0.3764035701751709, "rewards/rejected": -0.4650441110134125, "sft_loss": 0.8864052891731262, "step": 9045 }, { "epoch": 0.7, "grad_norm": 6.007212162017822, "learning_rate": 2.0396942541566277e-06, "logits/chosen": -1.289546012878418, "logits/rejected": -1.0946879386901855, "logps/chosen": -0.9183686971664429, "logps/rejected": -5.516299724578857, "loss": 0.936, "odds_ratio_loss": 0.1759006530046463, "rewards/accuracies": 1.0, "rewards/chosen": -0.09183688461780548, "rewards/margins": 0.4597931504249573, "rewards/rejected": -0.5516299605369568, "sft_loss": 0.9183686971664429, "step": 9050 }, { "epoch": 0.7, "grad_norm": 49.407833099365234, "learning_rate": 2.034734156735823e-06, "logits/chosen": -1.0109045505523682, "logits/rejected": -1.3671051263809204, "logps/chosen": -0.9036981463432312, "logps/rejected": -11.408597946166992, "loss": 0.9159, "odds_ratio_loss": 0.12224564701318741, "rewards/accuracies": 1.0, "rewards/chosen": -0.0903698205947876, "rewards/margins": 1.0504900217056274, "rewards/rejected": -1.140859842300415, "sft_loss": 0.9036981463432312, "step": 9055 }, { "epoch": 0.7, "grad_norm": 50.97902297973633, "learning_rate": 2.029778556517154e-06, "logits/chosen": -1.2944746017456055, "logits/rejected": -1.1573295593261719, "logps/chosen": -0.816990077495575, "logps/rejected": -16.013568878173828, "loss": 0.8343, "odds_ratio_loss": 0.17356497049331665, "rewards/accuracies": 1.0, "rewards/chosen": -0.08169901371002197, "rewards/margins": 1.519658088684082, "rewards/rejected": -1.601357102394104, "sft_loss": 0.816990077495575, "step": 9060 }, { "epoch": 0.71, "grad_norm": 102.18575286865234, "learning_rate": 2.0248274610164185e-06, "logits/chosen": -1.3450043201446533, "logits/rejected": -1.1237186193466187, "logps/chosen": -1.0184475183486938, "logps/rejected": -6.712457180023193, "loss": 1.0302, "odds_ratio_loss": 0.11714156717061996, "rewards/accuracies": 1.0, "rewards/chosen": -0.10184475034475327, "rewards/margins": 0.5694010853767395, "rewards/rejected": -0.6712457537651062, "sft_loss": 1.0184475183486938, "step": 9065 }, { "epoch": 0.71, "grad_norm": 38.02252960205078, "learning_rate": 2.019880877742581e-06, "logits/chosen": -1.4429337978363037, "logits/rejected": -1.2393356561660767, "logps/chosen": -5.945528507232666, "logps/rejected": -8.805086135864258, "loss": 6.0357, "odds_ratio_loss": 0.9014400243759155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5945528149604797, "rewards/margins": 0.2859557271003723, "rewards/rejected": -0.880508542060852, "sft_loss": 5.945528507232666, "step": 9070 }, { "epoch": 0.71, "grad_norm": 70.94361877441406, "learning_rate": 2.014938814197761e-06, "logits/chosen": -1.3768596649169922, "logits/rejected": -1.2406704425811768, "logps/chosen": -1.871372938156128, "logps/rejected": -3.2459359169006348, "loss": 1.9491, "odds_ratio_loss": 0.7770655751228333, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18713729083538055, "rewards/margins": 0.1374562680721283, "rewards/rejected": -0.32459357380867004, "sft_loss": 1.871372938156128, "step": 9075 }, { "epoch": 0.71, "grad_norm": 19.992162704467773, "learning_rate": 2.0100012778772294e-06, "logits/chosen": -1.3903619050979614, "logits/rejected": -0.9029962420463562, "logps/chosen": -2.460082769393921, "logps/rejected": -4.92894983291626, "loss": 2.5327, "odds_ratio_loss": 0.7256797552108765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24600830674171448, "rewards/margins": 0.24688668549060822, "rewards/rejected": -0.4928949475288391, "sft_loss": 2.460082769393921, "step": 9080 }, { "epoch": 0.71, "grad_norm": 27.510286331176758, "learning_rate": 2.0050682762693846e-06, "logits/chosen": -1.3608497381210327, "logits/rejected": -1.1397944688796997, "logps/chosen": -0.7833740711212158, "logps/rejected": -4.637581825256348, "loss": 0.797, "odds_ratio_loss": 0.13609935343265533, "rewards/accuracies": 1.0, "rewards/chosen": -0.0783374086022377, "rewards/margins": 0.3854207694530487, "rewards/rejected": -0.4637581706047058, "sft_loss": 0.7833740711212158, "step": 9085 }, { "epoch": 0.71, "grad_norm": 28.414520263671875, "learning_rate": 2.0001398168557508e-06, "logits/chosen": -1.2943460941314697, "logits/rejected": -0.8901378512382507, "logps/chosen": -0.6564239263534546, "logps/rejected": -3.8656997680664062, "loss": 0.677, "odds_ratio_loss": 0.20536451041698456, "rewards/accuracies": 1.0, "rewards/chosen": -0.06564239412546158, "rewards/margins": 0.32092761993408203, "rewards/rejected": -0.3865699768066406, "sft_loss": 0.6564239263534546, "step": 9090 }, { "epoch": 0.71, "grad_norm": 5.669079780578613, "learning_rate": 1.9952159071109594e-06, "logits/chosen": -1.4045063257217407, "logits/rejected": -0.9172590970993042, "logps/chosen": -1.0156147480010986, "logps/rejected": -3.5632667541503906, "loss": 1.0359, "odds_ratio_loss": 0.202461838722229, "rewards/accuracies": 1.0, "rewards/chosen": -0.10156148672103882, "rewards/margins": 0.25476521253585815, "rewards/rejected": -0.356326699256897, "sft_loss": 1.0156147480010986, "step": 9095 }, { "epoch": 0.71, "grad_norm": 10.984932899475098, "learning_rate": 1.990296554502749e-06, "logits/chosen": -1.3105428218841553, "logits/rejected": -0.997005820274353, "logps/chosen": -1.2994569540023804, "logps/rejected": -4.665297985076904, "loss": 1.3316, "odds_ratio_loss": 0.32166650891304016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12994569540023804, "rewards/margins": 0.3365841209888458, "rewards/rejected": -0.4665297865867615, "sft_loss": 1.2994569540023804, "step": 9100 }, { "epoch": 0.71, "grad_norm": 7.292508125305176, "learning_rate": 1.9853817664919413e-06, "logits/chosen": -1.3598629236221313, "logits/rejected": -1.0822525024414062, "logps/chosen": -0.8946825861930847, "logps/rejected": -3.746201753616333, "loss": 0.9321, "odds_ratio_loss": 0.37457841634750366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08946826308965683, "rewards/margins": 0.2851519286632538, "rewards/rejected": -0.3746201992034912, "sft_loss": 0.8946825861930847, "step": 9105 }, { "epoch": 0.71, "grad_norm": 9.447525024414062, "learning_rate": 1.9804715505324346e-06, "logits/chosen": -1.172864556312561, "logits/rejected": -0.8160622715950012, "logps/chosen": -1.010909914970398, "logps/rejected": -2.0625815391540527, "loss": 1.0415, "odds_ratio_loss": 0.3055305480957031, "rewards/accuracies": 1.0, "rewards/chosen": -0.10109099000692368, "rewards/margins": 0.10516715049743652, "rewards/rejected": -0.2062581479549408, "sft_loss": 1.010909914970398, "step": 9110 }, { "epoch": 0.71, "grad_norm": 6.003201007843018, "learning_rate": 1.9755659140711965e-06, "logits/chosen": -1.2514169216156006, "logits/rejected": -1.43091881275177, "logps/chosen": -0.5724098682403564, "logps/rejected": -5.860909461975098, "loss": 0.5743, "odds_ratio_loss": 0.01881580427289009, "rewards/accuracies": 1.0, "rewards/chosen": -0.057240985333919525, "rewards/margins": 0.5288499593734741, "rewards/rejected": -0.5860909819602966, "sft_loss": 0.5724098682403564, "step": 9115 }, { "epoch": 0.71, "grad_norm": 4.640727519989014, "learning_rate": 1.9706648645482464e-06, "logits/chosen": -1.2182533740997314, "logits/rejected": -1.101128339767456, "logps/chosen": -0.9864501953125, "logps/rejected": -10.166420936584473, "loss": 0.996, "odds_ratio_loss": 0.09508304297924042, "rewards/accuracies": 1.0, "rewards/chosen": -0.09864500910043716, "rewards/margins": 0.9179970622062683, "rewards/rejected": -1.0166422128677368, "sft_loss": 0.9864501953125, "step": 9120 }, { "epoch": 0.71, "grad_norm": 13.459922790527344, "learning_rate": 1.965768409396647e-06, "logits/chosen": -1.087660551071167, "logits/rejected": -1.2694013118743896, "logps/chosen": -0.7529363632202148, "logps/rejected": -5.429129600524902, "loss": 0.7717, "odds_ratio_loss": 0.18774135410785675, "rewards/accuracies": 1.0, "rewards/chosen": -0.0752936378121376, "rewards/margins": 0.4676193594932556, "rewards/rejected": -0.5429129600524902, "sft_loss": 0.7529363632202148, "step": 9125 }, { "epoch": 0.71, "grad_norm": 5.044445037841797, "learning_rate": 1.9608765560424976e-06, "logits/chosen": -1.3213609457015991, "logits/rejected": -1.0660345554351807, "logps/chosen": -0.9869930148124695, "logps/rejected": -9.335617065429688, "loss": 0.9935, "odds_ratio_loss": 0.06551004201173782, "rewards/accuracies": 1.0, "rewards/chosen": -0.09869930893182755, "rewards/margins": 0.8348624110221863, "rewards/rejected": -0.9335616827011108, "sft_loss": 0.9869930148124695, "step": 9130 }, { "epoch": 0.71, "grad_norm": 8.95915412902832, "learning_rate": 1.9559893119049127e-06, "logits/chosen": -1.3892710208892822, "logits/rejected": -1.130847692489624, "logps/chosen": -0.8897703289985657, "logps/rejected": -4.048529624938965, "loss": 0.9171, "odds_ratio_loss": 0.27287232875823975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08897703886032104, "rewards/margins": 0.31587594747543335, "rewards/rejected": -0.404852956533432, "sft_loss": 0.8897703289985657, "step": 9135 }, { "epoch": 0.71, "grad_norm": 6.433979511260986, "learning_rate": 1.9511066843960175e-06, "logits/chosen": -1.3489278554916382, "logits/rejected": -1.1770883798599243, "logps/chosen": -1.082330584526062, "logps/rejected": -4.711759567260742, "loss": 1.109, "odds_ratio_loss": 0.26716113090515137, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10823307186365128, "rewards/margins": 0.3629428744316101, "rewards/rejected": -0.47117599844932556, "sft_loss": 1.082330584526062, "step": 9140 }, { "epoch": 0.71, "grad_norm": 5.469911575317383, "learning_rate": 1.9462286809209395e-06, "logits/chosen": -1.4053490161895752, "logits/rejected": -0.8818165063858032, "logps/chosen": -0.7211654782295227, "logps/rejected": -5.684488296508789, "loss": 0.7272, "odds_ratio_loss": 0.06042628735303879, "rewards/accuracies": 1.0, "rewards/chosen": -0.07211655378341675, "rewards/margins": 0.49633222818374634, "rewards/rejected": -0.5684488415718079, "sft_loss": 0.7211654782295227, "step": 9145 }, { "epoch": 0.71, "grad_norm": 6.692239761352539, "learning_rate": 1.9413553088777894e-06, "logits/chosen": -1.279656171798706, "logits/rejected": -1.232208013534546, "logps/chosen": -1.0628973245620728, "logps/rejected": -9.918838500976562, "loss": 1.0781, "odds_ratio_loss": 0.1518601030111313, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10628972202539444, "rewards/margins": 0.8855941891670227, "rewards/rejected": -0.9918839335441589, "sft_loss": 1.0628973245620728, "step": 9150 }, { "epoch": 0.71, "grad_norm": 4.951191425323486, "learning_rate": 1.9364865756576534e-06, "logits/chosen": -1.3498833179473877, "logits/rejected": -0.9882432222366333, "logps/chosen": -1.00874662399292, "logps/rejected": -3.104807138442993, "loss": 1.0643, "odds_ratio_loss": 0.5553382039070129, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10087466239929199, "rewards/margins": 0.2096060812473297, "rewards/rejected": -0.3104807436466217, "sft_loss": 1.00874662399292, "step": 9155 }, { "epoch": 0.71, "grad_norm": 7.204696178436279, "learning_rate": 1.931622488644583e-06, "logits/chosen": -1.3502788543701172, "logits/rejected": -1.2425469160079956, "logps/chosen": -0.7986493110656738, "logps/rejected": -8.240682601928711, "loss": 0.7989, "odds_ratio_loss": 0.002235189313068986, "rewards/accuracies": 1.0, "rewards/chosen": -0.07986493408679962, "rewards/margins": 0.7442033886909485, "rewards/rejected": -0.8240682482719421, "sft_loss": 0.7986493110656738, "step": 9160 }, { "epoch": 0.71, "grad_norm": 63.30450439453125, "learning_rate": 1.9267630552155862e-06, "logits/chosen": -1.3233058452606201, "logits/rejected": -1.1473569869995117, "logps/chosen": -1.1643285751342773, "logps/rejected": -2.8467297554016113, "loss": 1.2127, "odds_ratio_loss": 0.4839521050453186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11643286049365997, "rewards/margins": 0.16824014484882355, "rewards/rejected": -0.2846730053424835, "sft_loss": 1.1643285751342773, "step": 9165 }, { "epoch": 0.71, "grad_norm": 13.165739059448242, "learning_rate": 1.92190828274061e-06, "logits/chosen": -1.3316316604614258, "logits/rejected": -0.3999803960323334, "logps/chosen": -1.0170505046844482, "logps/rejected": -3.586103916168213, "loss": 1.0439, "odds_ratio_loss": 0.2684716582298279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10170505195856094, "rewards/margins": 0.25690528750419617, "rewards/rejected": -0.3586103618144989, "sft_loss": 1.0170505046844482, "step": 9170 }, { "epoch": 0.71, "grad_norm": 10.03995132446289, "learning_rate": 1.917058178582532e-06, "logits/chosen": -1.3869171142578125, "logits/rejected": -0.897848904132843, "logps/chosen": -1.0685797929763794, "logps/rejected": -3.428459882736206, "loss": 1.0797, "odds_ratio_loss": 0.11104003340005875, "rewards/accuracies": 1.0, "rewards/chosen": -0.10685797780752182, "rewards/margins": 0.235988050699234, "rewards/rejected": -0.34284600615501404, "sft_loss": 1.0685797929763794, "step": 9175 }, { "epoch": 0.71, "grad_norm": 7.961609840393066, "learning_rate": 1.9122127500971525e-06, "logits/chosen": -1.301819086074829, "logits/rejected": -1.0404784679412842, "logps/chosen": -0.975425124168396, "logps/rejected": -4.277166843414307, "loss": 1.0074, "odds_ratio_loss": 0.31970852613449097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09754252433776855, "rewards/margins": 0.33017414808273315, "rewards/rejected": -0.4277166724205017, "sft_loss": 0.975425124168396, "step": 9180 }, { "epoch": 0.71, "grad_norm": 97.14888763427734, "learning_rate": 1.9073720046331777e-06, "logits/chosen": -1.392052412033081, "logits/rejected": -1.0715065002441406, "logps/chosen": -1.0489897727966309, "logps/rejected": -2.846344470977783, "loss": 1.0661, "odds_ratio_loss": 0.17116869986057281, "rewards/accuracies": 1.0, "rewards/chosen": -0.10489897429943085, "rewards/margins": 0.179735466837883, "rewards/rejected": -0.28463444113731384, "sft_loss": 1.0489897727966309, "step": 9185 }, { "epoch": 0.71, "grad_norm": 26.383729934692383, "learning_rate": 1.902535949532212e-06, "logits/chosen": -1.3624117374420166, "logits/rejected": -1.1035476922988892, "logps/chosen": -0.8702263832092285, "logps/rejected": -3.861776828765869, "loss": 0.8804, "odds_ratio_loss": 0.101753830909729, "rewards/accuracies": 1.0, "rewards/chosen": -0.08702263981103897, "rewards/margins": 0.29915502667427063, "rewards/rejected": -0.3861776888370514, "sft_loss": 0.8702263832092285, "step": 9190 }, { "epoch": 0.72, "grad_norm": 9.27212142944336, "learning_rate": 1.8977045921287496e-06, "logits/chosen": -1.3226594924926758, "logits/rejected": -1.1544954776763916, "logps/chosen": -0.8614295125007629, "logps/rejected": -10.17005443572998, "loss": 0.8677, "odds_ratio_loss": 0.062495239078998566, "rewards/accuracies": 1.0, "rewards/chosen": -0.08614294975996017, "rewards/margins": 0.930862545967102, "rewards/rejected": -1.0170055627822876, "sft_loss": 0.8614295125007629, "step": 9195 }, { "epoch": 0.72, "grad_norm": 4.527229309082031, "learning_rate": 1.8928779397501561e-06, "logits/chosen": -1.3080815076828003, "logits/rejected": -0.7346660494804382, "logps/chosen": -0.6970736980438232, "logps/rejected": -8.263237953186035, "loss": 0.7016, "odds_ratio_loss": 0.045706018805503845, "rewards/accuracies": 1.0, "rewards/chosen": -0.06970737129449844, "rewards/margins": 0.7566162943840027, "rewards/rejected": -0.8263236880302429, "sft_loss": 0.6970736980438232, "step": 9200 }, { "epoch": 0.72, "grad_norm": 5.699766635894775, "learning_rate": 1.888055999716661e-06, "logits/chosen": -1.2307069301605225, "logits/rejected": -1.1355819702148438, "logps/chosen": -0.8993587493896484, "logps/rejected": -11.566434860229492, "loss": 0.8996, "odds_ratio_loss": 0.0026048864237964153, "rewards/accuracies": 1.0, "rewards/chosen": -0.08993588387966156, "rewards/margins": 1.0667074918746948, "rewards/rejected": -1.1566433906555176, "sft_loss": 0.8993587493896484, "step": 9205 }, { "epoch": 0.72, "grad_norm": 152.52505493164062, "learning_rate": 1.883238779341352e-06, "logits/chosen": -1.3559463024139404, "logits/rejected": -1.3009026050567627, "logps/chosen": -0.942069411277771, "logps/rejected": -12.025169372558594, "loss": 0.9462, "odds_ratio_loss": 0.0411582887172699, "rewards/accuracies": 1.0, "rewards/chosen": -0.09420694410800934, "rewards/margins": 1.1083099842071533, "rewards/rejected": -1.2025169134140015, "sft_loss": 0.942069411277771, "step": 9210 }, { "epoch": 0.72, "grad_norm": 7.183080673217773, "learning_rate": 1.8784262859301534e-06, "logits/chosen": -1.365206003189087, "logits/rejected": -1.2650926113128662, "logps/chosen": -1.0181998014450073, "logps/rejected": -8.810084342956543, "loss": 1.0332, "odds_ratio_loss": 0.14968711137771606, "rewards/accuracies": 1.0, "rewards/chosen": -0.1018199697136879, "rewards/margins": 0.7791884541511536, "rewards/rejected": -0.8810084462165833, "sft_loss": 1.0181998014450073, "step": 9215 }, { "epoch": 0.72, "grad_norm": 5.462626934051514, "learning_rate": 1.8736185267818224e-06, "logits/chosen": -1.4750196933746338, "logits/rejected": -1.0687038898468018, "logps/chosen": -0.9219584465026855, "logps/rejected": -5.657142162322998, "loss": 0.9329, "odds_ratio_loss": 0.10907478630542755, "rewards/accuracies": 1.0, "rewards/chosen": -0.09219583868980408, "rewards/margins": 0.47351837158203125, "rewards/rejected": -0.5657142400741577, "sft_loss": 0.9219584465026855, "step": 9220 }, { "epoch": 0.72, "grad_norm": 9.681036949157715, "learning_rate": 1.8688155091879361e-06, "logits/chosen": -1.2853978872299194, "logits/rejected": -1.139392614364624, "logps/chosen": -0.8742135167121887, "logps/rejected": -7.485783576965332, "loss": 0.8791, "odds_ratio_loss": 0.04870065301656723, "rewards/accuracies": 1.0, "rewards/chosen": -0.08742136508226395, "rewards/margins": 0.6611570119857788, "rewards/rejected": -0.7485784292221069, "sft_loss": 0.8742135167121887, "step": 9225 }, { "epoch": 0.72, "grad_norm": 319.069580078125, "learning_rate": 1.8640172404328816e-06, "logits/chosen": -1.2906922101974487, "logits/rejected": -1.2166502475738525, "logps/chosen": -1.2384088039398193, "logps/rejected": -4.230016231536865, "loss": 1.2526, "odds_ratio_loss": 0.14192232489585876, "rewards/accuracies": 1.0, "rewards/chosen": -0.12384088337421417, "rewards/margins": 0.29916077852249146, "rewards/rejected": -0.4230016767978668, "sft_loss": 1.2384088039398193, "step": 9230 }, { "epoch": 0.72, "grad_norm": 4.312925338745117, "learning_rate": 1.8592237277938413e-06, "logits/chosen": -1.2975214719772339, "logits/rejected": -0.49361056089401245, "logps/chosen": -0.8198236227035522, "logps/rejected": -9.956624984741211, "loss": 0.8327, "odds_ratio_loss": 0.12906351685523987, "rewards/accuracies": 1.0, "rewards/chosen": -0.08198236674070358, "rewards/margins": 0.9136801958084106, "rewards/rejected": -0.9956625699996948, "sft_loss": 0.8198236227035522, "step": 9235 }, { "epoch": 0.72, "grad_norm": 16.511611938476562, "learning_rate": 1.8544349785407844e-06, "logits/chosen": -1.1626781225204468, "logits/rejected": -0.8125447034835815, "logps/chosen": -0.6874169111251831, "logps/rejected": -2.541872501373291, "loss": 0.7038, "odds_ratio_loss": 0.1637047976255417, "rewards/accuracies": 1.0, "rewards/chosen": -0.06874169409275055, "rewards/margins": 0.18544557690620422, "rewards/rejected": -0.2541872560977936, "sft_loss": 0.6874169111251831, "step": 9240 }, { "epoch": 0.72, "grad_norm": 6.639622211456299, "learning_rate": 1.8496509999364609e-06, "logits/chosen": -1.3168201446533203, "logits/rejected": -0.7880635857582092, "logps/chosen": -0.7180159687995911, "logps/rejected": -5.2311906814575195, "loss": 0.7216, "odds_ratio_loss": 0.0356980636715889, "rewards/accuracies": 1.0, "rewards/chosen": -0.07180158793926239, "rewards/margins": 0.4513174593448639, "rewards/rejected": -0.5231190323829651, "sft_loss": 0.7180159687995911, "step": 9245 }, { "epoch": 0.72, "grad_norm": 5.950743198394775, "learning_rate": 1.8448717992363802e-06, "logits/chosen": -1.481801152229309, "logits/rejected": -1.1455045938491821, "logps/chosen": -1.1666754484176636, "logps/rejected": -9.948326110839844, "loss": 1.1821, "odds_ratio_loss": 0.15418633818626404, "rewards/accuracies": 1.0, "rewards/chosen": -0.11666754633188248, "rewards/margins": 0.8781651258468628, "rewards/rejected": -0.9948326349258423, "sft_loss": 1.1666754484176636, "step": 9250 }, { "epoch": 0.72, "grad_norm": 5.432945251464844, "learning_rate": 1.8400973836888048e-06, "logits/chosen": -1.3602441549301147, "logits/rejected": -1.0623838901519775, "logps/chosen": -1.0072778463363647, "logps/rejected": -8.661463737487793, "loss": 1.0476, "odds_ratio_loss": 0.4036317765712738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10072778165340424, "rewards/margins": 0.7654186487197876, "rewards/rejected": -0.8661463856697083, "sft_loss": 1.0072778463363647, "step": 9255 }, { "epoch": 0.72, "grad_norm": 5.445363521575928, "learning_rate": 1.8353277605347458e-06, "logits/chosen": -1.3271220922470093, "logits/rejected": -0.8071807026863098, "logps/chosen": -0.8467211723327637, "logps/rejected": -3.2516613006591797, "loss": 0.8858, "odds_ratio_loss": 0.3908676505088806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08467211574316025, "rewards/margins": 0.240494042634964, "rewards/rejected": -0.32516616582870483, "sft_loss": 0.8467211723327637, "step": 9260 }, { "epoch": 0.72, "grad_norm": 49.54649353027344, "learning_rate": 1.8305629370079403e-06, "logits/chosen": -1.394164800643921, "logits/rejected": -1.3578661680221558, "logps/chosen": -1.0298556089401245, "logps/rejected": -4.586734294891357, "loss": 1.0392, "odds_ratio_loss": 0.09359271079301834, "rewards/accuracies": 1.0, "rewards/chosen": -0.10298557579517365, "rewards/margins": 0.3556878864765167, "rewards/rejected": -0.45867347717285156, "sft_loss": 1.0298556089401245, "step": 9265 }, { "epoch": 0.72, "grad_norm": 55.202701568603516, "learning_rate": 1.8258029203348482e-06, "logits/chosen": -1.2330883741378784, "logits/rejected": -0.87456876039505, "logps/chosen": -1.030839443206787, "logps/rejected": -6.096185684204102, "loss": 1.0388, "odds_ratio_loss": 0.0793967917561531, "rewards/accuracies": 1.0, "rewards/chosen": -0.10308394581079483, "rewards/margins": 0.5065346360206604, "rewards/rejected": -0.6096185445785522, "sft_loss": 1.030839443206787, "step": 9270 }, { "epoch": 0.72, "grad_norm": 25.153141021728516, "learning_rate": 1.821047717734637e-06, "logits/chosen": -1.3587119579315186, "logits/rejected": -1.0884228944778442, "logps/chosen": -1.9555097818374634, "logps/rejected": -8.748655319213867, "loss": 2.0443, "odds_ratio_loss": 0.8876272439956665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.19555097818374634, "rewards/margins": 0.6793144345283508, "rewards/rejected": -0.8748654127120972, "sft_loss": 1.9555097818374634, "step": 9275 }, { "epoch": 0.72, "grad_norm": 6.199098110198975, "learning_rate": 1.8162973364191794e-06, "logits/chosen": -1.3529561758041382, "logits/rejected": -0.5642414689064026, "logps/chosen": -1.07791006565094, "logps/rejected": -3.2078804969787598, "loss": 1.1049, "odds_ratio_loss": 0.27012649178504944, "rewards/accuracies": 1.0, "rewards/chosen": -0.1077909916639328, "rewards/margins": 0.21299704909324646, "rewards/rejected": -0.32078805565834045, "sft_loss": 1.07791006565094, "step": 9280 }, { "epoch": 0.72, "grad_norm": 5.515949249267578, "learning_rate": 1.8115517835930303e-06, "logits/chosen": -1.309288740158081, "logits/rejected": -1.2958306074142456, "logps/chosen": -1.3788987398147583, "logps/rejected": -6.528385162353516, "loss": 1.3981, "odds_ratio_loss": 0.1917024850845337, "rewards/accuracies": 1.0, "rewards/chosen": -0.13788987696170807, "rewards/margins": 0.5149486660957336, "rewards/rejected": -0.6528385877609253, "sft_loss": 1.3788987398147583, "step": 9285 }, { "epoch": 0.72, "grad_norm": 22.566404342651367, "learning_rate": 1.8068110664534217e-06, "logits/chosen": -1.1723864078521729, "logits/rejected": -1.353244423866272, "logps/chosen": -1.0381544828414917, "logps/rejected": -10.768172264099121, "loss": 1.0382, "odds_ratio_loss": 0.0005095123779028654, "rewards/accuracies": 1.0, "rewards/chosen": -0.10381545126438141, "rewards/margins": 0.9730018377304077, "rewards/rejected": -1.076817274093628, "sft_loss": 1.0381544828414917, "step": 9290 }, { "epoch": 0.72, "grad_norm": 5.992241382598877, "learning_rate": 1.802075192190254e-06, "logits/chosen": -1.377455472946167, "logits/rejected": -1.128418207168579, "logps/chosen": -1.0655977725982666, "logps/rejected": -7.172788143157959, "loss": 1.0682, "odds_ratio_loss": 0.02605503238737583, "rewards/accuracies": 1.0, "rewards/chosen": -0.10655979067087173, "rewards/margins": 0.6107190251350403, "rewards/rejected": -0.7172788381576538, "sft_loss": 1.0655977725982666, "step": 9295 }, { "epoch": 0.72, "grad_norm": 7.843822956085205, "learning_rate": 1.797344167986082e-06, "logits/chosen": -1.3239879608154297, "logits/rejected": -0.9022638201713562, "logps/chosen": -1.0198582410812378, "logps/rejected": -10.66782283782959, "loss": 1.0446, "odds_ratio_loss": 0.24729104340076447, "rewards/accuracies": 1.0, "rewards/chosen": -0.10198582708835602, "rewards/margins": 0.9647965431213379, "rewards/rejected": -1.0667822360992432, "sft_loss": 1.0198582410812378, "step": 9300 }, { "epoch": 0.72, "grad_norm": 20.95368766784668, "learning_rate": 1.7926180010161027e-06, "logits/chosen": -1.2632579803466797, "logits/rejected": -1.2453078031539917, "logps/chosen": -0.8721585273742676, "logps/rejected": -2.734140396118164, "loss": 0.8955, "odds_ratio_loss": 0.23379027843475342, "rewards/accuracies": 1.0, "rewards/chosen": -0.087215855717659, "rewards/margins": 0.1861981898546219, "rewards/rejected": -0.2734140455722809, "sft_loss": 0.8721585273742676, "step": 9305 }, { "epoch": 0.72, "grad_norm": 10.482601165771484, "learning_rate": 1.7878966984481515e-06, "logits/chosen": -1.3100192546844482, "logits/rejected": -1.3720757961273193, "logps/chosen": -1.2211915254592896, "logps/rejected": -4.563811302185059, "loss": 1.2738, "odds_ratio_loss": 0.5262596011161804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12211916595697403, "rewards/margins": 0.334261953830719, "rewards/rejected": -0.4563811421394348, "sft_loss": 1.2211915254592896, "step": 9310 }, { "epoch": 0.72, "grad_norm": 36.49144744873047, "learning_rate": 1.7831802674426813e-06, "logits/chosen": -1.2298425436019897, "logits/rejected": -1.1529333591461182, "logps/chosen": -0.874362587928772, "logps/rejected": -10.53254222869873, "loss": 0.8903, "odds_ratio_loss": 0.15956975519657135, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0874362587928772, "rewards/margins": 0.9658180475234985, "rewards/rejected": -1.053254246711731, "sft_loss": 0.874362587928772, "step": 9315 }, { "epoch": 0.73, "grad_norm": 6.323707103729248, "learning_rate": 1.7784687151527574e-06, "logits/chosen": -1.464393138885498, "logits/rejected": -1.2067110538482666, "logps/chosen": -1.0483952760696411, "logps/rejected": -7.121232032775879, "loss": 1.0594, "odds_ratio_loss": 0.10971790552139282, "rewards/accuracies": 1.0, "rewards/chosen": -0.10483952611684799, "rewards/margins": 0.6072835922241211, "rewards/rejected": -0.7121232151985168, "sft_loss": 1.0483952760696411, "step": 9320 }, { "epoch": 0.73, "grad_norm": 6.304421424865723, "learning_rate": 1.7737620487240504e-06, "logits/chosen": -1.199683666229248, "logits/rejected": -0.8703166842460632, "logps/chosen": -1.2598625421524048, "logps/rejected": -6.0696187019348145, "loss": 1.3126, "odds_ratio_loss": 0.5278078317642212, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.125986248254776, "rewards/margins": 0.4809756875038147, "rewards/rejected": -0.6069619059562683, "sft_loss": 1.2598625421524048, "step": 9325 }, { "epoch": 0.73, "grad_norm": 6.901340961456299, "learning_rate": 1.7690602752948155e-06, "logits/chosen": -1.3581178188323975, "logits/rejected": -1.2259575128555298, "logps/chosen": -0.7316224575042725, "logps/rejected": -8.98783016204834, "loss": 0.7541, "odds_ratio_loss": 0.22492511570453644, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0731622502207756, "rewards/margins": 0.8256209492683411, "rewards/rejected": -0.8987830877304077, "sft_loss": 0.7316224575042725, "step": 9330 }, { "epoch": 0.73, "grad_norm": 5.403962135314941, "learning_rate": 1.7643634019958894e-06, "logits/chosen": -1.3089879751205444, "logits/rejected": -0.7208669781684875, "logps/chosen": -0.830630898475647, "logps/rejected": -7.895847320556641, "loss": 0.8413, "odds_ratio_loss": 0.10705997794866562, "rewards/accuracies": 1.0, "rewards/chosen": -0.08306309580802917, "rewards/margins": 0.7065216302871704, "rewards/rejected": -0.789584755897522, "sft_loss": 0.830630898475647, "step": 9335 }, { "epoch": 0.73, "grad_norm": 59.112098693847656, "learning_rate": 1.7596714359506762e-06, "logits/chosen": -1.31985604763031, "logits/rejected": -1.316774606704712, "logps/chosen": -0.9269720911979675, "logps/rejected": -4.943132400512695, "loss": 0.9619, "odds_ratio_loss": 0.34920763969421387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09269721806049347, "rewards/margins": 0.40161600708961487, "rewards/rejected": -0.4943132996559143, "sft_loss": 0.9269720911979675, "step": 9340 }, { "epoch": 0.73, "grad_norm": 6.590602397918701, "learning_rate": 1.754984384275139e-06, "logits/chosen": -1.1757590770721436, "logits/rejected": -1.2967216968536377, "logps/chosen": -0.9774863123893738, "logps/rejected": -16.89197540283203, "loss": 0.9868, "odds_ratio_loss": 0.09362256526947021, "rewards/accuracies": 1.0, "rewards/chosen": -0.09774863719940186, "rewards/margins": 1.5914490222930908, "rewards/rejected": -1.6891975402832031, "sft_loss": 0.9774863123893738, "step": 9345 }, { "epoch": 0.73, "grad_norm": 7.168046951293945, "learning_rate": 1.750302254077786e-06, "logits/chosen": -1.3897745609283447, "logits/rejected": -1.1414748430252075, "logps/chosen": -0.7206248044967651, "logps/rejected": -6.595806121826172, "loss": 0.7491, "odds_ratio_loss": 0.28505057096481323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07206248492002487, "rewards/margins": 0.5875180959701538, "rewards/rejected": -0.6595805883407593, "sft_loss": 0.7206248044967651, "step": 9350 }, { "epoch": 0.73, "grad_norm": 12.575242042541504, "learning_rate": 1.7456250524596607e-06, "logits/chosen": -1.248130202293396, "logits/rejected": -1.2025890350341797, "logps/chosen": -0.7890609502792358, "logps/rejected": -9.191629409790039, "loss": 0.7918, "odds_ratio_loss": 0.02735903300344944, "rewards/accuracies": 1.0, "rewards/chosen": -0.0789061039686203, "rewards/margins": 0.8402568697929382, "rewards/rejected": -0.919162929058075, "sft_loss": 0.7890609502792358, "step": 9355 }, { "epoch": 0.73, "grad_norm": 9.834772109985352, "learning_rate": 1.7409527865143366e-06, "logits/chosen": -1.3604110479354858, "logits/rejected": -1.3642244338989258, "logps/chosen": -0.42751726508140564, "logps/rejected": -6.097275733947754, "loss": 0.4296, "odds_ratio_loss": 0.020784597843885422, "rewards/accuracies": 1.0, "rewards/chosen": -0.0427517332136631, "rewards/margins": 0.5669758915901184, "rewards/rejected": -0.6097276210784912, "sft_loss": 0.42751726508140564, "step": 9360 }, { "epoch": 0.73, "grad_norm": 10.374699592590332, "learning_rate": 1.7362854633278963e-06, "logits/chosen": -1.3058464527130127, "logits/rejected": -1.0281599760055542, "logps/chosen": -0.5291213393211365, "logps/rejected": -1.087648630142212, "loss": 0.5644, "odds_ratio_loss": 0.352711021900177, "rewards/accuracies": 1.0, "rewards/chosen": -0.052912138402462006, "rewards/margins": 0.05585271865129471, "rewards/rejected": -0.10876486450433731, "sft_loss": 0.5291213393211365, "step": 9365 }, { "epoch": 0.73, "grad_norm": 303.52349853515625, "learning_rate": 1.7316230899789266e-06, "logits/chosen": -1.299647569656372, "logits/rejected": -1.0840200185775757, "logps/chosen": -1.3607500791549683, "logps/rejected": -9.152244567871094, "loss": 1.4027, "odds_ratio_loss": 0.41933393478393555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13607501983642578, "rewards/margins": 0.7791494131088257, "rewards/rejected": -0.9152244329452515, "sft_loss": 1.3607500791549683, "step": 9370 }, { "epoch": 0.73, "grad_norm": 5.656765937805176, "learning_rate": 1.726965673538512e-06, "logits/chosen": -1.362069845199585, "logits/rejected": -0.7855729460716248, "logps/chosen": -1.309899091720581, "logps/rejected": -9.314172744750977, "loss": 1.316, "odds_ratio_loss": 0.060855478048324585, "rewards/accuracies": 1.0, "rewards/chosen": -0.1309899091720581, "rewards/margins": 0.800427258014679, "rewards/rejected": -0.9314171671867371, "sft_loss": 1.309899091720581, "step": 9375 }, { "epoch": 0.73, "grad_norm": 18.312532424926758, "learning_rate": 1.7223132210702142e-06, "logits/chosen": -1.274558663368225, "logits/rejected": -1.370939016342163, "logps/chosen": -0.8942365646362305, "logps/rejected": -11.986459732055664, "loss": 0.8956, "odds_ratio_loss": 0.013301363214850426, "rewards/accuracies": 1.0, "rewards/chosen": -0.08942366391420364, "rewards/margins": 1.109222173690796, "rewards/rejected": -1.198645830154419, "sft_loss": 0.8942365646362305, "step": 9380 }, { "epoch": 0.73, "grad_norm": 5.234900951385498, "learning_rate": 1.7176657396300667e-06, "logits/chosen": -1.3498860597610474, "logits/rejected": -1.177386999130249, "logps/chosen": -1.1110543012619019, "logps/rejected": -11.639063835144043, "loss": 1.1182, "odds_ratio_loss": 0.07124121487140656, "rewards/accuracies": 1.0, "rewards/chosen": -0.11110544204711914, "rewards/margins": 1.0528010129928589, "rewards/rejected": -1.163906455039978, "sft_loss": 1.1110543012619019, "step": 9385 }, { "epoch": 0.73, "grad_norm": 6.4449238777160645, "learning_rate": 1.7130232362665672e-06, "logits/chosen": -1.3539700508117676, "logits/rejected": -1.365997076034546, "logps/chosen": -1.0939593315124512, "logps/rejected": -4.255034923553467, "loss": 1.1402, "odds_ratio_loss": 0.4621972143650055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10939594358205795, "rewards/margins": 0.31610754132270813, "rewards/rejected": -0.4255034923553467, "sft_loss": 1.0939593315124512, "step": 9390 }, { "epoch": 0.73, "grad_norm": 49.2803840637207, "learning_rate": 1.7083857180206613e-06, "logits/chosen": -1.3713133335113525, "logits/rejected": -1.0905721187591553, "logps/chosen": -0.9607345461845398, "logps/rejected": -8.983610153198242, "loss": 0.9681, "odds_ratio_loss": 0.07401247322559357, "rewards/accuracies": 1.0, "rewards/chosen": -0.0960734561085701, "rewards/margins": 0.8022874593734741, "rewards/rejected": -0.8983610272407532, "sft_loss": 0.9607345461845398, "step": 9395 }, { "epoch": 0.73, "grad_norm": 10.100704193115234, "learning_rate": 1.7037531919257338e-06, "logits/chosen": -1.3139148950576782, "logits/rejected": -1.182771921157837, "logps/chosen": -1.1720882654190063, "logps/rejected": -11.94184684753418, "loss": 1.1866, "odds_ratio_loss": 0.14487013220787048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1172088235616684, "rewards/margins": 1.0769760608673096, "rewards/rejected": -1.1941848993301392, "sft_loss": 1.1720882654190063, "step": 9400 }, { "epoch": 0.73, "grad_norm": 10.951863288879395, "learning_rate": 1.6991256650075983e-06, "logits/chosen": -1.39067542552948, "logits/rejected": -0.8745479583740234, "logps/chosen": -1.161098599433899, "logps/rejected": -8.327936172485352, "loss": 1.1635, "odds_ratio_loss": 0.023716315627098083, "rewards/accuracies": 1.0, "rewards/chosen": -0.11610986292362213, "rewards/margins": 0.7166837453842163, "rewards/rejected": -0.8327935934066772, "sft_loss": 1.161098599433899, "step": 9405 }, { "epoch": 0.73, "grad_norm": 33.24994659423828, "learning_rate": 1.6945031442844872e-06, "logits/chosen": -1.328922986984253, "logits/rejected": -1.135591745376587, "logps/chosen": -0.8492316007614136, "logps/rejected": -5.809294700622559, "loss": 0.8688, "odds_ratio_loss": 0.19584044814109802, "rewards/accuracies": 1.0, "rewards/chosen": -0.0849231630563736, "rewards/margins": 0.4960063099861145, "rewards/rejected": -0.5809294581413269, "sft_loss": 0.8492316007614136, "step": 9410 }, { "epoch": 0.73, "grad_norm": 31.55966567993164, "learning_rate": 1.6898856367670397e-06, "logits/chosen": -1.1461572647094727, "logits/rejected": -1.0324511528015137, "logps/chosen": -0.9476990699768066, "logps/rejected": -4.202122211456299, "loss": 0.971, "odds_ratio_loss": 0.2331872433423996, "rewards/accuracies": 1.0, "rewards/chosen": -0.0947699099779129, "rewards/margins": 0.3254423439502716, "rewards/rejected": -0.4202122688293457, "sft_loss": 0.9476990699768066, "step": 9415 }, { "epoch": 0.73, "grad_norm": 4.240586280822754, "learning_rate": 1.6852731494582913e-06, "logits/chosen": -1.3576472997665405, "logits/rejected": -0.860035240650177, "logps/chosen": -1.1766688823699951, "logps/rejected": -4.946648597717285, "loss": 1.2052, "odds_ratio_loss": 0.28534621000289917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11766688525676727, "rewards/margins": 0.3769979476928711, "rewards/rejected": -0.4946648180484772, "sft_loss": 1.1766688823699951, "step": 9420 }, { "epoch": 0.73, "grad_norm": 7.833755016326904, "learning_rate": 1.6806656893536672e-06, "logits/chosen": -1.2453978061676025, "logits/rejected": -0.9850869178771973, "logps/chosen": -0.8818928003311157, "logps/rejected": -4.74990177154541, "loss": 0.9018, "odds_ratio_loss": 0.1990598738193512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08818928152322769, "rewards/margins": 0.3868009150028229, "rewards/rejected": -0.47499018907546997, "sft_loss": 0.8818928003311157, "step": 9425 }, { "epoch": 0.73, "grad_norm": 127.3130874633789, "learning_rate": 1.6760632634409647e-06, "logits/chosen": -1.1609935760498047, "logits/rejected": -0.7955946326255798, "logps/chosen": -1.1204373836517334, "logps/rejected": -4.585291862487793, "loss": 1.1288, "odds_ratio_loss": 0.08315370976924896, "rewards/accuracies": 1.0, "rewards/chosen": -0.11204373836517334, "rewards/margins": 0.3464854657649994, "rewards/rejected": -0.45852917432785034, "sft_loss": 1.1204373836517334, "step": 9430 }, { "epoch": 0.73, "grad_norm": 8.43161678314209, "learning_rate": 1.6714658787003445e-06, "logits/chosen": -1.2553160190582275, "logits/rejected": -1.1255378723144531, "logps/chosen": -1.2484910488128662, "logps/rejected": -3.9797306060791016, "loss": 1.2893, "odds_ratio_loss": 0.4080115258693695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1248491033911705, "rewards/margins": 0.27312394976615906, "rewards/rejected": -0.39797303080558777, "sft_loss": 1.2484910488128662, "step": 9435 }, { "epoch": 0.73, "grad_norm": 7.703319549560547, "learning_rate": 1.6668735421043287e-06, "logits/chosen": -1.2995141744613647, "logits/rejected": -1.1087191104888916, "logps/chosen": -1.626448392868042, "logps/rejected": -11.731626510620117, "loss": 1.638, "odds_ratio_loss": 0.11596866697072983, "rewards/accuracies": 1.0, "rewards/chosen": -0.16264484822750092, "rewards/margins": 1.0105178356170654, "rewards/rejected": -1.173162817955017, "sft_loss": 1.626448392868042, "step": 9440 }, { "epoch": 0.73, "grad_norm": 86.04496002197266, "learning_rate": 1.662286260617776e-06, "logits/chosen": -1.345144271850586, "logits/rejected": -1.0741063356399536, "logps/chosen": -0.8128703832626343, "logps/rejected": -9.815523147583008, "loss": 0.8431, "odds_ratio_loss": 0.30230069160461426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08128704130649567, "rewards/margins": 0.900265097618103, "rewards/rejected": -0.9815523028373718, "sft_loss": 0.8128703832626343, "step": 9445 }, { "epoch": 0.74, "grad_norm": 7.98433256149292, "learning_rate": 1.6577040411978817e-06, "logits/chosen": -1.4675757884979248, "logits/rejected": -1.2412128448486328, "logps/chosen": -0.8189373016357422, "logps/rejected": -7.274649143218994, "loss": 0.8294, "odds_ratio_loss": 0.10454756021499634, "rewards/accuracies": 1.0, "rewards/chosen": -0.08189372718334198, "rewards/margins": 0.645571231842041, "rewards/rejected": -0.7274649143218994, "sft_loss": 0.8189373016357422, "step": 9450 }, { "epoch": 0.74, "grad_norm": 7.043887138366699, "learning_rate": 1.653126890794164e-06, "logits/chosen": -1.4140623807907104, "logits/rejected": -1.0005137920379639, "logps/chosen": -0.7331684231758118, "logps/rejected": -2.0889861583709717, "loss": 0.7617, "odds_ratio_loss": 0.2852046489715576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07331683486700058, "rewards/margins": 0.13558180630207062, "rewards/rejected": -0.2088986337184906, "sft_loss": 0.7331684231758118, "step": 9455 }, { "epoch": 0.74, "grad_norm": 4.3914384841918945, "learning_rate": 1.6485548163484511e-06, "logits/chosen": -1.293859601020813, "logits/rejected": -0.8690057992935181, "logps/chosen": -0.7792251706123352, "logps/rejected": -3.0021920204162598, "loss": 0.8029, "odds_ratio_loss": 0.23707649111747742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0779225155711174, "rewards/margins": 0.22229667007923126, "rewards/rejected": -0.30021920800209045, "sft_loss": 0.7792251706123352, "step": 9460 }, { "epoch": 0.74, "grad_norm": 10.480997085571289, "learning_rate": 1.643987824794876e-06, "logits/chosen": -1.3863446712493896, "logits/rejected": -1.0339863300323486, "logps/chosen": -0.9150098562240601, "logps/rejected": -6.108335018157959, "loss": 0.921, "odds_ratio_loss": 0.0596047043800354, "rewards/accuracies": 1.0, "rewards/chosen": -0.09150098264217377, "rewards/margins": 0.5193325281143188, "rewards/rejected": -0.6108335256576538, "sft_loss": 0.9150098562240601, "step": 9465 }, { "epoch": 0.74, "grad_norm": 6.933332920074463, "learning_rate": 1.639425923059858e-06, "logits/chosen": -1.2239768505096436, "logits/rejected": -1.0536987781524658, "logps/chosen": -1.0001404285430908, "logps/rejected": -8.236177444458008, "loss": 1.0256, "odds_ratio_loss": 0.2543320059776306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10001404583454132, "rewards/margins": 0.7236037850379944, "rewards/rejected": -0.8236177563667297, "sft_loss": 1.0001404285430908, "step": 9470 }, { "epoch": 0.74, "grad_norm": 7.526812553405762, "learning_rate": 1.634869118062105e-06, "logits/chosen": -1.223048210144043, "logits/rejected": -1.3066279888153076, "logps/chosen": -1.0369764566421509, "logps/rejected": -10.110676765441895, "loss": 1.0437, "odds_ratio_loss": 0.06689582765102386, "rewards/accuracies": 1.0, "rewards/chosen": -0.10369764268398285, "rewards/margins": 0.9073699712753296, "rewards/rejected": -1.0110676288604736, "sft_loss": 1.0369764566421509, "step": 9475 }, { "epoch": 0.74, "grad_norm": 10.411308288574219, "learning_rate": 1.630317416712588e-06, "logits/chosen": -0.7705143094062805, "logits/rejected": -1.2210378646850586, "logps/chosen": -0.8669899702072144, "logps/rejected": -8.831012725830078, "loss": 0.8733, "odds_ratio_loss": 0.06326936930418015, "rewards/accuracies": 1.0, "rewards/chosen": -0.0866990014910698, "rewards/margins": 0.7964022755622864, "rewards/rejected": -0.8831012845039368, "sft_loss": 0.8669899702072144, "step": 9480 }, { "epoch": 0.74, "grad_norm": 752.5221557617188, "learning_rate": 1.6257708259145388e-06, "logits/chosen": -1.4490292072296143, "logits/rejected": -1.3708478212356567, "logps/chosen": -2.4949989318847656, "logps/rejected": -7.886415958404541, "loss": 2.5058, "odds_ratio_loss": 0.1081923395395279, "rewards/accuracies": 1.0, "rewards/chosen": -0.2494998723268509, "rewards/margins": 0.5391417145729065, "rewards/rejected": -0.7886415719985962, "sft_loss": 2.4949989318847656, "step": 9485 }, { "epoch": 0.74, "grad_norm": 6.341780662536621, "learning_rate": 1.621229352563442e-06, "logits/chosen": -1.4050637483596802, "logits/rejected": -0.7608194351196289, "logps/chosen": -0.8335064053535461, "logps/rejected": -9.597498893737793, "loss": 0.8375, "odds_ratio_loss": 0.03985415771603584, "rewards/accuracies": 1.0, "rewards/chosen": -0.08335064351558685, "rewards/margins": 0.8763993382453918, "rewards/rejected": -0.9597498774528503, "sft_loss": 0.8335064053535461, "step": 9490 }, { "epoch": 0.74, "grad_norm": 21.7724552154541, "learning_rate": 1.616693003547018e-06, "logits/chosen": -1.3081724643707275, "logits/rejected": -1.0145162343978882, "logps/chosen": -0.870733380317688, "logps/rejected": -7.0630693435668945, "loss": 0.8841, "odds_ratio_loss": 0.13338907063007355, "rewards/accuracies": 1.0, "rewards/chosen": -0.08707333356142044, "rewards/margins": 0.6192336082458496, "rewards/rejected": -0.7063069343566895, "sft_loss": 0.870733380317688, "step": 9495 }, { "epoch": 0.74, "grad_norm": 12.79421615600586, "learning_rate": 1.6121617857452138e-06, "logits/chosen": -1.3996565341949463, "logits/rejected": -1.309962511062622, "logps/chosen": -0.990923285484314, "logps/rejected": -1.5980056524276733, "loss": 1.0392, "odds_ratio_loss": 0.4825132489204407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09909233450889587, "rewards/margins": 0.06070823594927788, "rewards/rejected": -0.15980055928230286, "sft_loss": 0.990923285484314, "step": 9500 }, { "epoch": 0.74, "grad_norm": 88.10269927978516, "learning_rate": 1.6076357060301995e-06, "logits/chosen": -1.284991979598999, "logits/rejected": -1.2980594635009766, "logps/chosen": -0.7582489848136902, "logps/rejected": -4.3230133056640625, "loss": 0.7667, "odds_ratio_loss": 0.08428351581096649, "rewards/accuracies": 1.0, "rewards/chosen": -0.07582490146160126, "rewards/margins": 0.35647648572921753, "rewards/rejected": -0.4323013722896576, "sft_loss": 0.7582489848136902, "step": 9505 }, { "epoch": 0.74, "grad_norm": 6.083242893218994, "learning_rate": 1.6031147712663487e-06, "logits/chosen": -1.2613264322280884, "logits/rejected": -1.400712490081787, "logps/chosen": -1.0994956493377686, "logps/rejected": -8.263212203979492, "loss": 1.1013, "odds_ratio_loss": 0.01845996081829071, "rewards/accuracies": 1.0, "rewards/chosen": -0.10994956642389297, "rewards/margins": 0.7163716554641724, "rewards/rejected": -0.8263211250305176, "sft_loss": 1.0994956493377686, "step": 9510 }, { "epoch": 0.74, "grad_norm": 6.732654094696045, "learning_rate": 1.5985989883102343e-06, "logits/chosen": -1.3507130146026611, "logits/rejected": -1.2158911228179932, "logps/chosen": -1.0793886184692383, "logps/rejected": -7.2026190757751465, "loss": 1.0963, "odds_ratio_loss": 0.16960716247558594, "rewards/accuracies": 1.0, "rewards/chosen": -0.10793887078762054, "rewards/margins": 0.6123231053352356, "rewards/rejected": -0.7202619314193726, "sft_loss": 1.0793886184692383, "step": 9515 }, { "epoch": 0.74, "grad_norm": 7.203071594238281, "learning_rate": 1.5940883640106091e-06, "logits/chosen": -1.3774728775024414, "logits/rejected": -1.1365160942077637, "logps/chosen": -0.8118964433670044, "logps/rejected": -4.104200839996338, "loss": 0.842, "odds_ratio_loss": 0.30122292041778564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08118964731693268, "rewards/margins": 0.3292304277420044, "rewards/rejected": -0.4104200303554535, "sft_loss": 0.8118964433670044, "step": 9520 }, { "epoch": 0.74, "grad_norm": 24.832763671875, "learning_rate": 1.5895829052084132e-06, "logits/chosen": -1.3976446390151978, "logits/rejected": -1.1410542726516724, "logps/chosen": -1.0454027652740479, "logps/rejected": -4.984982490539551, "loss": 1.0612, "odds_ratio_loss": 0.15799330174922943, "rewards/accuracies": 1.0, "rewards/chosen": -0.10454028844833374, "rewards/margins": 0.3939579427242279, "rewards/rejected": -0.49849826097488403, "sft_loss": 1.0454027652740479, "step": 9525 }, { "epoch": 0.74, "grad_norm": 23.104721069335938, "learning_rate": 1.5850826187367452e-06, "logits/chosen": -1.245971918106079, "logits/rejected": -1.221071720123291, "logps/chosen": -0.7174273729324341, "logps/rejected": -4.505877494812012, "loss": 0.73, "odds_ratio_loss": 0.12609794735908508, "rewards/accuracies": 1.0, "rewards/chosen": -0.07174274325370789, "rewards/margins": 0.3788450062274933, "rewards/rejected": -0.45058774948120117, "sft_loss": 0.7174273729324341, "step": 9530 }, { "epoch": 0.74, "grad_norm": 6.791812896728516, "learning_rate": 1.5805875114208586e-06, "logits/chosen": -1.4716187715530396, "logits/rejected": -0.8964160680770874, "logps/chosen": -0.8372504115104675, "logps/rejected": -9.657258987426758, "loss": 0.8378, "odds_ratio_loss": 0.005763564258813858, "rewards/accuracies": 1.0, "rewards/chosen": -0.08372504264116287, "rewards/margins": 0.8820008039474487, "rewards/rejected": -0.9657258987426758, "sft_loss": 0.8372504115104675, "step": 9535 }, { "epoch": 0.74, "grad_norm": 8.337750434875488, "learning_rate": 1.5760975900781582e-06, "logits/chosen": -1.3692501783370972, "logits/rejected": -1.2153772115707397, "logps/chosen": -0.7844537496566772, "logps/rejected": -6.409913063049316, "loss": 0.7908, "odds_ratio_loss": 0.06374012678861618, "rewards/accuracies": 1.0, "rewards/chosen": -0.07844537496566772, "rewards/margins": 0.562545895576477, "rewards/rejected": -0.6409912705421448, "sft_loss": 0.7844537496566772, "step": 9540 }, { "epoch": 0.74, "grad_norm": 9.413233757019043, "learning_rate": 1.5716128615181786e-06, "logits/chosen": -1.4353245496749878, "logits/rejected": -1.217750072479248, "logps/chosen": -1.0690791606903076, "logps/rejected": -5.4481353759765625, "loss": 1.0838, "odds_ratio_loss": 0.1469026356935501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.106907919049263, "rewards/margins": 0.43790560960769653, "rewards/rejected": -0.5448135137557983, "sft_loss": 1.0690791606903076, "step": 9545 }, { "epoch": 0.74, "grad_norm": 5.308843612670898, "learning_rate": 1.5671333325425775e-06, "logits/chosen": -1.2684317827224731, "logits/rejected": -1.0575990676879883, "logps/chosen": -0.9879388809204102, "logps/rejected": -9.218892097473145, "loss": 1.0006, "odds_ratio_loss": 0.12698756158351898, "rewards/accuracies": 1.0, "rewards/chosen": -0.0987938940525055, "rewards/margins": 0.8230952024459839, "rewards/rejected": -0.9218891859054565, "sft_loss": 0.9879388809204102, "step": 9550 }, { "epoch": 0.74, "grad_norm": 18.182369232177734, "learning_rate": 1.5626590099451329e-06, "logits/chosen": -1.2446739673614502, "logits/rejected": -1.1840369701385498, "logps/chosen": -1.1093944311141968, "logps/rejected": -9.49000358581543, "loss": 1.1481, "odds_ratio_loss": 0.3874703347682953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11093944311141968, "rewards/margins": 0.8380608558654785, "rewards/rejected": -0.9490002393722534, "sft_loss": 1.1093944311141968, "step": 9555 }, { "epoch": 0.74, "grad_norm": 41.64733123779297, "learning_rate": 1.5581899005117212e-06, "logits/chosen": -1.0101242065429688, "logits/rejected": -1.3759095668792725, "logps/chosen": -0.942404568195343, "logps/rejected": -7.333226680755615, "loss": 0.9562, "odds_ratio_loss": 0.13792428374290466, "rewards/accuracies": 1.0, "rewards/chosen": -0.0942404493689537, "rewards/margins": 0.6390821933746338, "rewards/rejected": -0.7333226203918457, "sft_loss": 0.942404568195343, "step": 9560 }, { "epoch": 0.74, "grad_norm": 93.02265930175781, "learning_rate": 1.553726011020315e-06, "logits/chosen": -1.3022782802581787, "logits/rejected": -0.844427227973938, "logps/chosen": -0.8994660377502441, "logps/rejected": -2.6767678260803223, "loss": 0.9391, "odds_ratio_loss": 0.39630061388015747, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08994659781455994, "rewards/margins": 0.17773017287254333, "rewards/rejected": -0.26767677068710327, "sft_loss": 0.8994660377502441, "step": 9565 }, { "epoch": 0.74, "grad_norm": 7.572153091430664, "learning_rate": 1.5492673482409693e-06, "logits/chosen": -1.386723518371582, "logits/rejected": -1.2708041667938232, "logps/chosen": -0.9327977895736694, "logps/rejected": -8.727251052856445, "loss": 0.9405, "odds_ratio_loss": 0.07732054591178894, "rewards/accuracies": 1.0, "rewards/chosen": -0.09327977895736694, "rewards/margins": 0.7794452905654907, "rewards/rejected": -0.8727251291275024, "sft_loss": 0.9327977895736694, "step": 9570 }, { "epoch": 0.74, "grad_norm": 6.863697052001953, "learning_rate": 1.5448139189358114e-06, "logits/chosen": -1.4256162643432617, "logits/rejected": -0.9826396703720093, "logps/chosen": -0.9701846837997437, "logps/rejected": -4.691870212554932, "loss": 0.9802, "odds_ratio_loss": 0.1004917174577713, "rewards/accuracies": 1.0, "rewards/chosen": -0.09701846539974213, "rewards/margins": 0.37216854095458984, "rewards/rejected": -0.4691869616508484, "sft_loss": 0.9701846837997437, "step": 9575 }, { "epoch": 0.75, "grad_norm": 22.540124893188477, "learning_rate": 1.5403657298590335e-06, "logits/chosen": -1.4844796657562256, "logits/rejected": -1.193983793258667, "logps/chosen": -0.8730993270874023, "logps/rejected": -3.906749725341797, "loss": 0.9163, "odds_ratio_loss": 0.4321001470088959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08730993419885635, "rewards/margins": 0.3033650517463684, "rewards/rejected": -0.39067500829696655, "sft_loss": 0.8730993270874023, "step": 9580 }, { "epoch": 0.75, "grad_norm": 56.619239807128906, "learning_rate": 1.5359227877568766e-06, "logits/chosen": -1.3769557476043701, "logits/rejected": -0.7135136127471924, "logps/chosen": -1.2803035974502563, "logps/rejected": -6.4957756996154785, "loss": 1.3857, "odds_ratio_loss": 1.0542762279510498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12803034484386444, "rewards/margins": 0.5215471982955933, "rewards/rejected": -0.6495774984359741, "sft_loss": 1.2803035974502563, "step": 9585 }, { "epoch": 0.75, "grad_norm": 8.903173446655273, "learning_rate": 1.53148509936763e-06, "logits/chosen": -1.2418177127838135, "logits/rejected": -1.2458902597427368, "logps/chosen": -0.7455726861953735, "logps/rejected": -8.08189582824707, "loss": 0.7642, "odds_ratio_loss": 0.1859380453824997, "rewards/accuracies": 1.0, "rewards/chosen": -0.07455727458000183, "rewards/margins": 0.7336322069168091, "rewards/rejected": -0.8081895112991333, "sft_loss": 0.7455726861953735, "step": 9590 }, { "epoch": 0.75, "grad_norm": 13.163384437561035, "learning_rate": 1.5270526714216106e-06, "logits/chosen": -1.321645975112915, "logits/rejected": -1.3027857542037964, "logps/chosen": -1.1771001815795898, "logps/rejected": -24.939193725585938, "loss": 1.1771, "odds_ratio_loss": 0.00014036455831956118, "rewards/accuracies": 1.0, "rewards/chosen": -0.11771001666784286, "rewards/margins": 2.376209259033203, "rewards/rejected": -2.4939193725585938, "sft_loss": 1.1771001815795898, "step": 9595 }, { "epoch": 0.75, "grad_norm": 22.36627960205078, "learning_rate": 1.5226255106411553e-06, "logits/chosen": -1.3507254123687744, "logits/rejected": -1.065592646598816, "logps/chosen": -0.9857769012451172, "logps/rejected": -4.691664695739746, "loss": 0.9978, "odds_ratio_loss": 0.12072012573480606, "rewards/accuracies": 1.0, "rewards/chosen": -0.09857769310474396, "rewards/margins": 0.37058883905410767, "rewards/rejected": -0.46916651725769043, "sft_loss": 0.9857769012451172, "step": 9600 }, { "epoch": 0.75, "grad_norm": 10.55109691619873, "learning_rate": 1.51820362374062e-06, "logits/chosen": -1.4908877611160278, "logits/rejected": -1.7430016994476318, "logps/chosen": -0.9867733120918274, "logps/rejected": -8.725939750671387, "loss": 0.9872, "odds_ratio_loss": 0.0038646336179226637, "rewards/accuracies": 1.0, "rewards/chosen": -0.09867732971906662, "rewards/margins": 0.7739167213439941, "rewards/rejected": -0.8725940585136414, "sft_loss": 0.9867733120918274, "step": 9605 }, { "epoch": 0.75, "grad_norm": 12.481772422790527, "learning_rate": 1.5137870174263547e-06, "logits/chosen": -1.3180853128433228, "logits/rejected": -1.3651639223098755, "logps/chosen": -0.7774752974510193, "logps/rejected": -2.9693262577056885, "loss": 0.795, "odds_ratio_loss": 0.17541441321372986, "rewards/accuracies": 1.0, "rewards/chosen": -0.07774752378463745, "rewards/margins": 0.21918511390686035, "rewards/rejected": -0.2969326376914978, "sft_loss": 0.7774752974510193, "step": 9610 }, { "epoch": 0.75, "grad_norm": 25.910600662231445, "learning_rate": 1.5093756983967035e-06, "logits/chosen": -1.425577163696289, "logits/rejected": -1.1789932250976562, "logps/chosen": -1.0428050756454468, "logps/rejected": -3.9875640869140625, "loss": 1.0858, "odds_ratio_loss": 0.42989516258239746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1042805090546608, "rewards/margins": 0.2944759130477905, "rewards/rejected": -0.3987564146518707, "sft_loss": 1.0428050756454468, "step": 9615 }, { "epoch": 0.75, "grad_norm": 11.152374267578125, "learning_rate": 1.5049696733419938e-06, "logits/chosen": -1.2368385791778564, "logits/rejected": -0.9455921053886414, "logps/chosen": -1.049968957901001, "logps/rejected": -3.453242778778076, "loss": 1.0725, "odds_ratio_loss": 0.22488944232463837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10499688237905502, "rewards/margins": 0.2403274029493332, "rewards/rejected": -0.34532430768013, "sft_loss": 1.049968957901001, "step": 9620 }, { "epoch": 0.75, "grad_norm": 7.991733551025391, "learning_rate": 1.5005689489445208e-06, "logits/chosen": -1.086672067642212, "logits/rejected": -1.2593472003936768, "logps/chosen": -1.281997561454773, "logps/rejected": -5.90949010848999, "loss": 1.2998, "odds_ratio_loss": 0.17808976769447327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1281997561454773, "rewards/margins": 0.46274930238723755, "rewards/rejected": -0.5909490585327148, "sft_loss": 1.281997561454773, "step": 9625 }, { "epoch": 0.75, "grad_norm": 18.611215591430664, "learning_rate": 1.4961735318785415e-06, "logits/chosen": -1.2058672904968262, "logits/rejected": -1.2779263257980347, "logps/chosen": -1.0617318153381348, "logps/rejected": -3.314312696456909, "loss": 1.1067, "odds_ratio_loss": 0.4496592879295349, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10617318004369736, "rewards/margins": 0.22525811195373535, "rewards/rejected": -0.3314313292503357, "sft_loss": 1.0617318153381348, "step": 9630 }, { "epoch": 0.75, "grad_norm": 4.113557815551758, "learning_rate": 1.4917834288102646e-06, "logits/chosen": -1.3224985599517822, "logits/rejected": -0.7819467782974243, "logps/chosen": -1.22843337059021, "logps/rejected": -7.48949670791626, "loss": 1.2421, "odds_ratio_loss": 0.13632090389728546, "rewards/accuracies": 1.0, "rewards/chosen": -0.12284334748983383, "rewards/margins": 0.6261062622070312, "rewards/rejected": -0.7489496469497681, "sft_loss": 1.22843337059021, "step": 9635 }, { "epoch": 0.75, "grad_norm": 9.185248374938965, "learning_rate": 1.4873986463978386e-06, "logits/chosen": -1.2555792331695557, "logits/rejected": -1.1100329160690308, "logps/chosen": -1.0447484254837036, "logps/rejected": -4.491857528686523, "loss": 1.0741, "odds_ratio_loss": 0.29385143518447876, "rewards/accuracies": 1.0, "rewards/chosen": -0.10447484254837036, "rewards/margins": 0.34471091628074646, "rewards/rejected": -0.4491857588291168, "sft_loss": 1.0447484254837036, "step": 9640 }, { "epoch": 0.75, "grad_norm": 5.883439540863037, "learning_rate": 1.4830191912913421e-06, "logits/chosen": -1.2748113870620728, "logits/rejected": -1.396545648574829, "logps/chosen": -1.0344129800796509, "logps/rejected": -6.886133670806885, "loss": 1.0502, "odds_ratio_loss": 0.1582815945148468, "rewards/accuracies": 1.0, "rewards/chosen": -0.10344129800796509, "rewards/margins": 0.5851720571517944, "rewards/rejected": -0.6886133551597595, "sft_loss": 1.0344129800796509, "step": 9645 }, { "epoch": 0.75, "grad_norm": 23.18375587463379, "learning_rate": 1.4786450701327742e-06, "logits/chosen": -1.3845680952072144, "logits/rejected": -0.9784570932388306, "logps/chosen": -0.9990909695625305, "logps/rejected": -5.884314060211182, "loss": 1.0163, "odds_ratio_loss": 0.1719394028186798, "rewards/accuracies": 1.0, "rewards/chosen": -0.09990908950567245, "rewards/margins": 0.4885222911834717, "rewards/rejected": -0.5884313583374023, "sft_loss": 0.9990909695625305, "step": 9650 }, { "epoch": 0.75, "grad_norm": 7.935603141784668, "learning_rate": 1.4742762895560476e-06, "logits/chosen": -1.3818585872650146, "logits/rejected": -1.1174324750900269, "logps/chosen": -0.7408406138420105, "logps/rejected": -3.53485107421875, "loss": 0.7542, "odds_ratio_loss": 0.13402345776557922, "rewards/accuracies": 1.0, "rewards/chosen": -0.07408406585454941, "rewards/margins": 0.279401034116745, "rewards/rejected": -0.353485107421875, "sft_loss": 0.7408406138420105, "step": 9655 }, { "epoch": 0.75, "grad_norm": 9.305075645446777, "learning_rate": 1.4699128561869708e-06, "logits/chosen": -1.294163703918457, "logits/rejected": -0.9440513849258423, "logps/chosen": -1.1079423427581787, "logps/rejected": -6.321938991546631, "loss": 1.1293, "odds_ratio_loss": 0.21366176009178162, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11079423129558563, "rewards/margins": 0.5213996171951294, "rewards/rejected": -0.632193922996521, "sft_loss": 1.1079423427581787, "step": 9660 }, { "epoch": 0.75, "grad_norm": 23.341026306152344, "learning_rate": 1.4655547766432437e-06, "logits/chosen": -1.4045528173446655, "logits/rejected": -1.3434903621673584, "logps/chosen": -1.1345020532608032, "logps/rejected": -7.883443355560303, "loss": 1.1512, "odds_ratio_loss": 0.16651661694049835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11345021426677704, "rewards/margins": 0.6748942136764526, "rewards/rejected": -0.7883443236351013, "sft_loss": 1.1345020532608032, "step": 9665 }, { "epoch": 0.75, "grad_norm": 9.361612319946289, "learning_rate": 1.4612020575344499e-06, "logits/chosen": -1.3855892419815063, "logits/rejected": -0.5439692139625549, "logps/chosen": -0.8594552874565125, "logps/rejected": -5.760911464691162, "loss": 0.8727, "odds_ratio_loss": 0.13233062624931335, "rewards/accuracies": 1.0, "rewards/chosen": -0.08594552427530289, "rewards/margins": 0.49014562368392944, "rewards/rejected": -0.5760911703109741, "sft_loss": 0.8594552874565125, "step": 9670 }, { "epoch": 0.75, "grad_norm": 39.15553665161133, "learning_rate": 1.4568547054620392e-06, "logits/chosen": -1.3476572036743164, "logits/rejected": -0.8295741081237793, "logps/chosen": -1.328378438949585, "logps/rejected": -2.901322841644287, "loss": 1.3614, "odds_ratio_loss": 0.32974696159362793, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13283784687519073, "rewards/margins": 0.15729445219039917, "rewards/rejected": -0.2901323139667511, "sft_loss": 1.328378438949585, "step": 9675 }, { "epoch": 0.75, "grad_norm": 8.7423677444458, "learning_rate": 1.452512727019323e-06, "logits/chosen": -1.3339862823486328, "logits/rejected": -0.887398898601532, "logps/chosen": -0.948280930519104, "logps/rejected": -6.027867317199707, "loss": 0.9603, "odds_ratio_loss": 0.12048999965190887, "rewards/accuracies": 1.0, "rewards/chosen": -0.09482809156179428, "rewards/margins": 0.5079585909843445, "rewards/rejected": -0.6027867197990417, "sft_loss": 0.948280930519104, "step": 9680 }, { "epoch": 0.75, "grad_norm": 25.774412155151367, "learning_rate": 1.4481761287914625e-06, "logits/chosen": -1.239070177078247, "logits/rejected": -1.229046106338501, "logps/chosen": -1.0665532350540161, "logps/rejected": -4.786002159118652, "loss": 1.0929, "odds_ratio_loss": 0.2636137902736664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1066553145647049, "rewards/margins": 0.3719449043273926, "rewards/rejected": -0.4786002039909363, "sft_loss": 1.0665532350540161, "step": 9685 }, { "epoch": 0.75, "grad_norm": 26.393348693847656, "learning_rate": 1.4438449173554597e-06, "logits/chosen": -1.2464635372161865, "logits/rejected": -1.4154313802719116, "logps/chosen": -1.031361699104309, "logps/rejected": -4.2127685546875, "loss": 1.0615, "odds_ratio_loss": 0.30091702938079834, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10313616693019867, "rewards/margins": 0.3181407153606415, "rewards/rejected": -0.42127689719200134, "sft_loss": 1.031361699104309, "step": 9690 }, { "epoch": 0.75, "grad_norm": 6.652644157409668, "learning_rate": 1.4395190992801456e-06, "logits/chosen": -1.371250033378601, "logits/rejected": -0.7872442603111267, "logps/chosen": -1.035787582397461, "logps/rejected": -13.284795761108398, "loss": 1.0411, "odds_ratio_loss": 0.05353992059826851, "rewards/accuracies": 1.0, "rewards/chosen": -0.10357876867055893, "rewards/margins": 1.2249009609222412, "rewards/rejected": -1.3284796476364136, "sft_loss": 1.035787582397461, "step": 9695 }, { "epoch": 0.75, "grad_norm": 9.1168794631958, "learning_rate": 1.4351986811261753e-06, "logits/chosen": -1.2818351984024048, "logits/rejected": -1.6169687509536743, "logps/chosen": -0.7868584990501404, "logps/rejected": -10.137491226196289, "loss": 0.788, "odds_ratio_loss": 0.01179027371108532, "rewards/accuracies": 1.0, "rewards/chosen": -0.07868585735559464, "rewards/margins": 0.9350631833076477, "rewards/rejected": -1.0137490034103394, "sft_loss": 0.7868584990501404, "step": 9700 }, { "epoch": 0.75, "grad_norm": 9.317264556884766, "learning_rate": 1.43088366944601e-06, "logits/chosen": -1.3212807178497314, "logits/rejected": -1.1130956411361694, "logps/chosen": -0.832508385181427, "logps/rejected": -10.450431823730469, "loss": 0.8588, "odds_ratio_loss": 0.2629583179950714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08325083553791046, "rewards/margins": 0.9617922902107239, "rewards/rejected": -1.0450432300567627, "sft_loss": 0.832508385181427, "step": 9705 }, { "epoch": 0.76, "grad_norm": 9.495530128479004, "learning_rate": 1.4265740707839127e-06, "logits/chosen": -1.3654181957244873, "logits/rejected": -1.3107173442840576, "logps/chosen": -1.4224342107772827, "logps/rejected": -12.950488090515137, "loss": 1.4268, "odds_ratio_loss": 0.04388565570116043, "rewards/accuracies": 1.0, "rewards/chosen": -0.142243430018425, "rewards/margins": 1.1528054475784302, "rewards/rejected": -1.2950489521026611, "sft_loss": 1.4224342107772827, "step": 9710 }, { "epoch": 0.76, "grad_norm": 8.009198188781738, "learning_rate": 1.4222698916759347e-06, "logits/chosen": -1.3352587223052979, "logits/rejected": -1.2217845916748047, "logps/chosen": -0.9053753018379211, "logps/rejected": -6.1464009284973145, "loss": 0.9398, "odds_ratio_loss": 0.3441086411476135, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09053752571344376, "rewards/margins": 0.5241025686264038, "rewards/rejected": -0.6146401166915894, "sft_loss": 0.9053753018379211, "step": 9715 }, { "epoch": 0.76, "grad_norm": 46.444496154785156, "learning_rate": 1.4179711386499145e-06, "logits/chosen": -1.1560405492782593, "logits/rejected": -1.4345853328704834, "logps/chosen": -1.78522527217865, "logps/rejected": -6.893980503082275, "loss": 1.8423, "odds_ratio_loss": 0.5703558921813965, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17852254211902618, "rewards/margins": 0.5108754634857178, "rewards/rejected": -0.689397931098938, "sft_loss": 1.78522527217865, "step": 9720 }, { "epoch": 0.76, "grad_norm": 13.907815933227539, "learning_rate": 1.413677818225454e-06, "logits/chosen": -1.3137757778167725, "logits/rejected": -1.4432036876678467, "logps/chosen": -0.8441891670227051, "logps/rejected": -4.315835952758789, "loss": 0.8707, "odds_ratio_loss": 0.2653736472129822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08441893011331558, "rewards/margins": 0.34716469049453735, "rewards/rejected": -0.43158358335494995, "sft_loss": 0.8441891670227051, "step": 9725 }, { "epoch": 0.76, "grad_norm": 60.72998046875, "learning_rate": 1.409389936913918e-06, "logits/chosen": -1.379272699356079, "logits/rejected": -1.3644345998764038, "logps/chosen": -1.0330593585968018, "logps/rejected": -6.766173362731934, "loss": 1.0723, "odds_ratio_loss": 0.39229878783226013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10330593585968018, "rewards/margins": 0.573311448097229, "rewards/rejected": -0.6766173839569092, "sft_loss": 1.0330593585968018, "step": 9730 }, { "epoch": 0.76, "grad_norm": 9.962238311767578, "learning_rate": 1.4051075012184262e-06, "logits/chosen": -1.0592668056488037, "logits/rejected": -1.1845660209655762, "logps/chosen": -1.181274175643921, "logps/rejected": -4.246553897857666, "loss": 1.2449, "odds_ratio_loss": 0.6367109417915344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11812742054462433, "rewards/margins": 0.30652791261672974, "rewards/rejected": -0.42465537786483765, "sft_loss": 1.181274175643921, "step": 9735 }, { "epoch": 0.76, "grad_norm": 5.543389797210693, "learning_rate": 1.4008305176338337e-06, "logits/chosen": -1.1186319589614868, "logits/rejected": -0.8160092234611511, "logps/chosen": -0.6612199544906616, "logps/rejected": -6.092648506164551, "loss": 0.6635, "odds_ratio_loss": 0.02256493642926216, "rewards/accuracies": 1.0, "rewards/chosen": -0.06612200289964676, "rewards/margins": 0.5431429147720337, "rewards/rejected": -0.6092648506164551, "sft_loss": 0.6612199544906616, "step": 9740 }, { "epoch": 0.76, "grad_norm": 7.234911918640137, "learning_rate": 1.39655899264673e-06, "logits/chosen": -1.37042236328125, "logits/rejected": -0.9824494123458862, "logps/chosen": -0.8050609827041626, "logps/rejected": -4.500408172607422, "loss": 0.8205, "odds_ratio_loss": 0.1539815068244934, "rewards/accuracies": 1.0, "rewards/chosen": -0.0805060938000679, "rewards/margins": 0.36953476071357727, "rewards/rejected": -0.45004087686538696, "sft_loss": 0.8050609827041626, "step": 9745 }, { "epoch": 0.76, "grad_norm": 15.264321327209473, "learning_rate": 1.3922929327354245e-06, "logits/chosen": -1.4009451866149902, "logits/rejected": -1.0183489322662354, "logps/chosen": -1.019995927810669, "logps/rejected": -4.73303747177124, "loss": 1.0445, "odds_ratio_loss": 0.24479708075523376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10199960321187973, "rewards/margins": 0.37130412459373474, "rewards/rejected": -0.47330373525619507, "sft_loss": 1.019995927810669, "step": 9750 }, { "epoch": 0.76, "grad_norm": 30.697858810424805, "learning_rate": 1.388032344369939e-06, "logits/chosen": -1.3964107036590576, "logits/rejected": -1.2231998443603516, "logps/chosen": -0.7747622728347778, "logps/rejected": -4.604029655456543, "loss": 0.7883, "odds_ratio_loss": 0.13550665974617004, "rewards/accuracies": 1.0, "rewards/chosen": -0.07747622579336166, "rewards/margins": 0.3829267621040344, "rewards/rejected": -0.4604029655456543, "sft_loss": 0.7747622728347778, "step": 9755 }, { "epoch": 0.76, "grad_norm": 100.62313842773438, "learning_rate": 1.3837772340119959e-06, "logits/chosen": -1.1030269861221313, "logits/rejected": -0.8924986124038696, "logps/chosen": -1.0514342784881592, "logps/rejected": -3.5023257732391357, "loss": 1.1183, "odds_ratio_loss": 0.6688185930252075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10514342784881592, "rewards/margins": 0.24508914351463318, "rewards/rejected": -0.3502325713634491, "sft_loss": 1.0514342784881592, "step": 9760 }, { "epoch": 0.76, "grad_norm": 6.77316427230835, "learning_rate": 1.37952760811501e-06, "logits/chosen": -1.3329914808273315, "logits/rejected": -1.0536625385284424, "logps/chosen": -1.2876416444778442, "logps/rejected": -7.932206153869629, "loss": 1.289, "odds_ratio_loss": 0.013573619537055492, "rewards/accuracies": 1.0, "rewards/chosen": -0.12876416742801666, "rewards/margins": 0.6644564270973206, "rewards/rejected": -0.793220579624176, "sft_loss": 1.2876416444778442, "step": 9765 }, { "epoch": 0.76, "grad_norm": 20.56844139099121, "learning_rate": 1.375283473124081e-06, "logits/chosen": -1.3427093029022217, "logits/rejected": -1.0028154850006104, "logps/chosen": -1.0161627531051636, "logps/rejected": -6.620965480804443, "loss": 1.0465, "odds_ratio_loss": 0.30296653509140015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1016162857413292, "rewards/margins": 0.5604802966117859, "rewards/rejected": -0.6620966196060181, "sft_loss": 1.0161627531051636, "step": 9770 }, { "epoch": 0.76, "grad_norm": 6.547877311706543, "learning_rate": 1.371044835475977e-06, "logits/chosen": -1.4146158695220947, "logits/rejected": -0.9940034747123718, "logps/chosen": -0.9952043294906616, "logps/rejected": -5.0639848709106445, "loss": 1.0029, "odds_ratio_loss": 0.07706589251756668, "rewards/accuracies": 1.0, "rewards/chosen": -0.09952044486999512, "rewards/margins": 0.4068779945373535, "rewards/rejected": -0.5063984990119934, "sft_loss": 0.9952043294906616, "step": 9775 }, { "epoch": 0.76, "grad_norm": 52.96457290649414, "learning_rate": 1.3668117015991284e-06, "logits/chosen": -1.3061707019805908, "logits/rejected": -1.382652997970581, "logps/chosen": -1.1862385272979736, "logps/rejected": -5.084000587463379, "loss": 1.2115, "odds_ratio_loss": 0.2521643042564392, "rewards/accuracies": 1.0, "rewards/chosen": -0.11862385272979736, "rewards/margins": 0.38977622985839844, "rewards/rejected": -0.5084000825881958, "sft_loss": 1.1862385272979736, "step": 9780 }, { "epoch": 0.76, "grad_norm": 11.038567543029785, "learning_rate": 1.3625840779136235e-06, "logits/chosen": -1.4083950519561768, "logits/rejected": -1.0077704191207886, "logps/chosen": -0.9693604707717896, "logps/rejected": -7.7081298828125, "loss": 0.9708, "odds_ratio_loss": 0.014776247553527355, "rewards/accuracies": 1.0, "rewards/chosen": -0.09693604707717896, "rewards/margins": 0.673876941204071, "rewards/rejected": -0.7708130478858948, "sft_loss": 0.9693604707717896, "step": 9785 }, { "epoch": 0.76, "grad_norm": 4.925965785980225, "learning_rate": 1.358361970831188e-06, "logits/chosen": -1.3355958461761475, "logits/rejected": -0.8679085969924927, "logps/chosen": -0.9071223139762878, "logps/rejected": -7.300946235656738, "loss": 0.9184, "odds_ratio_loss": 0.11292095482349396, "rewards/accuracies": 1.0, "rewards/chosen": -0.09071223437786102, "rewards/margins": 0.6393824219703674, "rewards/rejected": -0.7300946116447449, "sft_loss": 0.9071223139762878, "step": 9790 }, { "epoch": 0.76, "grad_norm": 14.827608108520508, "learning_rate": 1.3541453867551851e-06, "logits/chosen": -1.3884003162384033, "logits/rejected": -1.0902457237243652, "logps/chosen": -1.028895378112793, "logps/rejected": -4.301610946655273, "loss": 1.0612, "odds_ratio_loss": 0.32345858216285706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1028895378112793, "rewards/margins": 0.32727161049842834, "rewards/rejected": -0.43016114830970764, "sft_loss": 1.028895378112793, "step": 9795 }, { "epoch": 0.76, "grad_norm": 327.2066650390625, "learning_rate": 1.3499343320805986e-06, "logits/chosen": -1.3086704015731812, "logits/rejected": -0.8594743013381958, "logps/chosen": -1.4803388118743896, "logps/rejected": -8.974512100219727, "loss": 1.4996, "odds_ratio_loss": 0.1922084391117096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14803388714790344, "rewards/margins": 0.7494173049926758, "rewards/rejected": -0.8974512219429016, "sft_loss": 1.4803388118743896, "step": 9800 }, { "epoch": 0.76, "grad_norm": 3.762913227081299, "learning_rate": 1.3457288131940276e-06, "logits/chosen": -1.0716235637664795, "logits/rejected": -1.4478861093521118, "logps/chosen": -0.8366058468818665, "logps/rejected": -10.235267639160156, "loss": 0.8395, "odds_ratio_loss": 0.029129063710570335, "rewards/accuracies": 1.0, "rewards/chosen": -0.08366059511899948, "rewards/margins": 0.9398662447929382, "rewards/rejected": -1.0235267877578735, "sft_loss": 0.8366058468818665, "step": 9805 }, { "epoch": 0.76, "grad_norm": 32.673152923583984, "learning_rate": 1.3415288364736746e-06, "logits/chosen": -1.438460350036621, "logits/rejected": -1.3678359985351562, "logps/chosen": -0.791472315788269, "logps/rejected": -2.52308988571167, "loss": 0.8313, "odds_ratio_loss": 0.39869898557662964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07914721965789795, "rewards/margins": 0.17316177487373352, "rewards/rejected": -0.25230899453163147, "sft_loss": 0.791472315788269, "step": 9810 }, { "epoch": 0.76, "grad_norm": 6.212176322937012, "learning_rate": 1.3373344082893403e-06, "logits/chosen": -1.3597334623336792, "logits/rejected": -1.017281413078308, "logps/chosen": -0.8786466717720032, "logps/rejected": -3.898221969604492, "loss": 0.9047, "odds_ratio_loss": 0.2601260840892792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08786466717720032, "rewards/margins": 0.30195754766464233, "rewards/rejected": -0.38982218503952026, "sft_loss": 0.8786466717720032, "step": 9815 }, { "epoch": 0.76, "grad_norm": 63.43842315673828, "learning_rate": 1.3331455350024059e-06, "logits/chosen": -1.2316055297851562, "logits/rejected": -0.9201291799545288, "logps/chosen": -1.0224696397781372, "logps/rejected": -5.860587120056152, "loss": 1.053, "odds_ratio_loss": 0.3048619031906128, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1022469624876976, "rewards/margins": 0.48381170630455017, "rewards/rejected": -0.5860587358474731, "sft_loss": 1.0224696397781372, "step": 9820 }, { "epoch": 0.76, "grad_norm": 6.886044979095459, "learning_rate": 1.3289622229658294e-06, "logits/chosen": -1.3224356174468994, "logits/rejected": -1.299999713897705, "logps/chosen": -0.9050876498222351, "logps/rejected": -7.530638694763184, "loss": 0.9165, "odds_ratio_loss": 0.11380696296691895, "rewards/accuracies": 1.0, "rewards/chosen": -0.09050877392292023, "rewards/margins": 0.6625550985336304, "rewards/rejected": -0.7530638575553894, "sft_loss": 0.9050876498222351, "step": 9825 }, { "epoch": 0.76, "grad_norm": 27.36313819885254, "learning_rate": 1.3247844785241336e-06, "logits/chosen": -1.294485330581665, "logits/rejected": -0.9901436567306519, "logps/chosen": -1.0604060888290405, "logps/rejected": -6.207077980041504, "loss": 1.1032, "odds_ratio_loss": 0.4282000958919525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10604061186313629, "rewards/margins": 0.5146671533584595, "rewards/rejected": -0.6207078099250793, "sft_loss": 1.0604060888290405, "step": 9830 }, { "epoch": 0.77, "grad_norm": 4.699453353881836, "learning_rate": 1.320612308013401e-06, "logits/chosen": -1.3086684942245483, "logits/rejected": -0.826088547706604, "logps/chosen": -1.4098154306411743, "logps/rejected": -5.700179576873779, "loss": 1.441, "odds_ratio_loss": 0.3116340637207031, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1409815400838852, "rewards/margins": 0.4290364682674408, "rewards/rejected": -0.5700180530548096, "sft_loss": 1.4098154306411743, "step": 9835 }, { "epoch": 0.77, "grad_norm": 5.1790289878845215, "learning_rate": 1.3164457177612566e-06, "logits/chosen": -1.2479835748672485, "logits/rejected": -0.7511214017868042, "logps/chosen": -0.8636773824691772, "logps/rejected": -13.867403030395508, "loss": 0.8773, "odds_ratio_loss": 0.13587191700935364, "rewards/accuracies": 1.0, "rewards/chosen": -0.08636773377656937, "rewards/margins": 1.3003724813461304, "rewards/rejected": -1.3867400884628296, "sft_loss": 0.8636773824691772, "step": 9840 }, { "epoch": 0.77, "grad_norm": 18.41057014465332, "learning_rate": 1.3122847140868617e-06, "logits/chosen": -1.3606140613555908, "logits/rejected": -0.8298721313476562, "logps/chosen": -0.8453086018562317, "logps/rejected": -5.990819454193115, "loss": 0.8473, "odds_ratio_loss": 0.019589338451623917, "rewards/accuracies": 1.0, "rewards/chosen": -0.08453086018562317, "rewards/margins": 0.5145511031150818, "rewards/rejected": -0.5990819334983826, "sft_loss": 0.8453086018562317, "step": 9845 }, { "epoch": 0.77, "grad_norm": 8.951942443847656, "learning_rate": 1.3081293033009107e-06, "logits/chosen": -1.218518614768982, "logits/rejected": -1.2447441816329956, "logps/chosen": -1.276637077331543, "logps/rejected": -4.123940944671631, "loss": 1.2844, "odds_ratio_loss": 0.0777917206287384, "rewards/accuracies": 1.0, "rewards/chosen": -0.12766370177268982, "rewards/margins": 0.28473037481307983, "rewards/rejected": -0.41239410638809204, "sft_loss": 1.276637077331543, "step": 9850 }, { "epoch": 0.77, "grad_norm": 8.449200630187988, "learning_rate": 1.3039794917056087e-06, "logits/chosen": -1.3235777616500854, "logits/rejected": -0.7842531800270081, "logps/chosen": -0.9570550918579102, "logps/rejected": -5.944942474365234, "loss": 0.9629, "odds_ratio_loss": 0.058135222643613815, "rewards/accuracies": 1.0, "rewards/chosen": -0.09570551663637161, "rewards/margins": 0.4987887442111969, "rewards/rejected": -0.5944942831993103, "sft_loss": 0.9570550918579102, "step": 9855 }, { "epoch": 0.77, "grad_norm": 23.442842483520508, "learning_rate": 1.2998352855946728e-06, "logits/chosen": -1.3478174209594727, "logits/rejected": -0.9334796071052551, "logps/chosen": -0.742106556892395, "logps/rejected": -5.195427894592285, "loss": 0.7551, "odds_ratio_loss": 0.1301083117723465, "rewards/accuracies": 1.0, "rewards/chosen": -0.07421065866947174, "rewards/margins": 0.4453321397304535, "rewards/rejected": -0.5195428133010864, "sft_loss": 0.742106556892395, "step": 9860 }, { "epoch": 0.77, "grad_norm": 12.772910118103027, "learning_rate": 1.2956966912533176e-06, "logits/chosen": -1.140475869178772, "logits/rejected": -1.1725482940673828, "logps/chosen": -1.062133550643921, "logps/rejected": -5.455204010009766, "loss": 1.1273, "odds_ratio_loss": 0.6513864398002625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10621335357427597, "rewards/margins": 0.4393070638179779, "rewards/rejected": -0.5455204248428345, "sft_loss": 1.062133550643921, "step": 9865 }, { "epoch": 0.77, "grad_norm": 18.566396713256836, "learning_rate": 1.2915637149582466e-06, "logits/chosen": -1.3273341655731201, "logits/rejected": -0.6339842081069946, "logps/chosen": -1.1666196584701538, "logps/rejected": -5.155702114105225, "loss": 1.1715, "odds_ratio_loss": 0.04836392030119896, "rewards/accuracies": 1.0, "rewards/chosen": -0.11666196584701538, "rewards/margins": 0.39890819787979126, "rewards/rejected": -0.5155701637268066, "sft_loss": 1.1666196584701538, "step": 9870 }, { "epoch": 0.77, "grad_norm": 27.102888107299805, "learning_rate": 1.2874363629776422e-06, "logits/chosen": -1.2952888011932373, "logits/rejected": -1.339074730873108, "logps/chosen": -0.6745896339416504, "logps/rejected": -4.707167148590088, "loss": 0.6864, "odds_ratio_loss": 0.11814513057470322, "rewards/accuracies": 1.0, "rewards/chosen": -0.06745896488428116, "rewards/margins": 0.4032577574253082, "rewards/rejected": -0.4707167148590088, "sft_loss": 0.6745896339416504, "step": 9875 }, { "epoch": 0.77, "grad_norm": 15.510187149047852, "learning_rate": 1.28331464157116e-06, "logits/chosen": -1.2012255191802979, "logits/rejected": -1.3116233348846436, "logps/chosen": -1.244822382926941, "logps/rejected": -5.981281280517578, "loss": 1.2535, "odds_ratio_loss": 0.08675719797611237, "rewards/accuracies": 1.0, "rewards/chosen": -0.12448225170373917, "rewards/margins": 0.4736458659172058, "rewards/rejected": -0.598128080368042, "sft_loss": 1.244822382926941, "step": 9880 }, { "epoch": 0.77, "grad_norm": 20.459211349487305, "learning_rate": 1.2791985569899124e-06, "logits/chosen": -1.1367995738983154, "logits/rejected": -1.5840421915054321, "logps/chosen": -1.248212218284607, "logps/rejected": -12.186239242553711, "loss": 1.2493, "odds_ratio_loss": 0.011205102317035198, "rewards/accuracies": 1.0, "rewards/chosen": -0.12482122331857681, "rewards/margins": 1.0938026905059814, "rewards/rejected": -1.2186239957809448, "sft_loss": 1.248212218284607, "step": 9885 }, { "epoch": 0.77, "grad_norm": 4.763285160064697, "learning_rate": 1.275088115476465e-06, "logits/chosen": -1.2230708599090576, "logits/rejected": -1.3235952854156494, "logps/chosen": -0.8943685293197632, "logps/rejected": -6.485651969909668, "loss": 0.8969, "odds_ratio_loss": 0.025648515671491623, "rewards/accuracies": 1.0, "rewards/chosen": -0.08943686634302139, "rewards/margins": 0.5591284036636353, "rewards/rejected": -0.6485652327537537, "sft_loss": 0.8943685293197632, "step": 9890 }, { "epoch": 0.77, "grad_norm": 49.72977828979492, "learning_rate": 1.2709833232648216e-06, "logits/chosen": -1.3660045862197876, "logits/rejected": -1.3602436780929565, "logps/chosen": -1.9368641376495361, "logps/rejected": -15.42323112487793, "loss": 1.9435, "odds_ratio_loss": 0.06626741588115692, "rewards/accuracies": 1.0, "rewards/chosen": -0.19368639588356018, "rewards/margins": 1.3486367464065552, "rewards/rejected": -1.542323112487793, "sft_loss": 1.9368641376495361, "step": 9895 }, { "epoch": 0.77, "grad_norm": 45.193084716796875, "learning_rate": 1.2668841865804248e-06, "logits/chosen": -1.2795097827911377, "logits/rejected": -1.0634839534759521, "logps/chosen": -0.7293432354927063, "logps/rejected": -10.748265266418457, "loss": 0.7317, "odds_ratio_loss": 0.0239731278270483, "rewards/accuracies": 1.0, "rewards/chosen": -0.07293432205915451, "rewards/margins": 1.0018922090530396, "rewards/rejected": -1.0748264789581299, "sft_loss": 0.7293432354927063, "step": 9900 }, { "epoch": 0.77, "grad_norm": 15.02663803100586, "learning_rate": 1.2627907116401338e-06, "logits/chosen": -1.3065115213394165, "logits/rejected": -0.8887646794319153, "logps/chosen": -1.0517328977584839, "logps/rejected": -7.9593825340271, "loss": 1.067, "odds_ratio_loss": 0.15316042304039001, "rewards/accuracies": 1.0, "rewards/chosen": -0.10517330467700958, "rewards/margins": 0.6907650232315063, "rewards/rejected": -0.7959383726119995, "sft_loss": 1.0517328977584839, "step": 9905 }, { "epoch": 0.77, "grad_norm": 13.276571273803711, "learning_rate": 1.258702904652223e-06, "logits/chosen": -1.419241189956665, "logits/rejected": -1.1413755416870117, "logps/chosen": -0.8657780885696411, "logps/rejected": -8.248200416564941, "loss": 0.8702, "odds_ratio_loss": 0.04419126361608505, "rewards/accuracies": 1.0, "rewards/chosen": -0.08657781034708023, "rewards/margins": 0.7382422089576721, "rewards/rejected": -0.8248200416564941, "sft_loss": 0.8657780885696411, "step": 9910 }, { "epoch": 0.77, "grad_norm": 6.825826168060303, "learning_rate": 1.2546207718163717e-06, "logits/chosen": -1.387042760848999, "logits/rejected": -0.8412164449691772, "logps/chosen": -1.1644020080566406, "logps/rejected": -4.54971981048584, "loss": 1.2089, "odds_ratio_loss": 0.44534358382225037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11644019931554794, "rewards/margins": 0.3385317921638489, "rewards/rejected": -0.45497196912765503, "sft_loss": 1.1644020080566406, "step": 9915 }, { "epoch": 0.77, "grad_norm": 6.1360907554626465, "learning_rate": 1.2505443193236512e-06, "logits/chosen": -1.4001635313034058, "logits/rejected": -1.162745714187622, "logps/chosen": -1.2425587177276611, "logps/rejected": -6.215406894683838, "loss": 1.3028, "odds_ratio_loss": 0.6023607850074768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12425585836172104, "rewards/margins": 0.49728482961654663, "rewards/rejected": -0.6215406656265259, "sft_loss": 1.2425587177276611, "step": 9920 }, { "epoch": 0.77, "grad_norm": 6.499194622039795, "learning_rate": 1.246473553356518e-06, "logits/chosen": -1.255040168762207, "logits/rejected": -1.2155425548553467, "logps/chosen": -1.216691017150879, "logps/rejected": -19.482616424560547, "loss": 1.2167, "odds_ratio_loss": 3.4083663194905967e-05, "rewards/accuracies": 1.0, "rewards/chosen": -0.12166911363601685, "rewards/margins": 1.8265924453735352, "rewards/rejected": -1.9482614994049072, "sft_loss": 1.216691017150879, "step": 9925 }, { "epoch": 0.77, "grad_norm": 13.034653663635254, "learning_rate": 1.2424084800888093e-06, "logits/chosen": -1.297929048538208, "logits/rejected": -0.9741071462631226, "logps/chosen": -0.9298331141471863, "logps/rejected": -5.6627936363220215, "loss": 0.9421, "odds_ratio_loss": 0.1228838711977005, "rewards/accuracies": 1.0, "rewards/chosen": -0.09298331290483475, "rewards/margins": 0.4732961058616638, "rewards/rejected": -0.5662793517112732, "sft_loss": 0.9298331141471863, "step": 9930 }, { "epoch": 0.77, "grad_norm": 50.6912727355957, "learning_rate": 1.2383491056857234e-06, "logits/chosen": -1.28719162940979, "logits/rejected": -0.7962328195571899, "logps/chosen": -1.011508584022522, "logps/rejected": -5.1037397384643555, "loss": 1.0171, "odds_ratio_loss": 0.05606383830308914, "rewards/accuracies": 1.0, "rewards/chosen": -0.10115084797143936, "rewards/margins": 0.40922316908836365, "rewards/rejected": -0.5103740096092224, "sft_loss": 1.011508584022522, "step": 9935 }, { "epoch": 0.77, "grad_norm": 15.886419296264648, "learning_rate": 1.2342954363038146e-06, "logits/chosen": -1.2996234893798828, "logits/rejected": -1.1742438077926636, "logps/chosen": -0.9803162813186646, "logps/rejected": -10.773435592651367, "loss": 0.9813, "odds_ratio_loss": 0.00980658270418644, "rewards/accuracies": 1.0, "rewards/chosen": -0.09803163260221481, "rewards/margins": 0.9793119430541992, "rewards/rejected": -1.0773435831069946, "sft_loss": 0.9803162813186646, "step": 9940 }, { "epoch": 0.77, "grad_norm": 10.295068740844727, "learning_rate": 1.2302474780909901e-06, "logits/chosen": -1.5229541063308716, "logits/rejected": -1.3465713262557983, "logps/chosen": -0.9372552633285522, "logps/rejected": -10.703648567199707, "loss": 0.9475, "odds_ratio_loss": 0.10272009670734406, "rewards/accuracies": 1.0, "rewards/chosen": -0.0937255322933197, "rewards/margins": 0.9766393899917603, "rewards/rejected": -1.0703647136688232, "sft_loss": 0.9372552633285522, "step": 9945 }, { "epoch": 0.77, "grad_norm": 5.413816928863525, "learning_rate": 1.2262052371864924e-06, "logits/chosen": -1.3776428699493408, "logits/rejected": -0.8330678939819336, "logps/chosen": -0.7698219418525696, "logps/rejected": -6.723541259765625, "loss": 0.773, "odds_ratio_loss": 0.032238125801086426, "rewards/accuracies": 1.0, "rewards/chosen": -0.07698218524456024, "rewards/margins": 0.5953719615936279, "rewards/rejected": -0.6723541021347046, "sft_loss": 0.7698219418525696, "step": 9950 }, { "epoch": 0.77, "grad_norm": 27.832286834716797, "learning_rate": 1.2221687197208914e-06, "logits/chosen": -1.1848442554473877, "logits/rejected": -1.1238027811050415, "logps/chosen": -0.8974050283432007, "logps/rejected": -1.6427028179168701, "loss": 0.9332, "odds_ratio_loss": 0.3581832945346832, "rewards/accuracies": 1.0, "rewards/chosen": -0.08974049985408783, "rewards/margins": 0.07452978938817978, "rewards/rejected": -0.1642702966928482, "sft_loss": 0.8974050283432007, "step": 9955 }, { "epoch": 0.77, "grad_norm": 15.635947227478027, "learning_rate": 1.218137931816078e-06, "logits/chosen": -1.4109541177749634, "logits/rejected": -1.072914958000183, "logps/chosen": -0.7615107297897339, "logps/rejected": -7.309935092926025, "loss": 0.7773, "odds_ratio_loss": 0.15754522383213043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07615108042955399, "rewards/margins": 0.6548423767089844, "rewards/rejected": -0.7309934496879578, "sft_loss": 0.7615107297897339, "step": 9960 }, { "epoch": 0.78, "grad_norm": 5.612651348114014, "learning_rate": 1.2141128795852563e-06, "logits/chosen": -1.4222948551177979, "logits/rejected": -1.231484293937683, "logps/chosen": -0.7754429578781128, "logps/rejected": -10.327539443969727, "loss": 0.7877, "odds_ratio_loss": 0.12259682267904282, "rewards/accuracies": 1.0, "rewards/chosen": -0.07754429429769516, "rewards/margins": 0.9552095532417297, "rewards/rejected": -1.032753825187683, "sft_loss": 0.7754429578781128, "step": 9965 }, { "epoch": 0.78, "grad_norm": 5.235637187957764, "learning_rate": 1.210093569132928e-06, "logits/chosen": -1.2951066493988037, "logits/rejected": -0.5975306630134583, "logps/chosen": -0.9870067834854126, "logps/rejected": -8.021510124206543, "loss": 1.0151, "odds_ratio_loss": 0.281358927488327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09870067983865738, "rewards/margins": 0.7034503221511841, "rewards/rejected": -0.8021510243415833, "sft_loss": 0.9870067834854126, "step": 9970 }, { "epoch": 0.78, "grad_norm": 36.680965423583984, "learning_rate": 1.2060800065548867e-06, "logits/chosen": -1.3343515396118164, "logits/rejected": -1.5229475498199463, "logps/chosen": -0.8048030734062195, "logps/rejected": -8.215921401977539, "loss": 0.8068, "odds_ratio_loss": 0.01964385434985161, "rewards/accuracies": 1.0, "rewards/chosen": -0.08048031479120255, "rewards/margins": 0.7411118745803833, "rewards/rejected": -0.8215921521186829, "sft_loss": 0.8048030734062195, "step": 9975 }, { "epoch": 0.78, "grad_norm": 24.134246826171875, "learning_rate": 1.2020721979382111e-06, "logits/chosen": -1.3110750913619995, "logits/rejected": -1.1490697860717773, "logps/chosen": -1.087066888809204, "logps/rejected": -4.987780570983887, "loss": 1.1119, "odds_ratio_loss": 0.24879589676856995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10870669037103653, "rewards/margins": 0.39007139205932617, "rewards/rejected": -0.4987780451774597, "sft_loss": 1.087066888809204, "step": 9980 }, { "epoch": 0.78, "grad_norm": 8.332462310791016, "learning_rate": 1.1980701493612507e-06, "logits/chosen": -1.330665111541748, "logits/rejected": -0.6037562489509583, "logps/chosen": -0.8439868688583374, "logps/rejected": -10.712756156921387, "loss": 0.8541, "odds_ratio_loss": 0.10095206648111343, "rewards/accuracies": 1.0, "rewards/chosen": -0.08439868688583374, "rewards/margins": 0.9868769645690918, "rewards/rejected": -1.0712755918502808, "sft_loss": 0.8439868688583374, "step": 9985 }, { "epoch": 0.78, "grad_norm": 21.133403778076172, "learning_rate": 1.1940738668936187e-06, "logits/chosen": -1.2949966192245483, "logits/rejected": -0.9187256097793579, "logps/chosen": -1.0978871583938599, "logps/rejected": -7.6196160316467285, "loss": 1.1242, "odds_ratio_loss": 0.26295655965805054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10978871583938599, "rewards/margins": 0.652172863483429, "rewards/rejected": -0.7619615793228149, "sft_loss": 1.0978871583938599, "step": 9990 }, { "epoch": 0.78, "grad_norm": 9.91594123840332, "learning_rate": 1.1900833565961888e-06, "logits/chosen": -1.3776895999908447, "logits/rejected": -0.8226616978645325, "logps/chosen": -1.0611966848373413, "logps/rejected": -5.255466938018799, "loss": 1.0877, "odds_ratio_loss": 0.26494020223617554, "rewards/accuracies": 1.0, "rewards/chosen": -0.10611967742443085, "rewards/margins": 0.4194270670413971, "rewards/rejected": -0.5255467891693115, "sft_loss": 1.0611966848373413, "step": 9995 }, { "epoch": 0.78, "grad_norm": 11.703951835632324, "learning_rate": 1.1860986245210742e-06, "logits/chosen": -1.4472962617874146, "logits/rejected": -1.218927264213562, "logps/chosen": -0.932456374168396, "logps/rejected": -5.315445423126221, "loss": 0.9578, "odds_ratio_loss": 0.25324195623397827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09324564039707184, "rewards/margins": 0.43829888105392456, "rewards/rejected": -0.53154456615448, "sft_loss": 0.932456374168396, "step": 10000 }, { "epoch": 0.78, "grad_norm": 7.178709506988525, "learning_rate": 1.1821196767116272e-06, "logits/chosen": -1.3587113618850708, "logits/rejected": -1.2883832454681396, "logps/chosen": -1.508227825164795, "logps/rejected": -12.98760986328125, "loss": 1.5083, "odds_ratio_loss": 0.0008086395682767034, "rewards/accuracies": 1.0, "rewards/chosen": -0.15082278847694397, "rewards/margins": 1.1479381322860718, "rewards/rejected": -1.2987611293792725, "sft_loss": 1.508227825164795, "step": 10005 }, { "epoch": 0.78, "grad_norm": 15.75374984741211, "learning_rate": 1.1781465192024266e-06, "logits/chosen": -1.378434419631958, "logits/rejected": -1.251649260520935, "logps/chosen": -0.8725560307502747, "logps/rejected": -8.043733596801758, "loss": 0.8776, "odds_ratio_loss": 0.05054037645459175, "rewards/accuracies": 1.0, "rewards/chosen": -0.08725561201572418, "rewards/margins": 0.7171179056167603, "rewards/rejected": -0.8043734431266785, "sft_loss": 0.8725560307502747, "step": 10010 }, { "epoch": 0.78, "grad_norm": 493.7987060546875, "learning_rate": 1.1741791580192718e-06, "logits/chosen": -1.4295212030410767, "logits/rejected": -1.1595797538757324, "logps/chosen": -1.3225281238555908, "logps/rejected": -3.852818727493286, "loss": 1.3473, "odds_ratio_loss": 0.24733905494213104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13225281238555908, "rewards/margins": 0.2530290484428406, "rewards/rejected": -0.38528189063072205, "sft_loss": 1.3225281238555908, "step": 10015 }, { "epoch": 0.78, "grad_norm": 5.6853814125061035, "learning_rate": 1.1702175991791693e-06, "logits/chosen": -1.3617594242095947, "logits/rejected": -1.0348384380340576, "logps/chosen": -0.7079185247421265, "logps/rejected": -8.361276626586914, "loss": 0.7145, "odds_ratio_loss": 0.06607773154973984, "rewards/accuracies": 1.0, "rewards/chosen": -0.07079185545444489, "rewards/margins": 0.765335738658905, "rewards/rejected": -0.8361276388168335, "sft_loss": 0.7079185247421265, "step": 10020 }, { "epoch": 0.78, "grad_norm": 18.461105346679688, "learning_rate": 1.166261848690326e-06, "logits/chosen": -0.9604610204696655, "logits/rejected": -1.4066376686096191, "logps/chosen": -0.6313884854316711, "logps/rejected": -10.4922513961792, "loss": 0.6316, "odds_ratio_loss": 0.0021018588449805975, "rewards/accuracies": 1.0, "rewards/chosen": -0.06313885003328323, "rewards/margins": 0.9860862493515015, "rewards/rejected": -1.049225091934204, "sft_loss": 0.6313884854316711, "step": 10025 }, { "epoch": 0.78, "grad_norm": 37.55699920654297, "learning_rate": 1.1623119125521394e-06, "logits/chosen": -1.3622324466705322, "logits/rejected": -1.175256609916687, "logps/chosen": -1.1930874586105347, "logps/rejected": -10.830957412719727, "loss": 1.1946, "odds_ratio_loss": 0.015017673373222351, "rewards/accuracies": 1.0, "rewards/chosen": -0.11930874735116959, "rewards/margins": 0.9637869596481323, "rewards/rejected": -1.0830957889556885, "sft_loss": 1.1930874586105347, "step": 10030 }, { "epoch": 0.78, "grad_norm": 25.23794174194336, "learning_rate": 1.1583677967551888e-06, "logits/chosen": -1.375732421875, "logits/rejected": -1.0804195404052734, "logps/chosen": -1.0573394298553467, "logps/rejected": -5.16109561920166, "loss": 1.0634, "odds_ratio_loss": 0.060222726315259933, "rewards/accuracies": 1.0, "rewards/chosen": -0.1057339459657669, "rewards/margins": 0.41037559509277344, "rewards/rejected": -0.5161095261573792, "sft_loss": 1.0573394298553467, "step": 10035 }, { "epoch": 0.78, "grad_norm": 35.429866790771484, "learning_rate": 1.154429507281226e-06, "logits/chosen": -1.3516714572906494, "logits/rejected": -1.1342250108718872, "logps/chosen": -1.106170892715454, "logps/rejected": -4.331687927246094, "loss": 1.1394, "odds_ratio_loss": 0.33246272802352905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11061708629131317, "rewards/margins": 0.3225516974925995, "rewards/rejected": -0.43316879868507385, "sft_loss": 1.106170892715454, "step": 10040 }, { "epoch": 0.78, "grad_norm": 241.47119140625, "learning_rate": 1.1504970501031692e-06, "logits/chosen": -1.3227497339248657, "logits/rejected": -1.1068919897079468, "logps/chosen": -1.4671138525009155, "logps/rejected": -7.351625919342041, "loss": 1.487, "odds_ratio_loss": 0.19894203543663025, "rewards/accuracies": 1.0, "rewards/chosen": -0.14671137928962708, "rewards/margins": 0.5884512662887573, "rewards/rejected": -0.735162615776062, "sft_loss": 1.4671138525009155, "step": 10045 }, { "epoch": 0.78, "grad_norm": 29.387950897216797, "learning_rate": 1.1465704311850883e-06, "logits/chosen": -1.4164648056030273, "logits/rejected": -1.2835915088653564, "logps/chosen": -0.9282344579696655, "logps/rejected": -5.288893222808838, "loss": 0.9465, "odds_ratio_loss": 0.18245641887187958, "rewards/accuracies": 1.0, "rewards/chosen": -0.09282345324754715, "rewards/margins": 0.4360658526420593, "rewards/rejected": -0.5288892984390259, "sft_loss": 0.9282344579696655, "step": 10050 }, { "epoch": 0.78, "grad_norm": 6.3994903564453125, "learning_rate": 1.1426496564821976e-06, "logits/chosen": -1.3634178638458252, "logits/rejected": -0.6244993805885315, "logps/chosen": -0.9844557642936707, "logps/rejected": -5.556734085083008, "loss": 1.0037, "odds_ratio_loss": 0.19292011857032776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0984455794095993, "rewards/margins": 0.457227885723114, "rewards/rejected": -0.5556734800338745, "sft_loss": 0.9844557642936707, "step": 10055 }, { "epoch": 0.78, "grad_norm": 13.108819961547852, "learning_rate": 1.138734731940852e-06, "logits/chosen": -1.5008776187896729, "logits/rejected": -1.1382453441619873, "logps/chosen": -0.7119458317756653, "logps/rejected": -8.66386604309082, "loss": 0.714, "odds_ratio_loss": 0.02025299333035946, "rewards/accuracies": 1.0, "rewards/chosen": -0.071194589138031, "rewards/margins": 0.7951920628547668, "rewards/rejected": -0.8663867115974426, "sft_loss": 0.7119458317756653, "step": 10060 }, { "epoch": 0.78, "grad_norm": 7.2096076011657715, "learning_rate": 1.1348256634985311e-06, "logits/chosen": -1.3829349279403687, "logits/rejected": -1.0344932079315186, "logps/chosen": -0.9017425775527954, "logps/rejected": -3.416379928588867, "loss": 0.9245, "odds_ratio_loss": 0.22740647196769714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09017425775527954, "rewards/margins": 0.25146371126174927, "rewards/rejected": -0.3416379988193512, "sft_loss": 0.9017425775527954, "step": 10065 }, { "epoch": 0.78, "grad_norm": 14.399897575378418, "learning_rate": 1.1309224570838335e-06, "logits/chosen": -1.0961267948150635, "logits/rejected": -0.7845171093940735, "logps/chosen": -0.9008132815361023, "logps/rejected": -8.158656120300293, "loss": 0.9079, "odds_ratio_loss": 0.0713641345500946, "rewards/accuracies": 1.0, "rewards/chosen": -0.09008133411407471, "rewards/margins": 0.7257843017578125, "rewards/rejected": -0.8158656358718872, "sft_loss": 0.9008132815361023, "step": 10070 }, { "epoch": 0.78, "grad_norm": 5.566354751586914, "learning_rate": 1.1270251186164649e-06, "logits/chosen": -1.2573572397232056, "logits/rejected": -1.0662510395050049, "logps/chosen": -0.6841157674789429, "logps/rejected": -8.32490348815918, "loss": 0.6947, "odds_ratio_loss": 0.10547639429569244, "rewards/accuracies": 1.0, "rewards/chosen": -0.06841157376766205, "rewards/margins": 0.7640787363052368, "rewards/rejected": -0.8324903249740601, "sft_loss": 0.6841157674789429, "step": 10075 }, { "epoch": 0.78, "grad_norm": 43.7618293762207, "learning_rate": 1.1231336540072379e-06, "logits/chosen": -1.2308496236801147, "logits/rejected": -1.3328073024749756, "logps/chosen": -0.7990642786026001, "logps/rejected": -6.931474208831787, "loss": 0.825, "odds_ratio_loss": 0.2591875493526459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07990642637014389, "rewards/margins": 0.6132410168647766, "rewards/rejected": -0.6931473612785339, "sft_loss": 0.7990642786026001, "step": 10080 }, { "epoch": 0.78, "grad_norm": 19.308300018310547, "learning_rate": 1.1192480691580504e-06, "logits/chosen": -1.3502826690673828, "logits/rejected": -1.0207679271697998, "logps/chosen": -1.1335891485214233, "logps/rejected": -11.743545532226562, "loss": 1.1475, "odds_ratio_loss": 0.13940632343292236, "rewards/accuracies": 1.0, "rewards/chosen": -0.11335892975330353, "rewards/margins": 1.0609955787658691, "rewards/rejected": -1.1743545532226562, "sft_loss": 1.1335891485214233, "step": 10085 }, { "epoch": 0.78, "grad_norm": 5.187989711761475, "learning_rate": 1.1153683699618856e-06, "logits/chosen": -1.367068886756897, "logits/rejected": -1.1009019613265991, "logps/chosen": -1.4418981075286865, "logps/rejected": -7.871354579925537, "loss": 1.4518, "odds_ratio_loss": 0.09873579442501068, "rewards/accuracies": 1.0, "rewards/chosen": -0.14418981969356537, "rewards/margins": 0.6429456472396851, "rewards/rejected": -0.7871354222297668, "sft_loss": 1.4418981075286865, "step": 10090 }, { "epoch": 0.79, "grad_norm": 5.7330827713012695, "learning_rate": 1.1114945623027995e-06, "logits/chosen": -1.1983908414840698, "logits/rejected": -0.9033523797988892, "logps/chosen": -0.9997088313102722, "logps/rejected": -4.750739097595215, "loss": 1.014, "odds_ratio_loss": 0.14291249215602875, "rewards/accuracies": 1.0, "rewards/chosen": -0.09997088462114334, "rewards/margins": 0.37510305643081665, "rewards/rejected": -0.4750739634037018, "sft_loss": 0.9997088313102722, "step": 10095 }, { "epoch": 0.79, "grad_norm": 9.244364738464355, "learning_rate": 1.1076266520559136e-06, "logits/chosen": -1.4069405794143677, "logits/rejected": -0.9098762273788452, "logps/chosen": -0.9030729532241821, "logps/rejected": -6.667893409729004, "loss": 0.9154, "odds_ratio_loss": 0.12360890209674835, "rewards/accuracies": 1.0, "rewards/chosen": -0.09030728787183762, "rewards/margins": 0.5764819979667664, "rewards/rejected": -0.6667893528938293, "sft_loss": 0.9030729532241821, "step": 10100 }, { "epoch": 0.79, "grad_norm": 12.363853454589844, "learning_rate": 1.103764645087404e-06, "logits/chosen": -1.4370672702789307, "logits/rejected": -0.8995648622512817, "logps/chosen": -0.7316851615905762, "logps/rejected": -6.912254333496094, "loss": 0.7401, "odds_ratio_loss": 0.08367065340280533, "rewards/accuracies": 1.0, "rewards/chosen": -0.07316852360963821, "rewards/margins": 0.6180568933486938, "rewards/rejected": -0.6912254095077515, "sft_loss": 0.7316851615905762, "step": 10105 }, { "epoch": 0.79, "grad_norm": 15.516923904418945, "learning_rate": 1.0999085472544962e-06, "logits/chosen": -1.2609612941741943, "logits/rejected": -1.231152892112732, "logps/chosen": -0.884219765663147, "logps/rejected": -5.064759254455566, "loss": 0.9159, "odds_ratio_loss": 0.3166887164115906, "rewards/accuracies": 1.0, "rewards/chosen": -0.08842197060585022, "rewards/margins": 0.41805392503738403, "rewards/rejected": -0.5064759254455566, "sft_loss": 0.884219765663147, "step": 10110 }, { "epoch": 0.79, "grad_norm": 3.7335870265960693, "learning_rate": 1.0960583644054517e-06, "logits/chosen": -1.3824520111083984, "logits/rejected": -0.9480381011962891, "logps/chosen": -0.9319052696228027, "logps/rejected": -9.209867477416992, "loss": 0.9413, "odds_ratio_loss": 0.09413303434848785, "rewards/accuracies": 1.0, "rewards/chosen": -0.09319053590297699, "rewards/margins": 0.8277961611747742, "rewards/rejected": -0.9209867715835571, "sft_loss": 0.9319052696228027, "step": 10115 }, { "epoch": 0.79, "grad_norm": 5.4312567710876465, "learning_rate": 1.0922141023795601e-06, "logits/chosen": -1.3167814016342163, "logits/rejected": -0.8333790898323059, "logps/chosen": -0.9882022142410278, "logps/rejected": -9.491655349731445, "loss": 1.0609, "odds_ratio_loss": 0.7272026538848877, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09882022440433502, "rewards/margins": 0.8503454327583313, "rewards/rejected": -0.9491655230522156, "sft_loss": 0.9882022142410278, "step": 10120 }, { "epoch": 0.79, "grad_norm": 20.70174789428711, "learning_rate": 1.0883757670071355e-06, "logits/chosen": -1.2962617874145508, "logits/rejected": -1.2887189388275146, "logps/chosen": -0.810468852519989, "logps/rejected": -11.951204299926758, "loss": 0.8113, "odds_ratio_loss": 0.008780966512858868, "rewards/accuracies": 1.0, "rewards/chosen": -0.08104689419269562, "rewards/margins": 1.1140735149383545, "rewards/rejected": -1.1951204538345337, "sft_loss": 0.810468852519989, "step": 10125 }, { "epoch": 0.79, "grad_norm": 5.827946186065674, "learning_rate": 1.0845433641094988e-06, "logits/chosen": -1.345948576927185, "logits/rejected": -1.1523609161376953, "logps/chosen": -0.773142397403717, "logps/rejected": -8.959914207458496, "loss": 0.7836, "odds_ratio_loss": 0.10436198860406876, "rewards/accuracies": 1.0, "rewards/chosen": -0.07731424272060394, "rewards/margins": 0.8186771273612976, "rewards/rejected": -0.8959914445877075, "sft_loss": 0.773142397403717, "step": 10130 }, { "epoch": 0.79, "grad_norm": 38.608524322509766, "learning_rate": 1.0807168994989764e-06, "logits/chosen": -1.3648837804794312, "logits/rejected": -1.3312628269195557, "logps/chosen": -0.994695782661438, "logps/rejected": -11.334436416625977, "loss": 1.0198, "odds_ratio_loss": 0.25126659870147705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09946957975625992, "rewards/margins": 1.033974051475525, "rewards/rejected": -1.1334435939788818, "sft_loss": 0.994695782661438, "step": 10135 }, { "epoch": 0.79, "grad_norm": 137.97048950195312, "learning_rate": 1.0768963789788878e-06, "logits/chosen": -1.290567398071289, "logits/rejected": -0.9734798669815063, "logps/chosen": -1.486511468887329, "logps/rejected": -13.96313762664795, "loss": 1.4877, "odds_ratio_loss": 0.012327526696026325, "rewards/accuracies": 1.0, "rewards/chosen": -0.1486511528491974, "rewards/margins": 1.2476626634597778, "rewards/rejected": -1.3963139057159424, "sft_loss": 1.486511468887329, "step": 10140 }, { "epoch": 0.79, "grad_norm": 4.473769664764404, "learning_rate": 1.0730818083435369e-06, "logits/chosen": -1.4110952615737915, "logits/rejected": -0.8735581636428833, "logps/chosen": -1.2078263759613037, "logps/rejected": -12.34583854675293, "loss": 1.2174, "odds_ratio_loss": 0.09589570015668869, "rewards/accuracies": 1.0, "rewards/chosen": -0.12078263610601425, "rewards/margins": 1.1138012409210205, "rewards/rejected": -1.2345839738845825, "sft_loss": 1.2078263759613037, "step": 10145 }, { "epoch": 0.79, "grad_norm": 300.1793212890625, "learning_rate": 1.0692731933782046e-06, "logits/chosen": -1.379407286643982, "logits/rejected": -1.7840709686279297, "logps/chosen": -1.0849478244781494, "logps/rejected": -12.135259628295898, "loss": 1.0985, "odds_ratio_loss": 0.13510160148143768, "rewards/accuracies": 1.0, "rewards/chosen": -0.10849478095769882, "rewards/margins": 1.1050312519073486, "rewards/rejected": -1.2135260105133057, "sft_loss": 1.0849478244781494, "step": 10150 }, { "epoch": 0.79, "grad_norm": 8.062394142150879, "learning_rate": 1.0654705398591374e-06, "logits/chosen": -1.3931306600570679, "logits/rejected": -1.3168301582336426, "logps/chosen": -0.9894862174987793, "logps/rejected": -9.375204086303711, "loss": 0.9929, "odds_ratio_loss": 0.0342748686671257, "rewards/accuracies": 1.0, "rewards/chosen": -0.09894861280918121, "rewards/margins": 0.8385717272758484, "rewards/rejected": -0.9375203847885132, "sft_loss": 0.9894862174987793, "step": 10155 }, { "epoch": 0.79, "grad_norm": 12.118971824645996, "learning_rate": 1.0616738535535458e-06, "logits/chosen": -1.1479527950286865, "logits/rejected": -1.2802600860595703, "logps/chosen": -1.2164061069488525, "logps/rejected": -4.142562389373779, "loss": 1.2229, "odds_ratio_loss": 0.06503340601921082, "rewards/accuracies": 1.0, "rewards/chosen": -0.12164060771465302, "rewards/margins": 0.29261571168899536, "rewards/rejected": -0.4142562747001648, "sft_loss": 1.2164061069488525, "step": 10160 }, { "epoch": 0.79, "grad_norm": 27.138954162597656, "learning_rate": 1.0578831402195843e-06, "logits/chosen": -1.1534265279769897, "logits/rejected": -0.9197772741317749, "logps/chosen": -1.3329823017120361, "logps/rejected": -8.715575218200684, "loss": 1.3445, "odds_ratio_loss": 0.1155182346701622, "rewards/accuracies": 1.0, "rewards/chosen": -0.13329823315143585, "rewards/margins": 0.7382593750953674, "rewards/rejected": -0.8715575337409973, "sft_loss": 1.3329823017120361, "step": 10165 }, { "epoch": 0.79, "grad_norm": 6.607047080993652, "learning_rate": 1.0540984056063503e-06, "logits/chosen": -1.4123284816741943, "logits/rejected": -1.1352747678756714, "logps/chosen": -1.1117833852767944, "logps/rejected": -6.28924036026001, "loss": 1.1148, "odds_ratio_loss": 0.030421212315559387, "rewards/accuracies": 1.0, "rewards/chosen": -0.11117835342884064, "rewards/margins": 0.5177456736564636, "rewards/rejected": -0.6289240121841431, "sft_loss": 1.1117833852767944, "step": 10170 }, { "epoch": 0.79, "grad_norm": 4.502261161804199, "learning_rate": 1.0503196554538764e-06, "logits/chosen": -1.390334129333496, "logits/rejected": -0.9321194887161255, "logps/chosen": -0.9069440960884094, "logps/rejected": -8.51655387878418, "loss": 0.9154, "odds_ratio_loss": 0.08483470976352692, "rewards/accuracies": 1.0, "rewards/chosen": -0.09069441258907318, "rewards/margins": 0.76096111536026, "rewards/rejected": -0.8516554832458496, "sft_loss": 0.9069440960884094, "step": 10175 }, { "epoch": 0.79, "grad_norm": 13.18431568145752, "learning_rate": 1.0465468954931157e-06, "logits/chosen": -1.3280055522918701, "logits/rejected": -1.405785322189331, "logps/chosen": -0.9179110527038574, "logps/rejected": -7.704813480377197, "loss": 0.9182, "odds_ratio_loss": 0.0026637546252459288, "rewards/accuracies": 1.0, "rewards/chosen": -0.09179110825061798, "rewards/margins": 0.6786901950836182, "rewards/rejected": -0.7704813480377197, "sft_loss": 0.9179110527038574, "step": 10180 }, { "epoch": 0.79, "grad_norm": 5.563874244689941, "learning_rate": 1.0427801314459375e-06, "logits/chosen": -1.3925745487213135, "logits/rejected": -0.7530353665351868, "logps/chosen": -1.2764919996261597, "logps/rejected": -3.7269034385681152, "loss": 1.2981, "odds_ratio_loss": 0.21560311317443848, "rewards/accuracies": 1.0, "rewards/chosen": -0.1276492029428482, "rewards/margins": 0.245041161775589, "rewards/rejected": -0.3726903796195984, "sft_loss": 1.2764919996261597, "step": 10185 }, { "epoch": 0.79, "grad_norm": 7.671252250671387, "learning_rate": 1.0390193690251187e-06, "logits/chosen": -1.3380212783813477, "logits/rejected": -0.934618353843689, "logps/chosen": -1.2179018259048462, "logps/rejected": -7.096657752990723, "loss": 1.2647, "odds_ratio_loss": 0.4683496356010437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12179018557071686, "rewards/margins": 0.587875485420227, "rewards/rejected": -0.7096656560897827, "sft_loss": 1.2179018259048462, "step": 10190 }, { "epoch": 0.79, "grad_norm": 17.141416549682617, "learning_rate": 1.0352646139343325e-06, "logits/chosen": -1.3692445755004883, "logits/rejected": -1.280906319618225, "logps/chosen": -0.9419307708740234, "logps/rejected": -8.827807426452637, "loss": 0.9597, "odds_ratio_loss": 0.17793647944927216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09419308602809906, "rewards/margins": 0.7885876893997192, "rewards/rejected": -0.8827807307243347, "sft_loss": 0.9419307708740234, "step": 10195 }, { "epoch": 0.79, "grad_norm": 100.93508911132812, "learning_rate": 1.0315158718681417e-06, "logits/chosen": -1.163964867591858, "logits/rejected": -1.1963056325912476, "logps/chosen": -1.1907155513763428, "logps/rejected": -3.5567939281463623, "loss": 1.2565, "odds_ratio_loss": 0.657985270023346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11907155811786652, "rewards/margins": 0.23660783469676971, "rewards/rejected": -0.3556794226169586, "sft_loss": 1.1907155513763428, "step": 10200 }, { "epoch": 0.79, "grad_norm": 8.81802749633789, "learning_rate": 1.0277731485119903e-06, "logits/chosen": -1.4088776111602783, "logits/rejected": -1.4885118007659912, "logps/chosen": -0.9907560348510742, "logps/rejected": -11.762723922729492, "loss": 0.9908, "odds_ratio_loss": 0.0005440299864858389, "rewards/accuracies": 1.0, "rewards/chosen": -0.09907560795545578, "rewards/margins": 1.0771968364715576, "rewards/rejected": -1.1762722730636597, "sft_loss": 0.9907560348510742, "step": 10205 }, { "epoch": 0.79, "grad_norm": 9.657306671142578, "learning_rate": 1.0240364495421918e-06, "logits/chosen": -1.350857138633728, "logits/rejected": -1.3532549142837524, "logps/chosen": -1.101911187171936, "logps/rejected": -7.677412986755371, "loss": 1.1276, "odds_ratio_loss": 0.2564184069633484, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11019112914800644, "rewards/margins": 0.6575502157211304, "rewards/rejected": -0.767741322517395, "sft_loss": 1.101911187171936, "step": 10210 }, { "epoch": 0.79, "grad_norm": 10.653190612792969, "learning_rate": 1.0203057806259264e-06, "logits/chosen": -1.3104242086410522, "logits/rejected": -1.1077172756195068, "logps/chosen": -0.6702635884284973, "logps/rejected": -2.555192708969116, "loss": 0.6963, "odds_ratio_loss": 0.25997036695480347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06702636182308197, "rewards/margins": 0.18849292397499084, "rewards/rejected": -0.255519300699234, "sft_loss": 0.6702635884284973, "step": 10215 }, { "epoch": 0.8, "grad_norm": 85.22171783447266, "learning_rate": 1.0165811474212244e-06, "logits/chosen": -1.3750183582305908, "logits/rejected": -1.0446151494979858, "logps/chosen": -1.0259478092193604, "logps/rejected": -4.301419258117676, "loss": 1.0495, "odds_ratio_loss": 0.2350333034992218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1025947779417038, "rewards/margins": 0.32754719257354736, "rewards/rejected": -0.43014198541641235, "sft_loss": 1.0259478092193604, "step": 10220 }, { "epoch": 0.8, "grad_norm": 89.82392883300781, "learning_rate": 1.0128625555769682e-06, "logits/chosen": -1.25468909740448, "logits/rejected": -1.4248607158660889, "logps/chosen": -1.087563157081604, "logps/rejected": -6.856781005859375, "loss": 1.1295, "odds_ratio_loss": 0.41986173391342163, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10875630378723145, "rewards/margins": 0.5769217610359192, "rewards/rejected": -0.6856781244277954, "sft_loss": 1.087563157081604, "step": 10225 }, { "epoch": 0.8, "grad_norm": 22.732999801635742, "learning_rate": 1.0091500107328734e-06, "logits/chosen": -1.1168807744979858, "logits/rejected": -1.1486178636550903, "logps/chosen": -0.9088427424430847, "logps/rejected": -10.493136405944824, "loss": 0.917, "odds_ratio_loss": 0.08161326497793198, "rewards/accuracies": 1.0, "rewards/chosen": -0.09088428318500519, "rewards/margins": 0.9584293365478516, "rewards/rejected": -1.0493135452270508, "sft_loss": 0.9088427424430847, "step": 10230 }, { "epoch": 0.8, "grad_norm": 244.6911163330078, "learning_rate": 1.0054435185194845e-06, "logits/chosen": -1.324135184288025, "logits/rejected": -1.189325213432312, "logps/chosen": -1.0762240886688232, "logps/rejected": -7.894415378570557, "loss": 1.0774, "odds_ratio_loss": 0.012069101445376873, "rewards/accuracies": 1.0, "rewards/chosen": -0.1076224222779274, "rewards/margins": 0.6818190813064575, "rewards/rejected": -0.7894414663314819, "sft_loss": 1.0762240886688232, "step": 10235 }, { "epoch": 0.8, "grad_norm": 9.024249076843262, "learning_rate": 1.0017430845581688e-06, "logits/chosen": -1.2113773822784424, "logits/rejected": -1.0507694482803345, "logps/chosen": -2.6475517749786377, "logps/rejected": -8.984169006347656, "loss": 2.7514, "odds_ratio_loss": 1.038090467453003, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2647551894187927, "rewards/margins": 0.6336617469787598, "rewards/rejected": -0.8984168171882629, "sft_loss": 2.6475517749786377, "step": 10240 }, { "epoch": 0.8, "grad_norm": 16.64531707763672, "learning_rate": 9.980487144611045e-07, "logits/chosen": -1.3650559186935425, "logits/rejected": -1.6216930150985718, "logps/chosen": -1.3417785167694092, "logps/rejected": -16.168201446533203, "loss": 1.3418, "odds_ratio_loss": 0.00013019000471103936, "rewards/accuracies": 1.0, "rewards/chosen": -0.13417786359786987, "rewards/margins": 1.482642412185669, "rewards/rejected": -1.6168200969696045, "sft_loss": 1.3417785167694092, "step": 10245 }, { "epoch": 0.8, "grad_norm": 6.388486385345459, "learning_rate": 9.943604138312725e-07, "logits/chosen": -1.3851463794708252, "logits/rejected": -0.7398214340209961, "logps/chosen": -1.0597431659698486, "logps/rejected": -8.64242172241211, "loss": 1.1022, "odds_ratio_loss": 0.4248722195625305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10597433149814606, "rewards/margins": 0.7582677602767944, "rewards/rejected": -0.8642421960830688, "sft_loss": 1.0597431659698486, "step": 10250 }, { "epoch": 0.8, "grad_norm": 13.157958984375, "learning_rate": 9.906781882624483e-07, "logits/chosen": -1.1372522115707397, "logits/rejected": -1.2602009773254395, "logps/chosen": -1.0265510082244873, "logps/rejected": -6.398252487182617, "loss": 1.0279, "odds_ratio_loss": 0.013952387496829033, "rewards/accuracies": 1.0, "rewards/chosen": -0.10265511274337769, "rewards/margins": 0.5371701121330261, "rewards/rejected": -0.6398252248764038, "sft_loss": 1.0265510082244873, "step": 10255 }, { "epoch": 0.8, "grad_norm": 4.697621822357178, "learning_rate": 9.870020433391947e-07, "logits/chosen": -1.2178454399108887, "logits/rejected": -1.1374971866607666, "logps/chosen": -0.9389911890029907, "logps/rejected": -5.738818168640137, "loss": 0.9426, "odds_ratio_loss": 0.0358780138194561, "rewards/accuracies": 1.0, "rewards/chosen": -0.09389911592006683, "rewards/margins": 0.47998276352882385, "rewards/rejected": -0.5738819241523743, "sft_loss": 0.9389911890029907, "step": 10260 }, { "epoch": 0.8, "grad_norm": 124.55999755859375, "learning_rate": 9.833319846368527e-07, "logits/chosen": -1.4724111557006836, "logits/rejected": -1.449961543083191, "logps/chosen": -1.1817361116409302, "logps/rejected": -12.227134704589844, "loss": 1.2398, "odds_ratio_loss": 0.5806252956390381, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11817361414432526, "rewards/margins": 1.1045401096343994, "rewards/rejected": -1.2227134704589844, "sft_loss": 1.1817361116409302, "step": 10265 }, { "epoch": 0.8, "grad_norm": 19.355998992919922, "learning_rate": 9.796680177215307e-07, "logits/chosen": -1.2841970920562744, "logits/rejected": -1.0065765380859375, "logps/chosen": -1.2662365436553955, "logps/rejected": -8.185649871826172, "loss": 1.2759, "odds_ratio_loss": 0.09626002609729767, "rewards/accuracies": 1.0, "rewards/chosen": -0.12662364542484283, "rewards/margins": 0.6919413805007935, "rewards/rejected": -0.8185650110244751, "sft_loss": 1.2662365436553955, "step": 10270 }, { "epoch": 0.8, "grad_norm": 9.266251564025879, "learning_rate": 9.76010148150102e-07, "logits/chosen": -1.270081877708435, "logits/rejected": -1.4700524806976318, "logps/chosen": -0.9803020358085632, "logps/rejected": -11.615490913391113, "loss": 0.9894, "odds_ratio_loss": 0.09096785634756088, "rewards/accuracies": 1.0, "rewards/chosen": -0.098030224442482, "rewards/margins": 1.0635188817977905, "rewards/rejected": -1.1615490913391113, "sft_loss": 0.9803020358085632, "step": 10275 }, { "epoch": 0.8, "grad_norm": 12.899735450744629, "learning_rate": 9.723583814701904e-07, "logits/chosen": -1.3414711952209473, "logits/rejected": -1.2441836595535278, "logps/chosen": -1.0551906824111938, "logps/rejected": -6.592919826507568, "loss": 1.0625, "odds_ratio_loss": 0.07285496592521667, "rewards/accuracies": 1.0, "rewards/chosen": -0.10551907122135162, "rewards/margins": 0.5537729263305664, "rewards/rejected": -0.6592920422554016, "sft_loss": 1.0551906824111938, "step": 10280 }, { "epoch": 0.8, "grad_norm": 4.892205238342285, "learning_rate": 9.687127232201604e-07, "logits/chosen": -1.311057209968567, "logits/rejected": -1.251116156578064, "logps/chosen": -0.7263120412826538, "logps/rejected": -5.205277442932129, "loss": 0.7392, "odds_ratio_loss": 0.1287280023097992, "rewards/accuracies": 1.0, "rewards/chosen": -0.07263121008872986, "rewards/margins": 0.4478965401649475, "rewards/rejected": -0.520527720451355, "sft_loss": 0.7263120412826538, "step": 10285 }, { "epoch": 0.8, "grad_norm": 6.589762210845947, "learning_rate": 9.650731789291191e-07, "logits/chosen": -1.22185218334198, "logits/rejected": -1.0001389980316162, "logps/chosen": -0.8899606466293335, "logps/rejected": -4.561327934265137, "loss": 0.9094, "odds_ratio_loss": 0.19471469521522522, "rewards/accuracies": 1.0, "rewards/chosen": -0.08899606764316559, "rewards/margins": 0.36713671684265137, "rewards/rejected": -0.4561327397823334, "sft_loss": 0.8899606466293335, "step": 10290 }, { "epoch": 0.8, "grad_norm": 19.73723793029785, "learning_rate": 9.614397541168963e-07, "logits/chosen": -1.2932441234588623, "logits/rejected": -1.2435917854309082, "logps/chosen": -0.8818877935409546, "logps/rejected": -2.412689685821533, "loss": 0.9012, "odds_ratio_loss": 0.192880317568779, "rewards/accuracies": 1.0, "rewards/chosen": -0.0881887823343277, "rewards/margins": 0.15308019518852234, "rewards/rejected": -0.24126896262168884, "sft_loss": 0.8818877935409546, "step": 10295 }, { "epoch": 0.8, "grad_norm": 6.862296104431152, "learning_rate": 9.57812454294041e-07, "logits/chosen": -1.2940229177474976, "logits/rejected": -1.143575668334961, "logps/chosen": -1.2678958177566528, "logps/rejected": -11.522639274597168, "loss": 1.3019, "odds_ratio_loss": 0.33973243832588196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12678958475589752, "rewards/margins": 1.025474190711975, "rewards/rejected": -1.152263879776001, "sft_loss": 1.2678958177566528, "step": 10300 }, { "epoch": 0.8, "grad_norm": 15.191943168640137, "learning_rate": 9.541912849618157e-07, "logits/chosen": -1.375228762626648, "logits/rejected": -1.450534462928772, "logps/chosen": -1.5081673860549927, "logps/rejected": -9.468241691589355, "loss": 1.531, "odds_ratio_loss": 0.22854717075824738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15081675350666046, "rewards/margins": 0.796007513999939, "rewards/rejected": -0.9468242526054382, "sft_loss": 1.5081673860549927, "step": 10305 }, { "epoch": 0.8, "grad_norm": 1.882200837135315, "learning_rate": 9.50576251612183e-07, "logits/chosen": -1.235004186630249, "logits/rejected": -1.764304757118225, "logps/chosen": -0.7166417241096497, "logps/rejected": -11.786436080932617, "loss": 0.7311, "odds_ratio_loss": 0.14428307116031647, "rewards/accuracies": 1.0, "rewards/chosen": -0.07166416943073273, "rewards/margins": 1.1069793701171875, "rewards/rejected": -1.1786434650421143, "sft_loss": 0.7166417241096497, "step": 10310 }, { "epoch": 0.8, "grad_norm": 4.966947078704834, "learning_rate": 9.469673597277995e-07, "logits/chosen": -1.2578141689300537, "logits/rejected": -1.0698421001434326, "logps/chosen": -0.5962234735488892, "logps/rejected": -6.369223594665527, "loss": 0.6073, "odds_ratio_loss": 0.11079008877277374, "rewards/accuracies": 1.0, "rewards/chosen": -0.059622347354888916, "rewards/margins": 0.5773000121116638, "rewards/rejected": -0.6369223594665527, "sft_loss": 0.5962234735488892, "step": 10315 }, { "epoch": 0.8, "grad_norm": 25.989683151245117, "learning_rate": 9.43364614782008e-07, "logits/chosen": -1.3131908178329468, "logits/rejected": -0.9322491884231567, "logps/chosen": -1.1517027616500854, "logps/rejected": -7.070420742034912, "loss": 1.1769, "odds_ratio_loss": 0.2521767020225525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11517028510570526, "rewards/margins": 0.5918717384338379, "rewards/rejected": -0.7070420980453491, "sft_loss": 1.1517027616500854, "step": 10320 }, { "epoch": 0.8, "grad_norm": 11.165669441223145, "learning_rate": 9.397680222388289e-07, "logits/chosen": -1.2683213949203491, "logits/rejected": -1.3640797138214111, "logps/chosen": -0.9720544815063477, "logps/rejected": -7.684467315673828, "loss": 0.9952, "odds_ratio_loss": 0.2314900904893875, "rewards/accuracies": 1.0, "rewards/chosen": -0.09720546007156372, "rewards/margins": 0.671241283416748, "rewards/rejected": -0.7684467434883118, "sft_loss": 0.9720544815063477, "step": 10325 }, { "epoch": 0.8, "grad_norm": 3.5104997158050537, "learning_rate": 9.361775875529511e-07, "logits/chosen": -1.3714743852615356, "logits/rejected": -1.004473090171814, "logps/chosen": -0.8311120271682739, "logps/rejected": -7.721318244934082, "loss": 0.8415, "odds_ratio_loss": 0.10402430593967438, "rewards/accuracies": 1.0, "rewards/chosen": -0.08311120420694351, "rewards/margins": 0.689020574092865, "rewards/rejected": -0.7721318006515503, "sft_loss": 0.8311120271682739, "step": 10330 }, { "epoch": 0.8, "grad_norm": 12.460982322692871, "learning_rate": 9.325933161697237e-07, "logits/chosen": -1.2947807312011719, "logits/rejected": -1.1248445510864258, "logps/chosen": -0.7291210293769836, "logps/rejected": -6.313011646270752, "loss": 0.732, "odds_ratio_loss": 0.028608087450265884, "rewards/accuracies": 1.0, "rewards/chosen": -0.07291209697723389, "rewards/margins": 0.5583890676498413, "rewards/rejected": -0.6313011646270752, "sft_loss": 0.7291210293769836, "step": 10335 }, { "epoch": 0.8, "grad_norm": 8.893537521362305, "learning_rate": 9.290152135251513e-07, "logits/chosen": -1.374703049659729, "logits/rejected": -1.0444432497024536, "logps/chosen": -1.0620887279510498, "logps/rejected": -10.834625244140625, "loss": 1.0637, "odds_ratio_loss": 0.016157137230038643, "rewards/accuracies": 1.0, "rewards/chosen": -0.10620886087417603, "rewards/margins": 0.9772537350654602, "rewards/rejected": -1.0834624767303467, "sft_loss": 1.0620887279510498, "step": 10340 }, { "epoch": 0.8, "grad_norm": 44.764617919921875, "learning_rate": 9.2544328504588e-07, "logits/chosen": -1.250694990158081, "logits/rejected": -1.616758942604065, "logps/chosen": -0.9803198575973511, "logps/rejected": -8.175067901611328, "loss": 1.0013, "odds_ratio_loss": 0.20934104919433594, "rewards/accuracies": 1.0, "rewards/chosen": -0.09803198277950287, "rewards/margins": 0.7194747924804688, "rewards/rejected": -0.8175067901611328, "sft_loss": 0.9803198575973511, "step": 10345 }, { "epoch": 0.81, "grad_norm": 77.59735870361328, "learning_rate": 9.218775361491916e-07, "logits/chosen": -1.3645678758621216, "logits/rejected": -1.6805179119110107, "logps/chosen": -1.9639705419540405, "logps/rejected": -10.83612060546875, "loss": 2.062, "odds_ratio_loss": 0.9798789024353027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19639703631401062, "rewards/margins": 0.8872150182723999, "rewards/rejected": -1.083612084388733, "sft_loss": 1.9639705419540405, "step": 10350 }, { "epoch": 0.81, "grad_norm": 31.32939338684082, "learning_rate": 9.183179722429997e-07, "logits/chosen": -1.4463050365447998, "logits/rejected": -1.1317588090896606, "logps/chosen": -1.1434721946716309, "logps/rejected": -7.519809722900391, "loss": 1.1464, "odds_ratio_loss": 0.029724955558776855, "rewards/accuracies": 1.0, "rewards/chosen": -0.11434721946716309, "rewards/margins": 0.6376338005065918, "rewards/rejected": -0.7519810199737549, "sft_loss": 1.1434721946716309, "step": 10355 }, { "epoch": 0.81, "grad_norm": 4.483221530914307, "learning_rate": 9.14764598725833e-07, "logits/chosen": -1.2005016803741455, "logits/rejected": -1.0444400310516357, "logps/chosen": -1.065643072128296, "logps/rejected": -8.766390800476074, "loss": 1.0659, "odds_ratio_loss": 0.002804366173222661, "rewards/accuracies": 1.0, "rewards/chosen": -0.10656432807445526, "rewards/margins": 0.7700749039649963, "rewards/rejected": -0.876639187335968, "sft_loss": 1.065643072128296, "step": 10360 }, { "epoch": 0.81, "grad_norm": 6.713079929351807, "learning_rate": 9.112174209868341e-07, "logits/chosen": -1.287014126777649, "logits/rejected": -1.178283452987671, "logps/chosen": -0.884039044380188, "logps/rejected": -5.378254413604736, "loss": 0.9081, "odds_ratio_loss": 0.2402028739452362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08840389549732208, "rewards/margins": 0.44942155480384827, "rewards/rejected": -0.5378254652023315, "sft_loss": 0.884039044380188, "step": 10365 }, { "epoch": 0.81, "grad_norm": 7.158381938934326, "learning_rate": 9.07676444405749e-07, "logits/chosen": -1.3042536973953247, "logits/rejected": -1.2625794410705566, "logps/chosen": -0.8462556004524231, "logps/rejected": -7.293604373931885, "loss": 0.8499, "odds_ratio_loss": 0.03648758679628372, "rewards/accuracies": 1.0, "rewards/chosen": -0.08462555706501007, "rewards/margins": 0.6447348594665527, "rewards/rejected": -0.7293604016304016, "sft_loss": 0.8462556004524231, "step": 10370 }, { "epoch": 0.81, "grad_norm": 121.68801879882812, "learning_rate": 9.041416743529168e-07, "logits/chosen": -1.4717400074005127, "logits/rejected": -1.0101182460784912, "logps/chosen": -0.9590436816215515, "logps/rejected": -4.982767581939697, "loss": 0.9694, "odds_ratio_loss": 0.10307572036981583, "rewards/accuracies": 1.0, "rewards/chosen": -0.09590436518192291, "rewards/margins": 0.4023723602294922, "rewards/rejected": -0.4982767701148987, "sft_loss": 0.9590436816215515, "step": 10375 }, { "epoch": 0.81, "grad_norm": 5.824394702911377, "learning_rate": 9.006131161892662e-07, "logits/chosen": -1.3937454223632812, "logits/rejected": -1.1171293258666992, "logps/chosen": -0.864578366279602, "logps/rejected": -5.059448719024658, "loss": 0.9235, "odds_ratio_loss": 0.58892422914505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08645783364772797, "rewards/margins": 0.4194870591163635, "rewards/rejected": -0.5059448480606079, "sft_loss": 0.864578366279602, "step": 10380 }, { "epoch": 0.81, "grad_norm": 77.57972717285156, "learning_rate": 8.970907752663021e-07, "logits/chosen": -1.0274670124053955, "logits/rejected": -1.433475136756897, "logps/chosen": -1.136293888092041, "logps/rejected": -11.400399208068848, "loss": 1.1385, "odds_ratio_loss": 0.021981341764330864, "rewards/accuracies": 1.0, "rewards/chosen": -0.11362940073013306, "rewards/margins": 1.026410460472107, "rewards/rejected": -1.1400400400161743, "sft_loss": 1.136293888092041, "step": 10385 }, { "epoch": 0.81, "grad_norm": 8.37480354309082, "learning_rate": 8.935746569261045e-07, "logits/chosen": -1.3487589359283447, "logits/rejected": -1.5885874032974243, "logps/chosen": -1.007812261581421, "logps/rejected": -17.21298599243164, "loss": 1.0078, "odds_ratio_loss": 0.0001933839957928285, "rewards/accuracies": 1.0, "rewards/chosen": -0.10078122466802597, "rewards/margins": 1.620517373085022, "rewards/rejected": -1.7212985754013062, "sft_loss": 1.007812261581421, "step": 10390 }, { "epoch": 0.81, "grad_norm": 134.8335418701172, "learning_rate": 8.900647665013112e-07, "logits/chosen": -1.3777470588684082, "logits/rejected": -1.3657658100128174, "logps/chosen": -1.1525741815567017, "logps/rejected": -7.184072017669678, "loss": 1.1571, "odds_ratio_loss": 0.045270055532455444, "rewards/accuracies": 1.0, "rewards/chosen": -0.11525741964578629, "rewards/margins": 0.6031497716903687, "rewards/rejected": -0.718407154083252, "sft_loss": 1.1525741815567017, "step": 10395 }, { "epoch": 0.81, "grad_norm": 9.768054008483887, "learning_rate": 8.865611093151161e-07, "logits/chosen": -1.4446382522583008, "logits/rejected": -1.2962652444839478, "logps/chosen": -0.9836952090263367, "logps/rejected": -13.642313957214355, "loss": 0.9951, "odds_ratio_loss": 0.11401750147342682, "rewards/accuracies": 1.0, "rewards/chosen": -0.09836951643228531, "rewards/margins": 1.2658618688583374, "rewards/rejected": -1.3642313480377197, "sft_loss": 0.9836952090263367, "step": 10400 }, { "epoch": 0.81, "grad_norm": 9.077279090881348, "learning_rate": 8.830636906812628e-07, "logits/chosen": -1.2882994413375854, "logits/rejected": -1.3112610578536987, "logps/chosen": -0.8100179433822632, "logps/rejected": -6.998376369476318, "loss": 0.8153, "odds_ratio_loss": 0.0527278408408165, "rewards/accuracies": 1.0, "rewards/chosen": -0.08100180327892303, "rewards/margins": 0.6188358664512634, "rewards/rejected": -0.6998376846313477, "sft_loss": 0.8100179433822632, "step": 10405 }, { "epoch": 0.81, "grad_norm": 11.235574722290039, "learning_rate": 8.795725159040286e-07, "logits/chosen": -1.18384850025177, "logits/rejected": -1.3355462551116943, "logps/chosen": -0.8854089975357056, "logps/rejected": -9.00184154510498, "loss": 0.8944, "odds_ratio_loss": 0.09034241735935211, "rewards/accuracies": 1.0, "rewards/chosen": -0.08854089677333832, "rewards/margins": 0.8116433024406433, "rewards/rejected": -0.900184154510498, "sft_loss": 0.8854089975357056, "step": 10410 }, { "epoch": 0.81, "grad_norm": 6.1802496910095215, "learning_rate": 8.760875902782235e-07, "logits/chosen": -1.1961528062820435, "logits/rejected": -1.2074816226959229, "logps/chosen": -1.2426180839538574, "logps/rejected": -10.89729118347168, "loss": 1.2475, "odds_ratio_loss": 0.048336226493120193, "rewards/accuracies": 1.0, "rewards/chosen": -0.12426181882619858, "rewards/margins": 0.9654672741889954, "rewards/rejected": -1.0897290706634521, "sft_loss": 1.2426180839538574, "step": 10415 }, { "epoch": 0.81, "grad_norm": 26.631731033325195, "learning_rate": 8.726089190891807e-07, "logits/chosen": -1.357290506362915, "logits/rejected": -1.6016134023666382, "logps/chosen": -1.1133967638015747, "logps/rejected": -4.950104236602783, "loss": 1.1918, "odds_ratio_loss": 0.7840924263000488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11133966594934464, "rewards/margins": 0.38367074728012085, "rewards/rejected": -0.4950104355812073, "sft_loss": 1.1133967638015747, "step": 10420 }, { "epoch": 0.81, "grad_norm": 10.378015518188477, "learning_rate": 8.691365076127461e-07, "logits/chosen": -1.4252339601516724, "logits/rejected": -1.4014484882354736, "logps/chosen": -0.8036090135574341, "logps/rejected": -9.63952922821045, "loss": 0.8036, "odds_ratio_loss": 0.00032946295686997473, "rewards/accuracies": 1.0, "rewards/chosen": -0.08036090433597565, "rewards/margins": 0.8835920095443726, "rewards/rejected": -0.963952898979187, "sft_loss": 0.8036090135574341, "step": 10425 }, { "epoch": 0.81, "grad_norm": 10.65987777709961, "learning_rate": 8.656703611152728e-07, "logits/chosen": -1.2951816320419312, "logits/rejected": -1.3539683818817139, "logps/chosen": -0.9659156799316406, "logps/rejected": -15.016809463500977, "loss": 0.9659, "odds_ratio_loss": 4.964599429513328e-05, "rewards/accuracies": 1.0, "rewards/chosen": -0.09659156203269958, "rewards/margins": 1.4050893783569336, "rewards/rejected": -1.5016809701919556, "sft_loss": 0.9659156799316406, "step": 10430 }, { "epoch": 0.81, "grad_norm": 10.26109504699707, "learning_rate": 8.622104848536117e-07, "logits/chosen": -1.2839380502700806, "logits/rejected": -0.8194013833999634, "logps/chosen": -0.883658766746521, "logps/rejected": -5.024937152862549, "loss": 0.8999, "odds_ratio_loss": 0.16276155412197113, "rewards/accuracies": 1.0, "rewards/chosen": -0.08836588263511658, "rewards/margins": 0.41412782669067383, "rewards/rejected": -0.5024937391281128, "sft_loss": 0.883658766746521, "step": 10435 }, { "epoch": 0.81, "grad_norm": 7.266590595245361, "learning_rate": 8.587568840751043e-07, "logits/chosen": -1.2334734201431274, "logits/rejected": -1.316857099533081, "logps/chosen": -1.055189847946167, "logps/rejected": -8.211328506469727, "loss": 1.0711, "odds_ratio_loss": 0.1588059365749359, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10551897436380386, "rewards/margins": 0.7156140208244324, "rewards/rejected": -0.8211329579353333, "sft_loss": 1.055189847946167, "step": 10440 }, { "epoch": 0.81, "grad_norm": 25.591766357421875, "learning_rate": 8.553095640175751e-07, "logits/chosen": -1.3331174850463867, "logits/rejected": -0.950304388999939, "logps/chosen": -1.043084979057312, "logps/rejected": -7.412625789642334, "loss": 1.0696, "odds_ratio_loss": 0.26476946473121643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10430850088596344, "rewards/margins": 0.636954128742218, "rewards/rejected": -0.7412625551223755, "sft_loss": 1.043084979057312, "step": 10445 }, { "epoch": 0.81, "grad_norm": 41.862178802490234, "learning_rate": 8.518685299093216e-07, "logits/chosen": -1.2142364978790283, "logits/rejected": -1.0974957942962646, "logps/chosen": -0.8024848699569702, "logps/rejected": -10.103257179260254, "loss": 0.8044, "odds_ratio_loss": 0.019633423537015915, "rewards/accuracies": 1.0, "rewards/chosen": -0.08024848997592926, "rewards/margins": 0.9300772547721863, "rewards/rejected": -1.0103256702423096, "sft_loss": 0.8024848699569702, "step": 10450 }, { "epoch": 0.81, "grad_norm": 6.318872928619385, "learning_rate": 8.484337869691106e-07, "logits/chosen": -1.2735618352890015, "logits/rejected": -1.0776972770690918, "logps/chosen": -1.2491729259490967, "logps/rejected": -3.0578088760375977, "loss": 1.286, "odds_ratio_loss": 0.3680502772331238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12491729110479355, "rewards/margins": 0.18086357414722443, "rewards/rejected": -0.3057808578014374, "sft_loss": 1.2491729259490967, "step": 10455 }, { "epoch": 0.81, "grad_norm": 8.459473609924316, "learning_rate": 8.450053404061654e-07, "logits/chosen": -1.4461396932601929, "logits/rejected": -0.8473329544067383, "logps/chosen": -1.0210599899291992, "logps/rejected": -7.846470832824707, "loss": 1.0504, "odds_ratio_loss": 0.2929363548755646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1021059975028038, "rewards/margins": 0.6825410723686218, "rewards/rejected": -0.7846471071243286, "sft_loss": 1.0210599899291992, "step": 10460 }, { "epoch": 0.81, "grad_norm": 4.896653175354004, "learning_rate": 8.415831954201587e-07, "logits/chosen": -1.2814228534698486, "logits/rejected": -0.8261783719062805, "logps/chosen": -0.9416384696960449, "logps/rejected": -12.27811336517334, "loss": 0.9628, "odds_ratio_loss": 0.21157538890838623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09416385740041733, "rewards/margins": 1.1336476802825928, "rewards/rejected": -1.2278114557266235, "sft_loss": 0.9416384696960449, "step": 10465 }, { "epoch": 0.81, "grad_norm": 7.460947036743164, "learning_rate": 8.3816735720121e-07, "logits/chosen": -1.5645605325698853, "logits/rejected": -1.104880928993225, "logps/chosen": -1.0965392589569092, "logps/rejected": -6.61223840713501, "loss": 1.1432, "odds_ratio_loss": 0.4668787121772766, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10965392738580704, "rewards/margins": 0.5515699982643127, "rewards/rejected": -0.6612239480018616, "sft_loss": 1.0965392589569092, "step": 10470 }, { "epoch": 0.81, "grad_norm": 17.06568145751953, "learning_rate": 8.347578309298715e-07, "logits/chosen": -1.293723702430725, "logits/rejected": -1.6184883117675781, "logps/chosen": -0.9423452615737915, "logps/rejected": -11.385828971862793, "loss": 0.9426, "odds_ratio_loss": 0.0028821465093642473, "rewards/accuracies": 1.0, "rewards/chosen": -0.09423451125621796, "rewards/margins": 1.0443484783172607, "rewards/rejected": -1.1385829448699951, "sft_loss": 0.9423452615737915, "step": 10475 }, { "epoch": 0.82, "grad_norm": 10.77719497680664, "learning_rate": 8.313546217771224e-07, "logits/chosen": -1.4554004669189453, "logits/rejected": -1.0656113624572754, "logps/chosen": -0.8173044323921204, "logps/rejected": -1.7137502431869507, "loss": 0.8712, "odds_ratio_loss": 0.53852379322052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0817304402589798, "rewards/margins": 0.08964459598064423, "rewards/rejected": -0.17137503623962402, "sft_loss": 0.8173044323921204, "step": 10480 }, { "epoch": 0.82, "grad_norm": 5.053610801696777, "learning_rate": 8.27957734904361e-07, "logits/chosen": -1.4066091775894165, "logits/rejected": -0.704928994178772, "logps/chosen": -0.9944049715995789, "logps/rejected": -6.750539302825928, "loss": 1.0029, "odds_ratio_loss": 0.08509130030870438, "rewards/accuracies": 1.0, "rewards/chosen": -0.09944050014019012, "rewards/margins": 0.5756133794784546, "rewards/rejected": -0.6750538945198059, "sft_loss": 0.9944049715995789, "step": 10485 }, { "epoch": 0.82, "grad_norm": 5.587077617645264, "learning_rate": 8.245671754633977e-07, "logits/chosen": -1.446058988571167, "logits/rejected": -1.1737011671066284, "logps/chosen": -0.727880597114563, "logps/rejected": -6.857645511627197, "loss": 0.7469, "odds_ratio_loss": 0.19013085961341858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0727880671620369, "rewards/margins": 0.6129764914512634, "rewards/rejected": -0.6857645511627197, "sft_loss": 0.727880597114563, "step": 10490 }, { "epoch": 0.82, "grad_norm": 6.152067184448242, "learning_rate": 8.211829485964462e-07, "logits/chosen": -1.3971049785614014, "logits/rejected": -1.288309097290039, "logps/chosen": -1.004244089126587, "logps/rejected": -6.761834621429443, "loss": 1.0273, "odds_ratio_loss": 0.23082295060157776, "rewards/accuracies": 1.0, "rewards/chosen": -0.10042442381381989, "rewards/margins": 0.5757590532302856, "rewards/rejected": -0.6761834621429443, "sft_loss": 1.004244089126587, "step": 10495 }, { "epoch": 0.82, "grad_norm": 79.3951416015625, "learning_rate": 8.178050594361153e-07, "logits/chosen": -1.2680330276489258, "logits/rejected": -1.05678391456604, "logps/chosen": -0.8385862112045288, "logps/rejected": -4.127747535705566, "loss": 0.8477, "odds_ratio_loss": 0.09067679941654205, "rewards/accuracies": 1.0, "rewards/chosen": -0.08385862410068512, "rewards/margins": 0.32891613245010376, "rewards/rejected": -0.4127747416496277, "sft_loss": 0.8385862112045288, "step": 10500 }, { "epoch": 0.82, "grad_norm": 5.112764835357666, "learning_rate": 8.144335131054054e-07, "logits/chosen": -1.1656320095062256, "logits/rejected": -1.1373666524887085, "logps/chosen": -1.0548193454742432, "logps/rejected": -10.11992359161377, "loss": 1.0553, "odds_ratio_loss": 0.004356134682893753, "rewards/accuracies": 1.0, "rewards/chosen": -0.10548193752765656, "rewards/margins": 0.9065103530883789, "rewards/rejected": -1.011992335319519, "sft_loss": 1.0548193454742432, "step": 10505 }, { "epoch": 0.82, "grad_norm": 38.178993225097656, "learning_rate": 8.110683147176929e-07, "logits/chosen": -1.3449701070785522, "logits/rejected": -1.1206490993499756, "logps/chosen": -0.8311041593551636, "logps/rejected": -4.470694541931152, "loss": 0.8728, "odds_ratio_loss": 0.4171561300754547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08311041444540024, "rewards/margins": 0.36395907402038574, "rewards/rejected": -0.4470694959163666, "sft_loss": 0.8311041593551636, "step": 10510 }, { "epoch": 0.82, "grad_norm": 7.466664791107178, "learning_rate": 8.077094693767274e-07, "logits/chosen": -1.4264633655548096, "logits/rejected": -0.9952713251113892, "logps/chosen": -0.6681760549545288, "logps/rejected": -4.286257743835449, "loss": 0.6758, "odds_ratio_loss": 0.07590831816196442, "rewards/accuracies": 1.0, "rewards/chosen": -0.06681760400533676, "rewards/margins": 0.3618081510066986, "rewards/rejected": -0.42862576246261597, "sft_loss": 0.6681760549545288, "step": 10515 }, { "epoch": 0.82, "grad_norm": 6.742861270904541, "learning_rate": 8.043569821766267e-07, "logits/chosen": -1.3641449213027954, "logits/rejected": -0.7682236433029175, "logps/chosen": -0.9841006994247437, "logps/rejected": -9.281465530395508, "loss": 0.9896, "odds_ratio_loss": 0.055423516780138016, "rewards/accuracies": 1.0, "rewards/chosen": -0.09841008484363556, "rewards/margins": 0.8297365307807922, "rewards/rejected": -0.9281465411186218, "sft_loss": 0.9841006994247437, "step": 10520 }, { "epoch": 0.82, "grad_norm": 45.302364349365234, "learning_rate": 8.010108582018622e-07, "logits/chosen": -1.3702471256256104, "logits/rejected": -1.1153770685195923, "logps/chosen": -1.0117090940475464, "logps/rejected": -2.893731117248535, "loss": 1.0545, "odds_ratio_loss": 0.42815542221069336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10117091238498688, "rewards/margins": 0.18820220232009888, "rewards/rejected": -0.28937309980392456, "sft_loss": 1.0117090940475464, "step": 10525 }, { "epoch": 0.82, "grad_norm": 8.680965423583984, "learning_rate": 7.976711025272549e-07, "logits/chosen": -1.3835210800170898, "logits/rejected": -1.3382296562194824, "logps/chosen": -0.7052081227302551, "logps/rejected": -10.351499557495117, "loss": 0.7243, "odds_ratio_loss": 0.19136479496955872, "rewards/accuracies": 1.0, "rewards/chosen": -0.0705208107829094, "rewards/margins": 0.9646291732788086, "rewards/rejected": -1.0351499319076538, "sft_loss": 0.7052081227302551, "step": 10530 }, { "epoch": 0.82, "grad_norm": 37.77129364013672, "learning_rate": 7.943377202179697e-07, "logits/chosen": -1.252140998840332, "logits/rejected": -0.9507595300674438, "logps/chosen": -0.8741302490234375, "logps/rejected": -8.739225387573242, "loss": 0.8753, "odds_ratio_loss": 0.01146181020885706, "rewards/accuracies": 1.0, "rewards/chosen": -0.08741302788257599, "rewards/margins": 0.7865095734596252, "rewards/rejected": -0.8739225268363953, "sft_loss": 0.8741302490234375, "step": 10535 }, { "epoch": 0.82, "grad_norm": 9.636751174926758, "learning_rate": 7.910107163295034e-07, "logits/chosen": -1.291235327720642, "logits/rejected": -1.383276343345642, "logps/chosen": -1.13475501537323, "logps/rejected": -11.56714916229248, "loss": 1.1404, "odds_ratio_loss": 0.056367408484220505, "rewards/accuracies": 1.0, "rewards/chosen": -0.113475501537323, "rewards/margins": 1.0432393550872803, "rewards/rejected": -1.1567147970199585, "sft_loss": 1.13475501537323, "step": 10540 }, { "epoch": 0.82, "grad_norm": 13.887544631958008, "learning_rate": 7.876900959076806e-07, "logits/chosen": -1.290006160736084, "logits/rejected": -0.8440510034561157, "logps/chosen": -0.8624340295791626, "logps/rejected": -2.5278334617614746, "loss": 0.8844, "odds_ratio_loss": 0.21950320899486542, "rewards/accuracies": 1.0, "rewards/chosen": -0.0862434059381485, "rewards/margins": 0.1665399670600891, "rewards/rejected": -0.2527833580970764, "sft_loss": 0.8624340295791626, "step": 10545 }, { "epoch": 0.82, "grad_norm": 37.36802673339844, "learning_rate": 7.843758639886423e-07, "logits/chosen": -1.547479510307312, "logits/rejected": -1.271514654159546, "logps/chosen": -1.0148048400878906, "logps/rejected": -2.815554141998291, "loss": 1.0743, "odds_ratio_loss": 0.5949819087982178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10148048400878906, "rewards/margins": 0.18007493019104004, "rewards/rejected": -0.2815554141998291, "sft_loss": 1.0148048400878906, "step": 10550 }, { "epoch": 0.82, "grad_norm": 3.9318273067474365, "learning_rate": 7.810680255988428e-07, "logits/chosen": -1.4175784587860107, "logits/rejected": -1.2164119482040405, "logps/chosen": -0.9271456003189087, "logps/rejected": -18.189111709594727, "loss": 0.9413, "odds_ratio_loss": 0.1411227434873581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09271456301212311, "rewards/margins": 1.7261966466903687, "rewards/rejected": -1.8189113140106201, "sft_loss": 0.9271456003189087, "step": 10555 }, { "epoch": 0.82, "grad_norm": 24.3516845703125, "learning_rate": 7.777665857550392e-07, "logits/chosen": -1.471502661705017, "logits/rejected": -1.1980304718017578, "logps/chosen": -0.7790975570678711, "logps/rejected": -4.073418140411377, "loss": 0.8185, "odds_ratio_loss": 0.39446350932121277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07790975272655487, "rewards/margins": 0.32943207025527954, "rewards/rejected": -0.4073418080806732, "sft_loss": 0.7790975570678711, "step": 10560 }, { "epoch": 0.82, "grad_norm": 33.29905700683594, "learning_rate": 7.74471549464283e-07, "logits/chosen": -1.2413263320922852, "logits/rejected": -1.1940476894378662, "logps/chosen": -0.9201310873031616, "logps/rejected": -9.094204902648926, "loss": 0.9233, "odds_ratio_loss": 0.031404972076416016, "rewards/accuracies": 1.0, "rewards/chosen": -0.09201310575008392, "rewards/margins": 0.8174074292182922, "rewards/rejected": -0.909420371055603, "sft_loss": 0.9201310873031616, "step": 10565 }, { "epoch": 0.82, "grad_norm": 62.213478088378906, "learning_rate": 7.711829217239169e-07, "logits/chosen": -1.107883095741272, "logits/rejected": -1.278570532798767, "logps/chosen": -1.1935635805130005, "logps/rejected": -7.609461307525635, "loss": 1.214, "odds_ratio_loss": 0.2044333517551422, "rewards/accuracies": 1.0, "rewards/chosen": -0.11935635656118393, "rewards/margins": 0.6415897607803345, "rewards/rejected": -0.7609461545944214, "sft_loss": 1.1935635805130005, "step": 10570 }, { "epoch": 0.82, "grad_norm": 21.677597045898438, "learning_rate": 7.679007075215616e-07, "logits/chosen": -1.4407821893692017, "logits/rejected": -0.8825246095657349, "logps/chosen": -1.1419273614883423, "logps/rejected": -4.68762731552124, "loss": 1.154, "odds_ratio_loss": 0.12092401832342148, "rewards/accuracies": 1.0, "rewards/chosen": -0.11419273912906647, "rewards/margins": 0.3545700013637543, "rewards/rejected": -0.46876272559165955, "sft_loss": 1.1419273614883423, "step": 10575 }, { "epoch": 0.82, "grad_norm": 60.53403854370117, "learning_rate": 7.646249118351106e-07, "logits/chosen": -1.1642895936965942, "logits/rejected": -1.0220810174942017, "logps/chosen": -0.8388074636459351, "logps/rejected": -7.616601467132568, "loss": 0.8497, "odds_ratio_loss": 0.1086864247918129, "rewards/accuracies": 1.0, "rewards/chosen": -0.08388075977563858, "rewards/margins": 0.6777793765068054, "rewards/rejected": -0.7616601586341858, "sft_loss": 0.8388074636459351, "step": 10580 }, { "epoch": 0.82, "grad_norm": 8.826997756958008, "learning_rate": 7.61355539632726e-07, "logits/chosen": -1.378531575202942, "logits/rejected": -0.9921766519546509, "logps/chosen": -0.8884096145629883, "logps/rejected": -5.2243757247924805, "loss": 0.9135, "odds_ratio_loss": 0.25051718950271606, "rewards/accuracies": 1.0, "rewards/chosen": -0.08884096145629883, "rewards/margins": 0.433596670627594, "rewards/rejected": -0.522437572479248, "sft_loss": 0.8884096145629883, "step": 10585 }, { "epoch": 0.82, "grad_norm": 12.614011764526367, "learning_rate": 7.580925958728247e-07, "logits/chosen": -1.5484821796417236, "logits/rejected": -1.3846181631088257, "logps/chosen": -0.9231731295585632, "logps/rejected": -14.208297729492188, "loss": 0.9368, "odds_ratio_loss": 0.13594523072242737, "rewards/accuracies": 1.0, "rewards/chosen": -0.09231732785701752, "rewards/margins": 1.32851243019104, "rewards/rejected": -1.4208297729492188, "sft_loss": 0.9231731295585632, "step": 10590 }, { "epoch": 0.82, "grad_norm": 21.662193298339844, "learning_rate": 7.548360855040754e-07, "logits/chosen": -1.4308409690856934, "logits/rejected": -0.9334679841995239, "logps/chosen": -0.8962264060974121, "logps/rejected": -10.339228630065918, "loss": 0.8963, "odds_ratio_loss": 0.0004231159982737154, "rewards/accuracies": 1.0, "rewards/chosen": -0.0896226316690445, "rewards/margins": 0.9443003535270691, "rewards/rejected": -1.0339229106903076, "sft_loss": 0.8962264060974121, "step": 10595 }, { "epoch": 0.82, "grad_norm": 7.128372669219971, "learning_rate": 7.515860134653897e-07, "logits/chosen": -1.347833275794983, "logits/rejected": -1.1420245170593262, "logps/chosen": -0.8281259536743164, "logps/rejected": -7.346166133880615, "loss": 0.8299, "odds_ratio_loss": 0.017481762915849686, "rewards/accuracies": 1.0, "rewards/chosen": -0.0828125923871994, "rewards/margins": 0.6518040895462036, "rewards/rejected": -0.7346166372299194, "sft_loss": 0.8281259536743164, "step": 10600 }, { "epoch": 0.82, "grad_norm": 14.502819061279297, "learning_rate": 7.483423846859133e-07, "logits/chosen": -1.3140825033187866, "logits/rejected": -1.5204674005508423, "logps/chosen": -0.798182487487793, "logps/rejected": -5.050746917724609, "loss": 0.8016, "odds_ratio_loss": 0.03423731029033661, "rewards/accuracies": 1.0, "rewards/chosen": -0.0798182487487793, "rewards/margins": 0.4252564311027527, "rewards/rejected": -0.505074679851532, "sft_loss": 0.798182487487793, "step": 10605 }, { "epoch": 0.83, "grad_norm": 4.6367716789245605, "learning_rate": 7.451052040850221e-07, "logits/chosen": -1.4010194540023804, "logits/rejected": -0.8769040107727051, "logps/chosen": -1.209674596786499, "logps/rejected": -6.710930824279785, "loss": 1.2768, "odds_ratio_loss": 0.670904815196991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12096747010946274, "rewards/margins": 0.5501256585121155, "rewards/rejected": -0.6710931062698364, "sft_loss": 1.209674596786499, "step": 10610 }, { "epoch": 0.83, "grad_norm": 26.443431854248047, "learning_rate": 7.418744765723118e-07, "logits/chosen": -1.3006012439727783, "logits/rejected": -1.4145643711090088, "logps/chosen": -0.8388000726699829, "logps/rejected": -5.752911567687988, "loss": 0.8537, "odds_ratio_loss": 0.14872387051582336, "rewards/accuracies": 1.0, "rewards/chosen": -0.08388000726699829, "rewards/margins": 0.49141111969947815, "rewards/rejected": -0.5752911567687988, "sft_loss": 0.8388000726699829, "step": 10615 }, { "epoch": 0.83, "grad_norm": 22.965103149414062, "learning_rate": 7.386502070475904e-07, "logits/chosen": -1.3692954778671265, "logits/rejected": -0.9632579684257507, "logps/chosen": -1.0018607378005981, "logps/rejected": -7.105264186859131, "loss": 1.0268, "odds_ratio_loss": 0.24981114268302917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1001860722899437, "rewards/margins": 0.610340416431427, "rewards/rejected": -0.7105264663696289, "sft_loss": 1.0018607378005981, "step": 10620 }, { "epoch": 0.83, "grad_norm": 9.114238739013672, "learning_rate": 7.354324004008723e-07, "logits/chosen": -1.4396076202392578, "logits/rejected": -0.8397786021232605, "logps/chosen": -0.9166440963745117, "logps/rejected": -2.550844669342041, "loss": 0.9506, "odds_ratio_loss": 0.3398217558860779, "rewards/accuracies": 1.0, "rewards/chosen": -0.09166441112756729, "rewards/margins": 0.16342003643512726, "rewards/rejected": -0.25508445501327515, "sft_loss": 0.9166440963745117, "step": 10625 }, { "epoch": 0.83, "grad_norm": 21.649497985839844, "learning_rate": 7.322210615123688e-07, "logits/chosen": -1.4119873046875, "logits/rejected": -1.0898716449737549, "logps/chosen": -0.959582507610321, "logps/rejected": -5.670458793640137, "loss": 1.0098, "odds_ratio_loss": 0.5025702714920044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09595826268196106, "rewards/margins": 0.4710876941680908, "rewards/rejected": -0.5670458674430847, "sft_loss": 0.959582507610321, "step": 10630 }, { "epoch": 0.83, "grad_norm": 15.845293045043945, "learning_rate": 7.290161952524843e-07, "logits/chosen": -1.4333869218826294, "logits/rejected": -1.5032893419265747, "logps/chosen": -0.8169578313827515, "logps/rejected": -10.404666900634766, "loss": 0.8321, "odds_ratio_loss": 0.15138432383537292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0816957876086235, "rewards/margins": 0.9587709307670593, "rewards/rejected": -1.0404666662216187, "sft_loss": 0.8169578313827515, "step": 10635 }, { "epoch": 0.83, "grad_norm": 6.184009552001953, "learning_rate": 7.258178064818056e-07, "logits/chosen": -1.3951168060302734, "logits/rejected": -1.4423763751983643, "logps/chosen": -4.069607734680176, "logps/rejected": -13.736379623413086, "loss": 4.1026, "odds_ratio_loss": 0.3303770422935486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.40696078538894653, "rewards/margins": 0.9666773080825806, "rewards/rejected": -1.3736379146575928, "sft_loss": 4.069607734680176, "step": 10640 }, { "epoch": 0.83, "grad_norm": 6.67935037612915, "learning_rate": 7.226259000510932e-07, "logits/chosen": -1.2222254276275635, "logits/rejected": -1.2913544178009033, "logps/chosen": -1.056786298751831, "logps/rejected": -5.585196018218994, "loss": 1.07, "odds_ratio_loss": 0.1319592297077179, "rewards/accuracies": 1.0, "rewards/chosen": -0.10567863285541534, "rewards/margins": 0.4528409540653229, "rewards/rejected": -0.5585195422172546, "sft_loss": 1.056786298751831, "step": 10645 }, { "epoch": 0.83, "grad_norm": 69.80462646484375, "learning_rate": 7.194404808012811e-07, "logits/chosen": -1.450503945350647, "logits/rejected": -0.8002532124519348, "logps/chosen": -1.2184422016143799, "logps/rejected": -6.215879440307617, "loss": 1.2687, "odds_ratio_loss": 0.5026370882987976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12184421718120575, "rewards/margins": 0.4997437596321106, "rewards/rejected": -0.6215879321098328, "sft_loss": 1.2184422016143799, "step": 10650 }, { "epoch": 0.83, "grad_norm": 14.467700004577637, "learning_rate": 7.162615535634609e-07, "logits/chosen": -1.3063253164291382, "logits/rejected": -1.1163030862808228, "logps/chosen": -1.0223617553710938, "logps/rejected": -2.57954740524292, "loss": 1.04, "odds_ratio_loss": 0.17633824050426483, "rewards/accuracies": 1.0, "rewards/chosen": -0.10223618894815445, "rewards/margins": 0.15571856498718262, "rewards/rejected": -0.25795474648475647, "sft_loss": 1.0223617553710938, "step": 10655 }, { "epoch": 0.83, "grad_norm": 6.114673137664795, "learning_rate": 7.130891231588794e-07, "logits/chosen": -1.2796623706817627, "logits/rejected": -0.8105155229568481, "logps/chosen": -1.2943050861358643, "logps/rejected": -2.6557037830352783, "loss": 1.3144, "odds_ratio_loss": 0.20114263892173767, "rewards/accuracies": 1.0, "rewards/chosen": -0.12943051755428314, "rewards/margins": 0.1361398547887802, "rewards/rejected": -0.26557037234306335, "sft_loss": 1.2943050861358643, "step": 10660 }, { "epoch": 0.83, "grad_norm": 24.761369705200195, "learning_rate": 7.099231943989299e-07, "logits/chosen": -1.2193286418914795, "logits/rejected": -1.3471779823303223, "logps/chosen": -0.9623664617538452, "logps/rejected": -6.8237504959106445, "loss": 0.98, "odds_ratio_loss": 0.17595073580741882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09623664617538452, "rewards/margins": 0.5861383676528931, "rewards/rejected": -0.6823750734329224, "sft_loss": 0.9623664617538452, "step": 10665 }, { "epoch": 0.83, "grad_norm": 15.483434677124023, "learning_rate": 7.067637720851451e-07, "logits/chosen": -1.322331428527832, "logits/rejected": -1.5640705823898315, "logps/chosen": -0.6776281595230103, "logps/rejected": -8.613502502441406, "loss": 0.698, "odds_ratio_loss": 0.2034587413072586, "rewards/accuracies": 1.0, "rewards/chosen": -0.0677628144621849, "rewards/margins": 0.793587327003479, "rewards/rejected": -0.8613502383232117, "sft_loss": 0.6776281595230103, "step": 10670 }, { "epoch": 0.83, "grad_norm": 14.496145248413086, "learning_rate": 7.036108610091896e-07, "logits/chosen": -1.404266595840454, "logits/rejected": -1.0386518239974976, "logps/chosen": -1.0749711990356445, "logps/rejected": -3.9387428760528564, "loss": 1.1257, "odds_ratio_loss": 0.5070328712463379, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.10749713331460953, "rewards/margins": 0.2863771915435791, "rewards/rejected": -0.39387431740760803, "sft_loss": 1.0749711990356445, "step": 10675 }, { "epoch": 0.83, "grad_norm": 316.1288757324219, "learning_rate": 7.004644659528559e-07, "logits/chosen": -0.9859359860420227, "logits/rejected": -1.2559010982513428, "logps/chosen": -0.8468478322029114, "logps/rejected": -9.959203720092773, "loss": 0.855, "odds_ratio_loss": 0.08120620250701904, "rewards/accuracies": 1.0, "rewards/chosen": -0.08468478918075562, "rewards/margins": 0.9112356901168823, "rewards/rejected": -0.9959205389022827, "sft_loss": 0.8468478322029114, "step": 10680 }, { "epoch": 0.83, "grad_norm": 9.3912353515625, "learning_rate": 6.973245916880494e-07, "logits/chosen": -1.4307042360305786, "logits/rejected": -1.6631122827529907, "logps/chosen": -1.0856106281280518, "logps/rejected": -15.808072090148926, "loss": 1.0857, "odds_ratio_loss": 0.000607538444455713, "rewards/accuracies": 1.0, "rewards/chosen": -0.10856107622385025, "rewards/margins": 1.4722459316253662, "rewards/rejected": -1.580807089805603, "sft_loss": 1.0856106281280518, "step": 10685 }, { "epoch": 0.83, "grad_norm": 48.749019622802734, "learning_rate": 6.941912429767883e-07, "logits/chosen": -1.369105577468872, "logits/rejected": -1.0214558839797974, "logps/chosen": -0.8400972485542297, "logps/rejected": -6.254521369934082, "loss": 0.8957, "odds_ratio_loss": 0.5555503964424133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08400972187519073, "rewards/margins": 0.5414424538612366, "rewards/rejected": -0.6254521012306213, "sft_loss": 0.8400972485542297, "step": 10690 }, { "epoch": 0.83, "grad_norm": 11.4762544631958, "learning_rate": 6.910644245711933e-07, "logits/chosen": -1.3264143466949463, "logits/rejected": -1.117030382156372, "logps/chosen": -1.1049288511276245, "logps/rejected": -12.542996406555176, "loss": 1.1266, "odds_ratio_loss": 0.2170540988445282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11049288511276245, "rewards/margins": 1.1438066959381104, "rewards/rejected": -1.2542996406555176, "sft_loss": 1.1049288511276245, "step": 10695 }, { "epoch": 0.83, "grad_norm": 16.640090942382812, "learning_rate": 6.879441412134829e-07, "logits/chosen": -1.3166162967681885, "logits/rejected": -1.4665424823760986, "logps/chosen": -1.2805297374725342, "logps/rejected": -14.314722061157227, "loss": 1.2831, "odds_ratio_loss": 0.025562694296240807, "rewards/accuracies": 1.0, "rewards/chosen": -0.1280529797077179, "rewards/margins": 1.3034193515777588, "rewards/rejected": -1.4314724206924438, "sft_loss": 1.2805297374725342, "step": 10700 }, { "epoch": 0.83, "grad_norm": 4.9655632972717285, "learning_rate": 6.848303976359627e-07, "logits/chosen": -1.2870609760284424, "logits/rejected": -0.8258674740791321, "logps/chosen": -1.148829460144043, "logps/rejected": -12.691110610961914, "loss": 1.1623, "odds_ratio_loss": 0.1348555088043213, "rewards/accuracies": 1.0, "rewards/chosen": -0.1148829460144043, "rewards/margins": 1.1542279720306396, "rewards/rejected": -1.269110918045044, "sft_loss": 1.148829460144043, "step": 10705 }, { "epoch": 0.83, "grad_norm": 4.1600847244262695, "learning_rate": 6.8172319856102e-07, "logits/chosen": -1.3312809467315674, "logits/rejected": -1.062102198600769, "logps/chosen": -0.932320237159729, "logps/rejected": -5.111158847808838, "loss": 0.9464, "odds_ratio_loss": 0.14102238416671753, "rewards/accuracies": 1.0, "rewards/chosen": -0.09323202073574066, "rewards/margins": 0.41788387298583984, "rewards/rejected": -0.5111159086227417, "sft_loss": 0.932320237159729, "step": 10710 }, { "epoch": 0.83, "grad_norm": 22.662845611572266, "learning_rate": 6.786225487011161e-07, "logits/chosen": -1.292526125907898, "logits/rejected": -0.7720105648040771, "logps/chosen": -0.8446162343025208, "logps/rejected": -7.427974700927734, "loss": 0.8559, "odds_ratio_loss": 0.11322052776813507, "rewards/accuracies": 1.0, "rewards/chosen": -0.08446161448955536, "rewards/margins": 0.65833580493927, "rewards/rejected": -0.7427974939346313, "sft_loss": 0.8446162343025208, "step": 10715 }, { "epoch": 0.83, "grad_norm": 27.110597610473633, "learning_rate": 6.755284527587808e-07, "logits/chosen": -1.2771958112716675, "logits/rejected": -1.2682626247406006, "logps/chosen": -0.8162559270858765, "logps/rejected": -3.1453375816345215, "loss": 0.8354, "odds_ratio_loss": 0.19193263351917267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08162559568881989, "rewards/margins": 0.23290812969207764, "rewards/rejected": -0.3145337700843811, "sft_loss": 0.8162559270858765, "step": 10720 }, { "epoch": 0.83, "grad_norm": 17.56296730041504, "learning_rate": 6.724409154266015e-07, "logits/chosen": -1.3354074954986572, "logits/rejected": -1.6221174001693726, "logps/chosen": -0.7041760683059692, "logps/rejected": -5.976828098297119, "loss": 0.7242, "odds_ratio_loss": 0.19975678622722626, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07041759788990021, "rewards/margins": 0.5272652506828308, "rewards/rejected": -0.5976828336715698, "sft_loss": 0.7041760683059692, "step": 10725 }, { "epoch": 0.83, "grad_norm": 23.91651725769043, "learning_rate": 6.693599413872237e-07, "logits/chosen": -1.4807329177856445, "logits/rejected": -1.0074899196624756, "logps/chosen": -0.9828524589538574, "logps/rejected": -3.0085151195526123, "loss": 0.9976, "odds_ratio_loss": 0.14699013531208038, "rewards/accuracies": 1.0, "rewards/chosen": -0.0982852429151535, "rewards/margins": 0.2025662660598755, "rewards/rejected": -0.3008515238761902, "sft_loss": 0.9828524589538574, "step": 10730 }, { "epoch": 0.84, "grad_norm": 15.844127655029297, "learning_rate": 6.662855353133347e-07, "logits/chosen": -1.4604413509368896, "logits/rejected": -1.3962243795394897, "logps/chosen": -1.2593581676483154, "logps/rejected": -2.065171003341675, "loss": 1.3106, "odds_ratio_loss": 0.5126217603683472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12593582272529602, "rewards/margins": 0.08058128505945206, "rewards/rejected": -0.20651713013648987, "sft_loss": 1.2593581676483154, "step": 10735 }, { "epoch": 0.84, "grad_norm": 8.233352661132812, "learning_rate": 6.632177018676605e-07, "logits/chosen": -1.3083667755126953, "logits/rejected": -1.0775474309921265, "logps/chosen": -1.0371109247207642, "logps/rejected": -3.8532378673553467, "loss": 1.0483, "odds_ratio_loss": 0.11168348789215088, "rewards/accuracies": 1.0, "rewards/chosen": -0.1037110835313797, "rewards/margins": 0.28161272406578064, "rewards/rejected": -0.38532382249832153, "sft_loss": 1.0371109247207642, "step": 10740 }, { "epoch": 0.84, "grad_norm": 4.413276195526123, "learning_rate": 6.601564457029597e-07, "logits/chosen": -1.3865123987197876, "logits/rejected": -1.3351116180419922, "logps/chosen": -0.9906963109970093, "logps/rejected": -7.515824794769287, "loss": 1.036, "odds_ratio_loss": 0.4525395333766937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09906964004039764, "rewards/margins": 0.6525128483772278, "rewards/rejected": -0.7515825033187866, "sft_loss": 0.9906963109970093, "step": 10745 }, { "epoch": 0.84, "grad_norm": 5.07670259475708, "learning_rate": 6.571017714620187e-07, "logits/chosen": -1.4065673351287842, "logits/rejected": -0.9796167612075806, "logps/chosen": -0.8060011863708496, "logps/rejected": -3.166170597076416, "loss": 0.8262, "odds_ratio_loss": 0.20244893431663513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08060012757778168, "rewards/margins": 0.23601694405078888, "rewards/rejected": -0.31661707162857056, "sft_loss": 0.8060011863708496, "step": 10750 }, { "epoch": 0.84, "grad_norm": 219.17034912109375, "learning_rate": 6.540536837776367e-07, "logits/chosen": -1.2626028060913086, "logits/rejected": -1.4447038173675537, "logps/chosen": -0.995640754699707, "logps/rejected": -6.835412502288818, "loss": 1.008, "odds_ratio_loss": 0.12361223995685577, "rewards/accuracies": 1.0, "rewards/chosen": -0.0995640829205513, "rewards/margins": 0.583977222442627, "rewards/rejected": -0.6835412979125977, "sft_loss": 0.995640754699707, "step": 10755 }, { "epoch": 0.84, "grad_norm": 6.518074989318848, "learning_rate": 6.510121872726249e-07, "logits/chosen": -1.3091720342636108, "logits/rejected": -0.8189018368721008, "logps/chosen": -1.0482852458953857, "logps/rejected": -8.858360290527344, "loss": 1.0521, "odds_ratio_loss": 0.03843696787953377, "rewards/accuracies": 1.0, "rewards/chosen": -0.10482852160930634, "rewards/margins": 0.7810075283050537, "rewards/rejected": -0.8858360052108765, "sft_loss": 1.0482852458953857, "step": 10760 }, { "epoch": 0.84, "grad_norm": 26.186777114868164, "learning_rate": 6.479772865598016e-07, "logits/chosen": -1.4154765605926514, "logits/rejected": -1.3204116821289062, "logps/chosen": -1.1116999387741089, "logps/rejected": -7.335646629333496, "loss": 1.1134, "odds_ratio_loss": 0.01742752455174923, "rewards/accuracies": 1.0, "rewards/chosen": -0.11117000877857208, "rewards/margins": 0.6223946809768677, "rewards/rejected": -0.7335646748542786, "sft_loss": 1.1116999387741089, "step": 10765 }, { "epoch": 0.84, "grad_norm": 8.995668411254883, "learning_rate": 6.449489862419772e-07, "logits/chosen": -1.2886964082717896, "logits/rejected": -1.0613796710968018, "logps/chosen": -1.0305818319320679, "logps/rejected": -4.333034515380859, "loss": 1.1145, "odds_ratio_loss": 0.8395366668701172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10305819660425186, "rewards/margins": 0.33024531602859497, "rewards/rejected": -0.43330350518226624, "sft_loss": 1.0305818319320679, "step": 10770 }, { "epoch": 0.84, "grad_norm": 20.093276977539062, "learning_rate": 6.419272909119539e-07, "logits/chosen": -1.175619125366211, "logits/rejected": -1.290102243423462, "logps/chosen": -1.2042921781539917, "logps/rejected": -7.513049125671387, "loss": 1.2206, "odds_ratio_loss": 0.1633186638355255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12042923271656036, "rewards/margins": 0.6308757066726685, "rewards/rejected": -0.7513049244880676, "sft_loss": 1.2042921781539917, "step": 10775 }, { "epoch": 0.84, "grad_norm": 38.16044235229492, "learning_rate": 6.38912205152517e-07, "logits/chosen": -1.269337773323059, "logits/rejected": -0.9223520159721375, "logps/chosen": -0.87409907579422, "logps/rejected": -5.193105697631836, "loss": 0.8866, "odds_ratio_loss": 0.1254083812236786, "rewards/accuracies": 1.0, "rewards/chosen": -0.08740990608930588, "rewards/margins": 0.43190065026283264, "rewards/rejected": -0.5193105936050415, "sft_loss": 0.87409907579422, "step": 10780 }, { "epoch": 0.84, "grad_norm": 4.398747444152832, "learning_rate": 6.35903733536426e-07, "logits/chosen": -1.3488149642944336, "logits/rejected": -0.831713080406189, "logps/chosen": -0.7239896655082703, "logps/rejected": -5.531470775604248, "loss": 0.7265, "odds_ratio_loss": 0.0255146324634552, "rewards/accuracies": 1.0, "rewards/chosen": -0.07239897549152374, "rewards/margins": 0.48074811697006226, "rewards/rejected": -0.5531471371650696, "sft_loss": 0.7239896655082703, "step": 10785 }, { "epoch": 0.84, "grad_norm": 18.752788543701172, "learning_rate": 6.329018806264092e-07, "logits/chosen": -1.303001046180725, "logits/rejected": -1.189507246017456, "logps/chosen": -0.9213204383850098, "logps/rejected": -6.262149810791016, "loss": 0.932, "odds_ratio_loss": 0.1070183515548706, "rewards/accuracies": 1.0, "rewards/chosen": -0.09213204681873322, "rewards/margins": 0.5340828895568848, "rewards/rejected": -0.6262149810791016, "sft_loss": 0.9213204383850098, "step": 10790 }, { "epoch": 0.84, "grad_norm": 6.965780735015869, "learning_rate": 6.299066509751595e-07, "logits/chosen": -1.4093542098999023, "logits/rejected": -1.1180508136749268, "logps/chosen": -1.06251060962677, "logps/rejected": -10.950661659240723, "loss": 1.0841, "odds_ratio_loss": 0.21567395329475403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.106251060962677, "rewards/margins": 0.9888151288032532, "rewards/rejected": -1.0950661897659302, "sft_loss": 1.06251060962677, "step": 10795 }, { "epoch": 0.84, "grad_norm": 4.436639308929443, "learning_rate": 6.26918049125323e-07, "logits/chosen": -1.2421128749847412, "logits/rejected": -1.2047935724258423, "logps/chosen": -1.0232081413269043, "logps/rejected": -10.297616004943848, "loss": 1.0383, "odds_ratio_loss": 0.15066194534301758, "rewards/accuracies": 1.0, "rewards/chosen": -0.10232081264257431, "rewards/margins": 0.9274408221244812, "rewards/rejected": -1.029761552810669, "sft_loss": 1.0232081413269043, "step": 10800 }, { "epoch": 0.84, "grad_norm": 6.202176570892334, "learning_rate": 6.239360796094923e-07, "logits/chosen": -1.4031856060028076, "logits/rejected": -1.1629369258880615, "logps/chosen": -0.8468238711357117, "logps/rejected": -2.1136059761047363, "loss": 0.8931, "odds_ratio_loss": 0.4629918932914734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0846823900938034, "rewards/margins": 0.1266781985759735, "rewards/rejected": -0.2113606035709381, "sft_loss": 0.8468238711357117, "step": 10805 }, { "epoch": 0.84, "grad_norm": 10.24286937713623, "learning_rate": 6.209607469502032e-07, "logits/chosen": -1.394173264503479, "logits/rejected": -1.002078652381897, "logps/chosen": -0.8187114596366882, "logps/rejected": -9.146513938903809, "loss": 0.8499, "odds_ratio_loss": 0.312236487865448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0818711444735527, "rewards/margins": 0.8327803611755371, "rewards/rejected": -0.9146515130996704, "sft_loss": 0.8187114596366882, "step": 10810 }, { "epoch": 0.84, "grad_norm": 104.1180191040039, "learning_rate": 6.179920556599267e-07, "logits/chosen": -1.2264460325241089, "logits/rejected": -1.3998425006866455, "logps/chosen": -1.2771530151367188, "logps/rejected": -10.008929252624512, "loss": 1.3034, "odds_ratio_loss": 0.2629183828830719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1277153044939041, "rewards/margins": 0.8731776475906372, "rewards/rejected": -1.0008928775787354, "sft_loss": 1.2771530151367188, "step": 10815 }, { "epoch": 0.84, "grad_norm": 6.116608619689941, "learning_rate": 6.150300102410589e-07, "logits/chosen": -1.199758768081665, "logits/rejected": -1.174690842628479, "logps/chosen": -1.4607865810394287, "logps/rejected": -6.701357364654541, "loss": 1.4728, "odds_ratio_loss": 0.12056130170822144, "rewards/accuracies": 1.0, "rewards/chosen": -0.1460786759853363, "rewards/margins": 0.5240570306777954, "rewards/rejected": -0.6701357364654541, "sft_loss": 1.4607865810394287, "step": 10820 }, { "epoch": 0.84, "grad_norm": 13.097474098205566, "learning_rate": 6.120746151859186e-07, "logits/chosen": -1.4636832475662231, "logits/rejected": -1.1174657344818115, "logps/chosen": -0.9332340955734253, "logps/rejected": -3.634037494659424, "loss": 0.9573, "odds_ratio_loss": 0.24098041653633118, "rewards/accuracies": 1.0, "rewards/chosen": -0.09332340210676193, "rewards/margins": 0.2700802981853485, "rewards/rejected": -0.36340370774269104, "sft_loss": 0.9332340955734253, "step": 10825 }, { "epoch": 0.84, "grad_norm": 4.853806972503662, "learning_rate": 6.091258749767365e-07, "logits/chosen": -1.1578795909881592, "logits/rejected": -1.2788320779800415, "logps/chosen": -0.7345417737960815, "logps/rejected": -15.559396743774414, "loss": 0.7417, "odds_ratio_loss": 0.07208183407783508, "rewards/accuracies": 1.0, "rewards/chosen": -0.07345417886972427, "rewards/margins": 1.4824855327606201, "rewards/rejected": -1.5559396743774414, "sft_loss": 0.7345417737960815, "step": 10830 }, { "epoch": 0.84, "grad_norm": 8.429411888122559, "learning_rate": 6.061837940856524e-07, "logits/chosen": -1.3931564092636108, "logits/rejected": -0.8877042531967163, "logps/chosen": -1.1764130592346191, "logps/rejected": -5.6686506271362305, "loss": 1.2116, "odds_ratio_loss": 0.35157907009124756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11764131486415863, "rewards/margins": 0.44922375679016113, "rewards/rejected": -0.566865086555481, "sft_loss": 1.1764130592346191, "step": 10835 }, { "epoch": 0.84, "grad_norm": 12.441805839538574, "learning_rate": 6.032483769747044e-07, "logits/chosen": -1.273625135421753, "logits/rejected": -1.008589506149292, "logps/chosen": -0.9708470106124878, "logps/rejected": -8.170413970947266, "loss": 0.9988, "odds_ratio_loss": 0.279986172914505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09708471596240997, "rewards/margins": 0.7199567556381226, "rewards/rejected": -0.8170413970947266, "sft_loss": 0.9708470106124878, "step": 10840 }, { "epoch": 0.84, "grad_norm": 5.836912155151367, "learning_rate": 6.003196280958268e-07, "logits/chosen": -1.370110273361206, "logits/rejected": -0.9305510520935059, "logps/chosen": -0.7876640558242798, "logps/rejected": -2.4059929847717285, "loss": 0.8388, "odds_ratio_loss": 0.5111384391784668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07876641303300858, "rewards/margins": 0.16183289885520935, "rewards/rejected": -0.24059931933879852, "sft_loss": 0.7876640558242798, "step": 10845 }, { "epoch": 0.84, "grad_norm": 5.9041948318481445, "learning_rate": 5.973975518908381e-07, "logits/chosen": -1.2838035821914673, "logits/rejected": -0.8337491750717163, "logps/chosen": -0.9224063754081726, "logps/rejected": -6.886902809143066, "loss": 0.937, "odds_ratio_loss": 0.14572349190711975, "rewards/accuracies": 1.0, "rewards/chosen": -0.09224063903093338, "rewards/margins": 0.5964496731758118, "rewards/rejected": -0.6886903047561646, "sft_loss": 0.9224063754081726, "step": 10850 }, { "epoch": 0.84, "grad_norm": 6.783423900604248, "learning_rate": 5.94482152791438e-07, "logits/chosen": -1.383037805557251, "logits/rejected": -1.1398518085479736, "logps/chosen": -1.2712384462356567, "logps/rejected": -8.102948188781738, "loss": 1.283, "odds_ratio_loss": 0.11787731945514679, "rewards/accuracies": 1.0, "rewards/chosen": -0.1271238625049591, "rewards/margins": 0.6831710338592529, "rewards/rejected": -0.8102949261665344, "sft_loss": 1.2712384462356567, "step": 10855 }, { "epoch": 0.84, "grad_norm": 20.92612648010254, "learning_rate": 5.915734352191998e-07, "logits/chosen": -1.3900169134140015, "logits/rejected": -1.3074524402618408, "logps/chosen": -0.7815436720848083, "logps/rejected": -8.142717361450195, "loss": 0.7898, "odds_ratio_loss": 0.08238000422716141, "rewards/accuracies": 1.0, "rewards/chosen": -0.07815437018871307, "rewards/margins": 0.736117422580719, "rewards/rejected": -0.8142718076705933, "sft_loss": 0.7815436720848083, "step": 10860 }, { "epoch": 0.85, "grad_norm": 4.04435396194458, "learning_rate": 5.886714035855629e-07, "logits/chosen": -1.300445318222046, "logits/rejected": -0.8074096441268921, "logps/chosen": -0.7334375381469727, "logps/rejected": -8.39263916015625, "loss": 0.7431, "odds_ratio_loss": 0.09648707509040833, "rewards/accuracies": 1.0, "rewards/chosen": -0.07334376126527786, "rewards/margins": 0.7659201622009277, "rewards/rejected": -0.839263916015625, "sft_loss": 0.7334375381469727, "step": 10865 }, { "epoch": 0.85, "grad_norm": 5.290650844573975, "learning_rate": 5.857760622918263e-07, "logits/chosen": -1.2743334770202637, "logits/rejected": -0.46047696471214294, "logps/chosen": -0.8884002566337585, "logps/rejected": -4.124817848205566, "loss": 0.9124, "odds_ratio_loss": 0.23984280228614807, "rewards/accuracies": 1.0, "rewards/chosen": -0.08884003013372421, "rewards/margins": 0.32364171743392944, "rewards/rejected": -0.41248178482055664, "sft_loss": 0.8884002566337585, "step": 10870 }, { "epoch": 0.85, "grad_norm": 8.239819526672363, "learning_rate": 5.828874157291425e-07, "logits/chosen": -1.1602166891098022, "logits/rejected": -1.1731626987457275, "logps/chosen": -0.8925191164016724, "logps/rejected": -5.278420448303223, "loss": 0.9066, "odds_ratio_loss": 0.14100593328475952, "rewards/accuracies": 1.0, "rewards/chosen": -0.08925192058086395, "rewards/margins": 0.4385901093482971, "rewards/rejected": -0.5278420448303223, "sft_loss": 0.8925191164016724, "step": 10875 }, { "epoch": 0.85, "grad_norm": 8.573440551757812, "learning_rate": 5.800054682785117e-07, "logits/chosen": -1.296282410621643, "logits/rejected": -0.8988644480705261, "logps/chosen": -0.9847795367240906, "logps/rejected": -5.011839389801025, "loss": 1.0078, "odds_ratio_loss": 0.22980418801307678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09847795963287354, "rewards/margins": 0.40270599722862244, "rewards/rejected": -0.5011839866638184, "sft_loss": 0.9847795367240906, "step": 10880 }, { "epoch": 0.85, "grad_norm": 23.42486572265625, "learning_rate": 5.771302243107729e-07, "logits/chosen": -1.478144884109497, "logits/rejected": -1.2906591892242432, "logps/chosen": -0.8750435709953308, "logps/rejected": -2.1842732429504395, "loss": 0.9128, "odds_ratio_loss": 0.377769410610199, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08750435709953308, "rewards/margins": 0.13092297315597534, "rewards/rejected": -0.21842734515666962, "sft_loss": 0.8750435709953308, "step": 10885 }, { "epoch": 0.85, "grad_norm": 8.602510452270508, "learning_rate": 5.742616881865981e-07, "logits/chosen": -1.3302183151245117, "logits/rejected": -0.9966436624526978, "logps/chosen": -1.0099607706069946, "logps/rejected": -7.133604526519775, "loss": 1.0168, "odds_ratio_loss": 0.06856737285852432, "rewards/accuracies": 1.0, "rewards/chosen": -0.10099606215953827, "rewards/margins": 0.6123644113540649, "rewards/rejected": -0.7133604288101196, "sft_loss": 1.0099607706069946, "step": 10890 }, { "epoch": 0.85, "grad_norm": 11.181099891662598, "learning_rate": 5.713998642564872e-07, "logits/chosen": -1.4327882528305054, "logits/rejected": -0.9655070304870605, "logps/chosen": -0.8675923347473145, "logps/rejected": -2.817263126373291, "loss": 0.8961, "odds_ratio_loss": 0.285393625497818, "rewards/accuracies": 1.0, "rewards/chosen": -0.08675923198461533, "rewards/margins": 0.19496707618236542, "rewards/rejected": -0.28172630071640015, "sft_loss": 0.8675923347473145, "step": 10895 }, { "epoch": 0.85, "grad_norm": 7.003217697143555, "learning_rate": 5.685447568607589e-07, "logits/chosen": -1.1521581411361694, "logits/rejected": -1.2851454019546509, "logps/chosen": -0.9078611135482788, "logps/rejected": -8.256436347961426, "loss": 0.9093, "odds_ratio_loss": 0.014228816144168377, "rewards/accuracies": 1.0, "rewards/chosen": -0.09078611433506012, "rewards/margins": 0.7348575592041016, "rewards/rejected": -0.8256436586380005, "sft_loss": 0.9078611135482788, "step": 10900 }, { "epoch": 0.85, "grad_norm": 293.4844055175781, "learning_rate": 5.656963703295454e-07, "logits/chosen": -1.4213796854019165, "logits/rejected": -0.9011304974555969, "logps/chosen": -1.1318405866622925, "logps/rejected": -9.844264030456543, "loss": 1.1469, "odds_ratio_loss": 0.1509471833705902, "rewards/accuracies": 1.0, "rewards/chosen": -0.11318407207727432, "rewards/margins": 0.871242344379425, "rewards/rejected": -0.9844264984130859, "sft_loss": 1.1318405866622925, "step": 10905 }, { "epoch": 0.85, "grad_norm": 7.686473846435547, "learning_rate": 5.628547089827885e-07, "logits/chosen": -1.1069039106369019, "logits/rejected": -1.015209674835205, "logps/chosen": -1.0385732650756836, "logps/rejected": -5.666954517364502, "loss": 1.0521, "odds_ratio_loss": 0.1351829469203949, "rewards/accuracies": 1.0, "rewards/chosen": -0.10385732352733612, "rewards/margins": 0.4628380835056305, "rewards/rejected": -0.5666954517364502, "sft_loss": 1.0385732650756836, "step": 10910 }, { "epoch": 0.85, "grad_norm": 20.32103729248047, "learning_rate": 5.600197771302274e-07, "logits/chosen": -1.2023155689239502, "logits/rejected": -0.7322795987129211, "logps/chosen": -0.8296264410018921, "logps/rejected": -1.8694965839385986, "loss": 0.857, "odds_ratio_loss": 0.2740650773048401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08296265453100204, "rewards/margins": 0.10398700088262558, "rewards/rejected": -0.18694964051246643, "sft_loss": 0.8296264410018921, "step": 10915 }, { "epoch": 0.85, "grad_norm": 13.354987144470215, "learning_rate": 5.571915790713944e-07, "logits/chosen": -1.400775671005249, "logits/rejected": -1.0198614597320557, "logps/chosen": -0.9796463251113892, "logps/rejected": -3.354994535446167, "loss": 1.0359, "odds_ratio_loss": 0.5625422596931458, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09796462953090668, "rewards/margins": 0.23753483593463898, "rewards/rejected": -0.33549946546554565, "sft_loss": 0.9796463251113892, "step": 10920 }, { "epoch": 0.85, "grad_norm": 6.634857177734375, "learning_rate": 5.543701190956146e-07, "logits/chosen": -1.442639708518982, "logits/rejected": -1.0874769687652588, "logps/chosen": -1.7671304941177368, "logps/rejected": -5.139246940612793, "loss": 1.8229, "odds_ratio_loss": 0.55790114402771, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17671306431293488, "rewards/margins": 0.33721163868904114, "rewards/rejected": -0.5139247179031372, "sft_loss": 1.7671304941177368, "step": 10925 }, { "epoch": 0.85, "grad_norm": 13.042349815368652, "learning_rate": 5.515554014819879e-07, "logits/chosen": -1.3886566162109375, "logits/rejected": -0.9926946759223938, "logps/chosen": -0.8931914567947388, "logps/rejected": -2.3885300159454346, "loss": 0.9148, "odds_ratio_loss": 0.21583838760852814, "rewards/accuracies": 1.0, "rewards/chosen": -0.08931914716959, "rewards/margins": 0.14953383803367615, "rewards/rejected": -0.23885297775268555, "sft_loss": 0.8931914567947388, "step": 10930 }, { "epoch": 0.85, "grad_norm": 11.30074405670166, "learning_rate": 5.487474304993912e-07, "logits/chosen": -1.3836137056350708, "logits/rejected": -1.1608842611312866, "logps/chosen": -1.0646214485168457, "logps/rejected": -4.637225151062012, "loss": 1.0853, "odds_ratio_loss": 0.20709529519081116, "rewards/accuracies": 1.0, "rewards/chosen": -0.10646214336156845, "rewards/margins": 0.35726040601730347, "rewards/rejected": -0.4637225270271301, "sft_loss": 1.0646214485168457, "step": 10935 }, { "epoch": 0.85, "grad_norm": 36.60148620605469, "learning_rate": 5.459462104064695e-07, "logits/chosen": -1.4806654453277588, "logits/rejected": -1.1578395366668701, "logps/chosen": -1.047041654586792, "logps/rejected": -3.035254955291748, "loss": 1.0983, "odds_ratio_loss": 0.5123556852340698, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10470416396856308, "rewards/margins": 0.19882136583328247, "rewards/rejected": -0.30352550745010376, "sft_loss": 1.047041654586792, "step": 10940 }, { "epoch": 0.85, "grad_norm": 17.35590171813965, "learning_rate": 5.431517454516282e-07, "logits/chosen": -1.360613465309143, "logits/rejected": -1.1922509670257568, "logps/chosen": -0.706468403339386, "logps/rejected": -5.037663459777832, "loss": 0.7226, "odds_ratio_loss": 0.161125048995018, "rewards/accuracies": 1.0, "rewards/chosen": -0.07064683735370636, "rewards/margins": 0.433119535446167, "rewards/rejected": -0.5037663578987122, "sft_loss": 0.706468403339386, "step": 10945 }, { "epoch": 0.85, "grad_norm": 14.495502471923828, "learning_rate": 5.403640398730286e-07, "logits/chosen": -1.2188652753829956, "logits/rejected": -1.199636697769165, "logps/chosen": -1.0238749980926514, "logps/rejected": -5.018639087677002, "loss": 1.0654, "odds_ratio_loss": 0.4154825806617737, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10238751024007797, "rewards/margins": 0.3994763493537903, "rewards/rejected": -0.5018638372421265, "sft_loss": 1.0238749980926514, "step": 10950 }, { "epoch": 0.85, "grad_norm": 13.935104370117188, "learning_rate": 5.375830978985791e-07, "logits/chosen": -1.3460266590118408, "logits/rejected": -1.4006439447402954, "logps/chosen": -1.05868399143219, "logps/rejected": -16.683874130249023, "loss": 1.0589, "odds_ratio_loss": 0.0025215111672878265, "rewards/accuracies": 1.0, "rewards/chosen": -0.10586841404438019, "rewards/margins": 1.5625190734863281, "rewards/rejected": -1.6683876514434814, "sft_loss": 1.05868399143219, "step": 10955 }, { "epoch": 0.85, "grad_norm": 7.0826239585876465, "learning_rate": 5.34808923745933e-07, "logits/chosen": -1.3722484111785889, "logits/rejected": -1.1384642124176025, "logps/chosen": -1.3418304920196533, "logps/rejected": -8.644782066345215, "loss": 1.3495, "odds_ratio_loss": 0.07701762765645981, "rewards/accuracies": 1.0, "rewards/chosen": -0.13418304920196533, "rewards/margins": 0.7302952408790588, "rewards/rejected": -0.8644782304763794, "sft_loss": 1.3418304920196533, "step": 10960 }, { "epoch": 0.85, "grad_norm": 5.142551898956299, "learning_rate": 5.320415216224767e-07, "logits/chosen": -1.3383108377456665, "logits/rejected": -0.87104332447052, "logps/chosen": -0.9291396141052246, "logps/rejected": -6.74056339263916, "loss": 0.9505, "odds_ratio_loss": 0.21329009532928467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09291397035121918, "rewards/margins": 0.5811423063278198, "rewards/rejected": -0.674056351184845, "sft_loss": 0.9291396141052246, "step": 10965 }, { "epoch": 0.85, "grad_norm": 6.258543491363525, "learning_rate": 5.292808957253265e-07, "logits/chosen": -1.3454580307006836, "logits/rejected": -1.1424548625946045, "logps/chosen": -0.8718706965446472, "logps/rejected": -6.058313846588135, "loss": 0.878, "odds_ratio_loss": 0.06122884154319763, "rewards/accuracies": 1.0, "rewards/chosen": -0.08718707412481308, "rewards/margins": 0.5186443328857422, "rewards/rejected": -0.6058313846588135, "sft_loss": 0.8718706965446472, "step": 10970 }, { "epoch": 0.85, "grad_norm": 29.74590301513672, "learning_rate": 5.265270502413228e-07, "logits/chosen": -1.266903281211853, "logits/rejected": -1.1316707134246826, "logps/chosen": -0.8083049058914185, "logps/rejected": -6.998124122619629, "loss": 0.8188, "odds_ratio_loss": 0.10462143272161484, "rewards/accuracies": 1.0, "rewards/chosen": -0.08083049952983856, "rewards/margins": 0.6189819574356079, "rewards/rejected": -0.6998124122619629, "sft_loss": 0.8083049058914185, "step": 10975 }, { "epoch": 0.85, "grad_norm": 10.037148475646973, "learning_rate": 5.237799893470219e-07, "logits/chosen": -1.4532562494277954, "logits/rejected": -0.8590501546859741, "logps/chosen": -1.0203161239624023, "logps/rejected": -7.895735263824463, "loss": 1.0291, "odds_ratio_loss": 0.08833594620227814, "rewards/accuracies": 1.0, "rewards/chosen": -0.10203162580728531, "rewards/margins": 0.6875419020652771, "rewards/rejected": -0.7895735502243042, "sft_loss": 1.0203161239624023, "step": 10980 }, { "epoch": 0.85, "grad_norm": 73.04912567138672, "learning_rate": 5.210397172086906e-07, "logits/chosen": -1.3931853771209717, "logits/rejected": -1.0637333393096924, "logps/chosen": -1.1848360300064087, "logps/rejected": -6.328829765319824, "loss": 1.1876, "odds_ratio_loss": 0.027847150340676308, "rewards/accuracies": 1.0, "rewards/chosen": -0.11848358809947968, "rewards/margins": 0.5143994092941284, "rewards/rejected": -0.6328829526901245, "sft_loss": 1.1848360300064087, "step": 10985 }, { "epoch": 0.85, "grad_norm": 16.441225051879883, "learning_rate": 5.183062379822978e-07, "logits/chosen": -1.3455500602722168, "logits/rejected": -1.5203089714050293, "logps/chosen": -0.9536484479904175, "logps/rejected": -9.948080062866211, "loss": 0.954, "odds_ratio_loss": 0.003847536165267229, "rewards/accuracies": 1.0, "rewards/chosen": -0.09536485373973846, "rewards/margins": 0.8994432687759399, "rewards/rejected": -0.9948080778121948, "sft_loss": 0.9536484479904175, "step": 10990 }, { "epoch": 0.86, "grad_norm": 6.653332233428955, "learning_rate": 5.155795558135141e-07, "logits/chosen": -1.4233778715133667, "logits/rejected": -0.9469043612480164, "logps/chosen": -1.0065068006515503, "logps/rejected": -5.959763050079346, "loss": 1.009, "odds_ratio_loss": 0.024972127750515938, "rewards/accuracies": 1.0, "rewards/chosen": -0.10065068304538727, "rewards/margins": 0.49532565474510193, "rewards/rejected": -0.5959763526916504, "sft_loss": 1.0065068006515503, "step": 10995 }, { "epoch": 0.86, "grad_norm": 173.84759521484375, "learning_rate": 5.128596748376979e-07, "logits/chosen": -1.4137773513793945, "logits/rejected": -0.8344324231147766, "logps/chosen": -0.9149678945541382, "logps/rejected": -7.3621320724487305, "loss": 0.9347, "odds_ratio_loss": 0.1972806602716446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09149680286645889, "rewards/margins": 0.6447164416313171, "rewards/rejected": -0.7362133264541626, "sft_loss": 0.9149678945541382, "step": 11000 }, { "epoch": 0.86, "grad_norm": 5.335231304168701, "learning_rate": 5.101465991798948e-07, "logits/chosen": -1.3805840015411377, "logits/rejected": -1.3037879467010498, "logps/chosen": -1.1943247318267822, "logps/rejected": -7.626928806304932, "loss": 1.2359, "odds_ratio_loss": 0.41593390703201294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1194324716925621, "rewards/margins": 0.6432604193687439, "rewards/rejected": -0.7626928091049194, "sft_loss": 1.1943247318267822, "step": 11005 }, { "epoch": 0.86, "grad_norm": 9.778425216674805, "learning_rate": 5.074403329548277e-07, "logits/chosen": -1.3550758361816406, "logits/rejected": -0.9329360723495483, "logps/chosen": -1.3633568286895752, "logps/rejected": -3.0032997131347656, "loss": 1.3902, "odds_ratio_loss": 0.2689247131347656, "rewards/accuracies": 1.0, "rewards/chosen": -0.13633567094802856, "rewards/margins": 0.16399429738521576, "rewards/rejected": -0.3003299832344055, "sft_loss": 1.3633568286895752, "step": 11010 }, { "epoch": 0.86, "grad_norm": 5.133905410766602, "learning_rate": 5.047408802668935e-07, "logits/chosen": -1.2931458950042725, "logits/rejected": -1.0412628650665283, "logps/chosen": -1.0921659469604492, "logps/rejected": -8.542851448059082, "loss": 1.116, "odds_ratio_loss": 0.2381249964237213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10921660810709, "rewards/margins": 0.7450685501098633, "rewards/rejected": -0.8542851209640503, "sft_loss": 1.0921659469604492, "step": 11015 }, { "epoch": 0.86, "grad_norm": 19.411243438720703, "learning_rate": 5.020482452101539e-07, "logits/chosen": -1.3749643564224243, "logits/rejected": -0.825789749622345, "logps/chosen": -0.9931381940841675, "logps/rejected": -13.348337173461914, "loss": 1.0066, "odds_ratio_loss": 0.13428032398223877, "rewards/accuracies": 1.0, "rewards/chosen": -0.09931383281946182, "rewards/margins": 1.2355200052261353, "rewards/rejected": -1.3348338603973389, "sft_loss": 0.9931381940841675, "step": 11020 }, { "epoch": 0.86, "grad_norm": 10.71738052368164, "learning_rate": 4.993624318683332e-07, "logits/chosen": -1.4427053928375244, "logits/rejected": -0.8330078125, "logps/chosen": -1.0185954570770264, "logps/rejected": -4.659827709197998, "loss": 1.0898, "odds_ratio_loss": 0.7123147249221802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10185954719781876, "rewards/margins": 0.3641231954097748, "rewards/rejected": -0.4659827649593353, "sft_loss": 1.0185954570770264, "step": 11025 }, { "epoch": 0.86, "grad_norm": 30.706079483032227, "learning_rate": 4.966834443148078e-07, "logits/chosen": -1.3195933103561401, "logits/rejected": -0.9074063301086426, "logps/chosen": -1.1589607000350952, "logps/rejected": -3.9544196128845215, "loss": 1.1997, "odds_ratio_loss": 0.4074534475803375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1158960685133934, "rewards/margins": 0.2795459032058716, "rewards/rejected": -0.3954419791698456, "sft_loss": 1.1589607000350952, "step": 11030 }, { "epoch": 0.86, "grad_norm": 58.909297943115234, "learning_rate": 4.940112866126018e-07, "logits/chosen": -1.4745378494262695, "logits/rejected": -1.5084199905395508, "logps/chosen": -1.2225674390792847, "logps/rejected": -6.94110631942749, "loss": 1.2467, "odds_ratio_loss": 0.24120387434959412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12225675582885742, "rewards/margins": 0.5718539357185364, "rewards/rejected": -0.6941106915473938, "sft_loss": 1.2225674390792847, "step": 11035 }, { "epoch": 0.86, "grad_norm": 4.487308025360107, "learning_rate": 4.913459628143829e-07, "logits/chosen": -1.391526222229004, "logits/rejected": -0.8932541608810425, "logps/chosen": -1.1362192630767822, "logps/rejected": -8.21275806427002, "loss": 1.1451, "odds_ratio_loss": 0.0883936733007431, "rewards/accuracies": 1.0, "rewards/chosen": -0.11362192779779434, "rewards/margins": 0.7076539397239685, "rewards/rejected": -0.8212758302688599, "sft_loss": 1.1362192630767822, "step": 11040 }, { "epoch": 0.86, "grad_norm": 23.82204246520996, "learning_rate": 4.886874769624528e-07, "logits/chosen": -1.4121615886688232, "logits/rejected": -0.8175728917121887, "logps/chosen": -0.7233660221099854, "logps/rejected": -3.2058944702148438, "loss": 0.7563, "odds_ratio_loss": 0.32946401834487915, "rewards/accuracies": 1.0, "rewards/chosen": -0.0723365992307663, "rewards/margins": 0.24825282394886017, "rewards/rejected": -0.32058948278427124, "sft_loss": 0.7233660221099854, "step": 11045 }, { "epoch": 0.86, "grad_norm": 7.111661911010742, "learning_rate": 4.860358330887421e-07, "logits/chosen": -1.4883744716644287, "logits/rejected": -1.2798588275909424, "logps/chosen": -1.2233302593231201, "logps/rejected": -7.7917656898498535, "loss": 1.2635, "odds_ratio_loss": 0.4016781449317932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12233302742242813, "rewards/margins": 0.6568435430526733, "rewards/rejected": -0.7791765928268433, "sft_loss": 1.2233302593231201, "step": 11050 }, { "epoch": 0.86, "grad_norm": 18.886579513549805, "learning_rate": 4.833910352148057e-07, "logits/chosen": -1.3979682922363281, "logits/rejected": -1.0197150707244873, "logps/chosen": -1.0084316730499268, "logps/rejected": -6.580392360687256, "loss": 1.0177, "odds_ratio_loss": 0.09293156862258911, "rewards/accuracies": 1.0, "rewards/chosen": -0.10084317624568939, "rewards/margins": 0.5571960806846619, "rewards/rejected": -0.6580392718315125, "sft_loss": 1.0084316730499268, "step": 11055 }, { "epoch": 0.86, "grad_norm": 401.8313293457031, "learning_rate": 4.807530873518157e-07, "logits/chosen": -1.48651921749115, "logits/rejected": -1.1749073266983032, "logps/chosen": -1.508371353149414, "logps/rejected": -16.79873275756836, "loss": 1.5167, "odds_ratio_loss": 0.0837855190038681, "rewards/accuracies": 1.0, "rewards/chosen": -0.15083713829517365, "rewards/margins": 1.5290361642837524, "rewards/rejected": -1.6798732280731201, "sft_loss": 1.508371353149414, "step": 11060 }, { "epoch": 0.86, "grad_norm": 10.496895790100098, "learning_rate": 4.781219935005548e-07, "logits/chosen": -1.273923635482788, "logits/rejected": -1.082719087600708, "logps/chosen": -0.9612051844596863, "logps/rejected": -9.734685897827148, "loss": 0.963, "odds_ratio_loss": 0.017861105501651764, "rewards/accuracies": 1.0, "rewards/chosen": -0.09612051397562027, "rewards/margins": 0.8773480653762817, "rewards/rejected": -0.9734686017036438, "sft_loss": 0.9612051844596863, "step": 11065 }, { "epoch": 0.86, "grad_norm": 79.49195098876953, "learning_rate": 4.754977576514097e-07, "logits/chosen": -1.2368603944778442, "logits/rejected": -0.8288145065307617, "logps/chosen": -1.0270787477493286, "logps/rejected": -4.39780855178833, "loss": 1.0337, "odds_ratio_loss": 0.06579282134771347, "rewards/accuracies": 1.0, "rewards/chosen": -0.1027078777551651, "rewards/margins": 0.3370729982852936, "rewards/rejected": -0.4397808611392975, "sft_loss": 1.0270787477493286, "step": 11070 }, { "epoch": 0.86, "grad_norm": 11.613289833068848, "learning_rate": 4.7288038378436876e-07, "logits/chosen": -1.3377621173858643, "logits/rejected": -0.9755362272262573, "logps/chosen": -1.0771583318710327, "logps/rejected": -3.616690158843994, "loss": 1.1031, "odds_ratio_loss": 0.2592793107032776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10771583020687103, "rewards/margins": 0.25395315885543823, "rewards/rejected": -0.36166900396347046, "sft_loss": 1.0771583318710327, "step": 11075 }, { "epoch": 0.86, "grad_norm": 28.060232162475586, "learning_rate": 4.702698758690116e-07, "logits/chosen": -1.3476572036743164, "logits/rejected": -1.550072431564331, "logps/chosen": -0.9806027412414551, "logps/rejected": -4.352713108062744, "loss": 0.9851, "odds_ratio_loss": 0.04496455565094948, "rewards/accuracies": 1.0, "rewards/chosen": -0.09806027263402939, "rewards/margins": 0.3372109830379486, "rewards/rejected": -0.4352712631225586, "sft_loss": 0.9806027412414551, "step": 11080 }, { "epoch": 0.86, "grad_norm": 242.48379516601562, "learning_rate": 4.676662378645042e-07, "logits/chosen": -1.3592426776885986, "logits/rejected": -1.5002645254135132, "logps/chosen": -1.485435962677002, "logps/rejected": -9.078450202941895, "loss": 1.5257, "odds_ratio_loss": 0.4027434289455414, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1485435962677002, "rewards/margins": 0.759301483631134, "rewards/rejected": -0.9078450202941895, "sft_loss": 1.485435962677002, "step": 11085 }, { "epoch": 0.86, "grad_norm": 23.109437942504883, "learning_rate": 4.650694737195949e-07, "logits/chosen": -1.4202067852020264, "logits/rejected": -1.1966689825057983, "logps/chosen": -0.6969403028488159, "logps/rejected": -7.00976037979126, "loss": 0.7074, "odds_ratio_loss": 0.10413110256195068, "rewards/accuracies": 1.0, "rewards/chosen": -0.06969402730464935, "rewards/margins": 0.6312819719314575, "rewards/rejected": -0.7009760141372681, "sft_loss": 0.6969403028488159, "step": 11090 }, { "epoch": 0.86, "grad_norm": 5.188475131988525, "learning_rate": 4.6247958737260623e-07, "logits/chosen": -1.4141209125518799, "logits/rejected": -1.0835773944854736, "logps/chosen": -0.8121021389961243, "logps/rejected": -14.08741283416748, "loss": 0.8122, "odds_ratio_loss": 0.0008768331026658416, "rewards/accuracies": 1.0, "rewards/chosen": -0.08121021836996078, "rewards/margins": 1.327531099319458, "rewards/rejected": -1.4087413549423218, "sft_loss": 0.8121021389961243, "step": 11095 }, { "epoch": 0.86, "grad_norm": 6.953471660614014, "learning_rate": 4.598965827514279e-07, "logits/chosen": -1.4587968587875366, "logits/rejected": -1.3610684871673584, "logps/chosen": -0.9946497082710266, "logps/rejected": -9.220718383789062, "loss": 1.01, "odds_ratio_loss": 0.15365691483020782, "rewards/accuracies": 1.0, "rewards/chosen": -0.09946496784687042, "rewards/margins": 0.8226070404052734, "rewards/rejected": -0.9220719337463379, "sft_loss": 0.9946497082710266, "step": 11100 }, { "epoch": 0.86, "grad_norm": 2.0856804847717285, "learning_rate": 4.573204637735174e-07, "logits/chosen": -1.212342619895935, "logits/rejected": -0.8433161973953247, "logps/chosen": -0.6405612826347351, "logps/rejected": -5.117362022399902, "loss": 0.6488, "odds_ratio_loss": 0.08268775045871735, "rewards/accuracies": 1.0, "rewards/chosen": -0.06405612826347351, "rewards/margins": 0.44768014550209045, "rewards/rejected": -0.511736273765564, "sft_loss": 0.6405612826347351, "step": 11105 }, { "epoch": 0.86, "grad_norm": 6.809403896331787, "learning_rate": 4.547512343458843e-07, "logits/chosen": -1.3145198822021484, "logits/rejected": -0.7401739954948425, "logps/chosen": -0.9998018145561218, "logps/rejected": -3.8390610218048096, "loss": 1.0124, "odds_ratio_loss": 0.1258481740951538, "rewards/accuracies": 1.0, "rewards/chosen": -0.0999801829457283, "rewards/margins": 0.2839259207248688, "rewards/rejected": -0.3839060664176941, "sft_loss": 0.9998018145561218, "step": 11110 }, { "epoch": 0.86, "grad_norm": 6.046141147613525, "learning_rate": 4.5218889836509185e-07, "logits/chosen": -1.3540902137756348, "logits/rejected": -1.4832431077957153, "logps/chosen": -0.7950665950775146, "logps/rejected": -10.756556510925293, "loss": 0.7951, "odds_ratio_loss": 0.0003343525168020278, "rewards/accuracies": 1.0, "rewards/chosen": -0.07950666546821594, "rewards/margins": 0.9961490631103516, "rewards/rejected": -1.0756556987762451, "sft_loss": 0.7950665950775146, "step": 11115 }, { "epoch": 0.87, "grad_norm": 5.530214309692383, "learning_rate": 4.4963345971724747e-07, "logits/chosen": -1.4007446765899658, "logits/rejected": -0.9551407098770142, "logps/chosen": -0.9888604879379272, "logps/rejected": -4.050307273864746, "loss": 1.0175, "odds_ratio_loss": 0.28623491525650024, "rewards/accuracies": 1.0, "rewards/chosen": -0.09888605773448944, "rewards/margins": 0.3061446249485016, "rewards/rejected": -0.40503066778182983, "sft_loss": 0.9888604879379272, "step": 11120 }, { "epoch": 0.87, "grad_norm": 7.572410583496094, "learning_rate": 4.4708492227799824e-07, "logits/chosen": -1.2860050201416016, "logits/rejected": -1.1336227655410767, "logps/chosen": -0.8657090067863464, "logps/rejected": -9.67644214630127, "loss": 0.8677, "odds_ratio_loss": 0.02012895792722702, "rewards/accuracies": 1.0, "rewards/chosen": -0.08657090365886688, "rewards/margins": 0.8810732960700989, "rewards/rejected": -0.967644214630127, "sft_loss": 0.8657090067863464, "step": 11125 }, { "epoch": 0.87, "grad_norm": 9.17696475982666, "learning_rate": 4.4454328991252517e-07, "logits/chosen": -1.2129541635513306, "logits/rejected": -1.0243322849273682, "logps/chosen": -0.9325492978096008, "logps/rejected": -6.651510715484619, "loss": 0.9378, "odds_ratio_loss": 0.05253799632191658, "rewards/accuracies": 1.0, "rewards/chosen": -0.0932549387216568, "rewards/margins": 0.5718960762023926, "rewards/rejected": -0.6651510000228882, "sft_loss": 0.9325492978096008, "step": 11130 }, { "epoch": 0.87, "grad_norm": 6.805229663848877, "learning_rate": 4.4200856647553527e-07, "logits/chosen": -1.3854291439056396, "logits/rejected": -1.1689527034759521, "logps/chosen": -1.0263032913208008, "logps/rejected": -4.803722858428955, "loss": 1.0321, "odds_ratio_loss": 0.057866036891937256, "rewards/accuracies": 1.0, "rewards/chosen": -0.10263033211231232, "rewards/margins": 0.3777419924736023, "rewards/rejected": -0.48037227988243103, "sft_loss": 1.0263032913208008, "step": 11135 }, { "epoch": 0.87, "grad_norm": 6.040757179260254, "learning_rate": 4.394807558112607e-07, "logits/chosen": -1.3923509120941162, "logits/rejected": -1.3340778350830078, "logps/chosen": -0.826248824596405, "logps/rejected": -9.858784675598145, "loss": 0.873, "odds_ratio_loss": 0.46788063645362854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0826248899102211, "rewards/margins": 0.9032536745071411, "rewards/rejected": -0.985878586769104, "sft_loss": 0.826248824596405, "step": 11140 }, { "epoch": 0.87, "grad_norm": 6.630565643310547, "learning_rate": 4.3695986175344596e-07, "logits/chosen": -1.365642786026001, "logits/rejected": -1.1266989707946777, "logps/chosen": -0.995932400226593, "logps/rejected": -5.503636837005615, "loss": 1.0326, "odds_ratio_loss": 0.3664228916168213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09959324449300766, "rewards/margins": 0.4507705271244049, "rewards/rejected": -0.5503637194633484, "sft_loss": 0.995932400226593, "step": 11145 }, { "epoch": 0.87, "grad_norm": 14.042057037353516, "learning_rate": 4.344458881253455e-07, "logits/chosen": -1.4202449321746826, "logits/rejected": -0.9090083837509155, "logps/chosen": -1.0853300094604492, "logps/rejected": -6.483142852783203, "loss": 1.1, "odds_ratio_loss": 0.1471116840839386, "rewards/accuracies": 1.0, "rewards/chosen": -0.10853300988674164, "rewards/margins": 0.5397812724113464, "rewards/rejected": -0.6483142375946045, "sft_loss": 1.0853300094604492, "step": 11150 }, { "epoch": 0.87, "grad_norm": 83.15284729003906, "learning_rate": 4.319388387397228e-07, "logits/chosen": -1.2567228078842163, "logits/rejected": -0.9255503416061401, "logps/chosen": -0.9807448387145996, "logps/rejected": -11.109251022338867, "loss": 0.9967, "odds_ratio_loss": 0.15947206318378448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09807448089122772, "rewards/margins": 1.0128507614135742, "rewards/rejected": -1.110925316810608, "sft_loss": 0.9807448387145996, "step": 11155 }, { "epoch": 0.87, "grad_norm": 13.364371299743652, "learning_rate": 4.29438717398834e-07, "logits/chosen": -1.3048971891403198, "logits/rejected": -1.3854659795761108, "logps/chosen": -0.9401968717575073, "logps/rejected": -8.30219554901123, "loss": 0.9664, "odds_ratio_loss": 0.26237112283706665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09401968866586685, "rewards/margins": 0.7361998558044434, "rewards/rejected": -0.830219566822052, "sft_loss": 0.9401968717575073, "step": 11160 }, { "epoch": 0.87, "grad_norm": 17.632877349853516, "learning_rate": 4.2694552789443177e-07, "logits/chosen": -1.3702666759490967, "logits/rejected": -0.9406827688217163, "logps/chosen": -0.9656890630722046, "logps/rejected": -2.2116432189941406, "loss": 1.1027, "odds_ratio_loss": 1.3702127933502197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09656891971826553, "rewards/margins": 0.12459540367126465, "rewards/rejected": -0.22116431593894958, "sft_loss": 0.9656890630722046, "step": 11165 }, { "epoch": 0.87, "grad_norm": 7.403985977172852, "learning_rate": 4.244592740077541e-07, "logits/chosen": -1.410797119140625, "logits/rejected": -1.1965923309326172, "logps/chosen": -0.9599423408508301, "logps/rejected": -9.67354679107666, "loss": 0.9907, "odds_ratio_loss": 0.307929128408432, "rewards/accuracies": 1.0, "rewards/chosen": -0.09599423408508301, "rewards/margins": 0.8713604211807251, "rewards/rejected": -0.9673545956611633, "sft_loss": 0.9599423408508301, "step": 11170 }, { "epoch": 0.87, "grad_norm": 14.449532508850098, "learning_rate": 4.2197995950952084e-07, "logits/chosen": -1.4222707748413086, "logits/rejected": -1.011182427406311, "logps/chosen": -0.7854120135307312, "logps/rejected": -1.555874228477478, "loss": 0.8625, "odds_ratio_loss": 0.7705218195915222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07854120433330536, "rewards/margins": 0.0770462304353714, "rewards/rejected": -0.15558741986751556, "sft_loss": 0.7854120135307312, "step": 11175 }, { "epoch": 0.87, "grad_norm": 23.771411895751953, "learning_rate": 4.1950758815992645e-07, "logits/chosen": -1.3169772624969482, "logits/rejected": -0.9580669403076172, "logps/chosen": -0.9869791269302368, "logps/rejected": -5.212032318115234, "loss": 1.0013, "odds_ratio_loss": 0.1432759016752243, "rewards/accuracies": 1.0, "rewards/chosen": -0.09869791567325592, "rewards/margins": 0.42250528931617737, "rewards/rejected": -0.5212032198905945, "sft_loss": 0.9869791269302368, "step": 11180 }, { "epoch": 0.87, "grad_norm": 132.80825805664062, "learning_rate": 4.170421637086364e-07, "logits/chosen": -1.4070345163345337, "logits/rejected": -1.1488429307937622, "logps/chosen": -1.3363525867462158, "logps/rejected": -8.6935453414917, "loss": 1.3374, "odds_ratio_loss": 0.010287756100296974, "rewards/accuracies": 1.0, "rewards/chosen": -0.1336352527141571, "rewards/margins": 0.7357192635536194, "rewards/rejected": -0.8693544268608093, "sft_loss": 1.3363525867462158, "step": 11185 }, { "epoch": 0.87, "grad_norm": 5.368658065795898, "learning_rate": 4.145836898947808e-07, "logits/chosen": -1.4635095596313477, "logits/rejected": -1.2153332233428955, "logps/chosen": -0.8983847498893738, "logps/rejected": -9.342020034790039, "loss": 0.9048, "odds_ratio_loss": 0.06378494203090668, "rewards/accuracies": 1.0, "rewards/chosen": -0.08983847498893738, "rewards/margins": 0.8443635106086731, "rewards/rejected": -0.9342020153999329, "sft_loss": 0.8983847498893738, "step": 11190 }, { "epoch": 0.87, "grad_norm": 413.733642578125, "learning_rate": 4.121321704469461e-07, "logits/chosen": -1.367821455001831, "logits/rejected": -0.9976612329483032, "logps/chosen": -1.4773330688476562, "logps/rejected": -15.2203950881958, "loss": 1.4774, "odds_ratio_loss": 0.00020272521942388266, "rewards/accuracies": 1.0, "rewards/chosen": -0.14773330092430115, "rewards/margins": 1.3743062019348145, "rewards/rejected": -1.522039532661438, "sft_loss": 1.4773330688476562, "step": 11195 }, { "epoch": 0.87, "grad_norm": 12.227411270141602, "learning_rate": 4.0968760908317304e-07, "logits/chosen": -1.2860291004180908, "logits/rejected": -1.047076940536499, "logps/chosen": -1.0616978406906128, "logps/rejected": -4.308385372161865, "loss": 1.0905, "odds_ratio_loss": 0.2877461612224579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10616978257894516, "rewards/margins": 0.3246687948703766, "rewards/rejected": -0.43083858489990234, "sft_loss": 1.0616978406906128, "step": 11200 }, { "epoch": 0.87, "grad_norm": 100.4394302368164, "learning_rate": 4.0725000951094994e-07, "logits/chosen": -1.2629293203353882, "logits/rejected": -1.1446837186813354, "logps/chosen": -0.8070009350776672, "logps/rejected": -5.544220447540283, "loss": 0.8434, "odds_ratio_loss": 0.3644154667854309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0807000994682312, "rewards/margins": 0.4737219214439392, "rewards/rejected": -0.5544220209121704, "sft_loss": 0.8070009350776672, "step": 11205 }, { "epoch": 0.87, "grad_norm": 23.128013610839844, "learning_rate": 4.0481937542720615e-07, "logits/chosen": -1.3773269653320312, "logits/rejected": -0.9969242215156555, "logps/chosen": -0.6857340931892395, "logps/rejected": -3.768573760986328, "loss": 0.6966, "odds_ratio_loss": 0.10868772119283676, "rewards/accuracies": 1.0, "rewards/chosen": -0.06857341527938843, "rewards/margins": 0.3082839548587799, "rewards/rejected": -0.37685737013816833, "sft_loss": 0.6857340931892395, "step": 11210 }, { "epoch": 0.87, "grad_norm": 23.43471908569336, "learning_rate": 4.023957105183052e-07, "logits/chosen": -1.2603623867034912, "logits/rejected": -1.3566380739212036, "logps/chosen": -1.3014967441558838, "logps/rejected": -4.620263576507568, "loss": 1.3311, "odds_ratio_loss": 0.29601508378982544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13014967739582062, "rewards/margins": 0.3318766951560974, "rewards/rejected": -0.4620264172554016, "sft_loss": 1.3014967441558838, "step": 11215 }, { "epoch": 0.87, "grad_norm": 323.7796630859375, "learning_rate": 3.999790184600449e-07, "logits/chosen": -1.2544281482696533, "logits/rejected": -1.3168442249298096, "logps/chosen": -1.0642973184585571, "logps/rejected": -5.73541259765625, "loss": 1.075, "odds_ratio_loss": 0.10698781907558441, "rewards/accuracies": 1.0, "rewards/chosen": -0.10642973333597183, "rewards/margins": 0.4671115279197693, "rewards/rejected": -0.5735412836074829, "sft_loss": 1.0642973184585571, "step": 11220 }, { "epoch": 0.87, "grad_norm": 10.882088661193848, "learning_rate": 3.975693029176447e-07, "logits/chosen": -1.3870043754577637, "logits/rejected": -1.2248777151107788, "logps/chosen": -1.020098328590393, "logps/rejected": -6.03187370300293, "loss": 1.059, "odds_ratio_loss": 0.3891030550003052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1020098328590393, "rewards/margins": 0.5011776089668274, "rewards/rejected": -0.6031874418258667, "sft_loss": 1.020098328590393, "step": 11225 }, { "epoch": 0.87, "grad_norm": 142.18104553222656, "learning_rate": 3.951665675457433e-07, "logits/chosen": -1.3323876857757568, "logits/rejected": -1.2044421434402466, "logps/chosen": -1.1101775169372559, "logps/rejected": -9.553766250610352, "loss": 1.1284, "odds_ratio_loss": 0.1818692684173584, "rewards/accuracies": 1.0, "rewards/chosen": -0.11101774871349335, "rewards/margins": 0.8443588018417358, "rewards/rejected": -0.9553766250610352, "sft_loss": 1.1101775169372559, "step": 11230 }, { "epoch": 0.87, "grad_norm": 13.318952560424805, "learning_rate": 3.9277081598839526e-07, "logits/chosen": -1.2420969009399414, "logits/rejected": -1.1654160022735596, "logps/chosen": -1.0236912965774536, "logps/rejected": -6.747973442077637, "loss": 1.0563, "odds_ratio_loss": 0.32635438442230225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10236912965774536, "rewards/margins": 0.5724282264709473, "rewards/rejected": -0.6747973561286926, "sft_loss": 1.0236912965774536, "step": 11235 }, { "epoch": 0.87, "grad_norm": 9.043617248535156, "learning_rate": 3.903820518790613e-07, "logits/chosen": -1.3946565389633179, "logits/rejected": -1.2541202306747437, "logps/chosen": -1.0738626718521118, "logps/rejected": -10.492671012878418, "loss": 1.0827, "odds_ratio_loss": 0.0884605199098587, "rewards/accuracies": 1.0, "rewards/chosen": -0.1073862686753273, "rewards/margins": 0.9418808221817017, "rewards/rejected": -1.0492671728134155, "sft_loss": 1.0738626718521118, "step": 11240 }, { "epoch": 0.87, "grad_norm": 17.10003089904785, "learning_rate": 3.8800027884060564e-07, "logits/chosen": -1.3552266359329224, "logits/rejected": -1.5189950466156006, "logps/chosen": -1.181077480316162, "logps/rejected": -6.34781551361084, "loss": 1.201, "odds_ratio_loss": 0.19888582825660706, "rewards/accuracies": 1.0, "rewards/chosen": -0.11810775101184845, "rewards/margins": 0.5166738629341125, "rewards/rejected": -0.6347816586494446, "sft_loss": 1.181077480316162, "step": 11245 }, { "epoch": 0.88, "grad_norm": 9.954345703125, "learning_rate": 3.8562550048528823e-07, "logits/chosen": -1.1790051460266113, "logits/rejected": -1.1716651916503906, "logps/chosen": -1.0941330194473267, "logps/rejected": -10.212206840515137, "loss": 1.0994, "odds_ratio_loss": 0.05296441912651062, "rewards/accuracies": 1.0, "rewards/chosen": -0.10941328853368759, "rewards/margins": 0.9118073582649231, "rewards/rejected": -1.0212206840515137, "sft_loss": 1.0941330194473267, "step": 11250 }, { "epoch": 0.88, "grad_norm": 17.24154281616211, "learning_rate": 3.832577204147642e-07, "logits/chosen": -1.3043853044509888, "logits/rejected": -1.6192779541015625, "logps/chosen": -0.9539308547973633, "logps/rejected": -6.1093950271606445, "loss": 0.9641, "odds_ratio_loss": 0.10161396116018295, "rewards/accuracies": 1.0, "rewards/chosen": -0.09539308398962021, "rewards/margins": 0.5155463814735413, "rewards/rejected": -0.6109394431114197, "sft_loss": 0.9539308547973633, "step": 11255 }, { "epoch": 0.88, "grad_norm": 5.719317436218262, "learning_rate": 3.8089694222007144e-07, "logits/chosen": -1.3731346130371094, "logits/rejected": -0.7326057553291321, "logps/chosen": -0.9768118858337402, "logps/rejected": -4.670573711395264, "loss": 1.0153, "odds_ratio_loss": 0.38526564836502075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0976811945438385, "rewards/margins": 0.36937612295150757, "rewards/rejected": -0.46705737709999084, "sft_loss": 0.9768118858337402, "step": 11260 }, { "epoch": 0.88, "grad_norm": 6.651812553405762, "learning_rate": 3.785431694816294e-07, "logits/chosen": -1.2109453678131104, "logits/rejected": -1.645216941833496, "logps/chosen": -1.0555239915847778, "logps/rejected": -10.367178916931152, "loss": 1.072, "odds_ratio_loss": 0.16514301300048828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10555239766836166, "rewards/margins": 0.9311655163764954, "rewards/rejected": -1.0367180109024048, "sft_loss": 1.0555239915847778, "step": 11265 }, { "epoch": 0.88, "grad_norm": 13.967916488647461, "learning_rate": 3.761964057692341e-07, "logits/chosen": -1.1737172603607178, "logits/rejected": -1.1151050329208374, "logps/chosen": -0.8013264536857605, "logps/rejected": -5.700438976287842, "loss": 0.809, "odds_ratio_loss": 0.07676169276237488, "rewards/accuracies": 1.0, "rewards/chosen": -0.08013264834880829, "rewards/margins": 0.4899112582206726, "rewards/rejected": -0.5700439214706421, "sft_loss": 0.8013264536857605, "step": 11270 }, { "epoch": 0.88, "grad_norm": 38.32868957519531, "learning_rate": 3.738566546420513e-07, "logits/chosen": -1.3379786014556885, "logits/rejected": -1.4890538454055786, "logps/chosen": -0.9517307281494141, "logps/rejected": -5.213587284088135, "loss": 0.973, "odds_ratio_loss": 0.2128792256116867, "rewards/accuracies": 1.0, "rewards/chosen": -0.09517307579517365, "rewards/margins": 0.426185667514801, "rewards/rejected": -0.5213587284088135, "sft_loss": 0.9517307281494141, "step": 11275 }, { "epoch": 0.88, "grad_norm": 6.2815141677856445, "learning_rate": 3.7152391964860924e-07, "logits/chosen": -1.3357675075531006, "logits/rejected": -1.0225775241851807, "logps/chosen": -0.9496587514877319, "logps/rejected": -7.482272148132324, "loss": 0.9504, "odds_ratio_loss": 0.007433583028614521, "rewards/accuracies": 1.0, "rewards/chosen": -0.0949658751487732, "rewards/margins": 0.6532613635063171, "rewards/rejected": -0.7482272386550903, "sft_loss": 0.9496587514877319, "step": 11280 }, { "epoch": 0.88, "grad_norm": 6.459371566772461, "learning_rate": 3.691982043267972e-07, "logits/chosen": -1.4276762008666992, "logits/rejected": -0.9678457975387573, "logps/chosen": -0.990170955657959, "logps/rejected": -2.987020969390869, "loss": 1.0233, "odds_ratio_loss": 0.33171772956848145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09901710599660873, "rewards/margins": 0.1996849775314331, "rewards/rejected": -0.29870206117630005, "sft_loss": 0.990170955657959, "step": 11285 }, { "epoch": 0.88, "grad_norm": 10.674115180969238, "learning_rate": 3.668795122038582e-07, "logits/chosen": -1.3349639177322388, "logits/rejected": -1.1982815265655518, "logps/chosen": -0.7671822905540466, "logps/rejected": -4.252364158630371, "loss": 0.7922, "odds_ratio_loss": 0.2504265308380127, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07671823352575302, "rewards/margins": 0.3485181927680969, "rewards/rejected": -0.42523640394210815, "sft_loss": 0.7671822905540466, "step": 11290 }, { "epoch": 0.88, "grad_norm": 4.621835708618164, "learning_rate": 3.6456784679638256e-07, "logits/chosen": -1.3116695880889893, "logits/rejected": -0.8658286333084106, "logps/chosen": -0.89985591173172, "logps/rejected": -6.814452171325684, "loss": 0.9004, "odds_ratio_loss": 0.005296620540320873, "rewards/accuracies": 1.0, "rewards/chosen": -0.08998559415340424, "rewards/margins": 0.5914596319198608, "rewards/rejected": -0.6814452409744263, "sft_loss": 0.89985591173172, "step": 11295 }, { "epoch": 0.88, "grad_norm": 7.848481178283691, "learning_rate": 3.6226321161030367e-07, "logits/chosen": -1.4205824136734009, "logits/rejected": -1.0507280826568604, "logps/chosen": -0.968035876750946, "logps/rejected": -3.0656139850616455, "loss": 1.0092, "odds_ratio_loss": 0.41196268796920776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09680359065532684, "rewards/margins": 0.20975780487060547, "rewards/rejected": -0.3065614104270935, "sft_loss": 0.968035876750946, "step": 11300 }, { "epoch": 0.88, "grad_norm": 8.965922355651855, "learning_rate": 3.599656101408955e-07, "logits/chosen": -1.1956322193145752, "logits/rejected": -1.2085590362548828, "logps/chosen": -0.8699887990951538, "logps/rejected": -7.0282883644104, "loss": 0.8702, "odds_ratio_loss": 0.0024738397914916277, "rewards/accuracies": 1.0, "rewards/chosen": -0.08699888736009598, "rewards/margins": 0.6158300042152405, "rewards/rejected": -0.7028288841247559, "sft_loss": 0.8699887990951538, "step": 11305 }, { "epoch": 0.88, "grad_norm": 27.91067123413086, "learning_rate": 3.5767504587276124e-07, "logits/chosen": -1.1915757656097412, "logits/rejected": -1.0515540838241577, "logps/chosen": -0.9764666557312012, "logps/rejected": -9.82097053527832, "loss": 0.9774, "odds_ratio_loss": 0.009811131283640862, "rewards/accuracies": 1.0, "rewards/chosen": -0.09764666110277176, "rewards/margins": 0.8844503164291382, "rewards/rejected": -0.9820969700813293, "sft_loss": 0.9764666557312012, "step": 11310 }, { "epoch": 0.88, "grad_norm": 16.605636596679688, "learning_rate": 3.5539152227983155e-07, "logits/chosen": -1.1500060558319092, "logits/rejected": -1.403136968612671, "logps/chosen": -1.2175300121307373, "logps/rejected": -5.237689971923828, "loss": 1.2674, "odds_ratio_loss": 0.4989433288574219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1217530146241188, "rewards/margins": 0.4020160138607025, "rewards/rejected": -0.5237690210342407, "sft_loss": 1.2175300121307373, "step": 11315 }, { "epoch": 0.88, "grad_norm": 12.524778366088867, "learning_rate": 3.531150428253616e-07, "logits/chosen": -1.311918020248413, "logits/rejected": -0.9865154027938843, "logps/chosen": -1.1745331287384033, "logps/rejected": -7.8709306716918945, "loss": 1.1877, "odds_ratio_loss": 0.13171400129795074, "rewards/accuracies": 1.0, "rewards/chosen": -0.11745331436395645, "rewards/margins": 0.6696397066116333, "rewards/rejected": -0.7870929837226868, "sft_loss": 1.1745331287384033, "step": 11320 }, { "epoch": 0.88, "grad_norm": 10.650303840637207, "learning_rate": 3.508456109619207e-07, "logits/chosen": -1.5033023357391357, "logits/rejected": -1.136890172958374, "logps/chosen": -1.2400705814361572, "logps/rejected": -3.5096442699432373, "loss": 1.2745, "odds_ratio_loss": 0.3439212441444397, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12400706857442856, "rewards/margins": 0.22695736587047577, "rewards/rejected": -0.35096442699432373, "sft_loss": 1.2400705814361572, "step": 11325 }, { "epoch": 0.88, "grad_norm": 414.8785400390625, "learning_rate": 3.485832301313896e-07, "logits/chosen": -1.3364065885543823, "logits/rejected": -0.9892686605453491, "logps/chosen": -1.139243721961975, "logps/rejected": -7.371476173400879, "loss": 1.1449, "odds_ratio_loss": 0.056998781859874725, "rewards/accuracies": 1.0, "rewards/chosen": -0.11392436176538467, "rewards/margins": 0.6232232451438904, "rewards/rejected": -0.7371476292610168, "sft_loss": 1.139243721961975, "step": 11330 }, { "epoch": 0.88, "grad_norm": 5.380289554595947, "learning_rate": 3.463279037649575e-07, "logits/chosen": -1.3991533517837524, "logits/rejected": -0.8975147008895874, "logps/chosen": -0.9783090353012085, "logps/rejected": -5.975537300109863, "loss": 1.0223, "odds_ratio_loss": 0.4397381842136383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09783090651035309, "rewards/margins": 0.49972280859947205, "rewards/rejected": -0.5975537300109863, "sft_loss": 0.9783090353012085, "step": 11335 }, { "epoch": 0.88, "grad_norm": 4.831082820892334, "learning_rate": 3.440796352831133e-07, "logits/chosen": -1.316606879234314, "logits/rejected": -1.2849032878875732, "logps/chosen": -0.7927159070968628, "logps/rejected": -8.25066089630127, "loss": 0.8168, "odds_ratio_loss": 0.24042615294456482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.079271599650383, "rewards/margins": 0.745794415473938, "rewards/rejected": -0.8250659704208374, "sft_loss": 0.7927159070968628, "step": 11340 }, { "epoch": 0.88, "grad_norm": 24.93393325805664, "learning_rate": 3.4183842809563993e-07, "logits/chosen": -1.4244930744171143, "logits/rejected": -1.3379783630371094, "logps/chosen": -1.170127272605896, "logps/rejected": -3.386500120162964, "loss": 1.1928, "odds_ratio_loss": 0.2265014946460724, "rewards/accuracies": 1.0, "rewards/chosen": -0.11701272428035736, "rewards/margins": 0.2216372936964035, "rewards/rejected": -0.33865001797676086, "sft_loss": 1.170127272605896, "step": 11345 }, { "epoch": 0.88, "grad_norm": 11.719467163085938, "learning_rate": 3.396042856016141e-07, "logits/chosen": -1.2225192785263062, "logits/rejected": -1.03545081615448, "logps/chosen": -0.8588349223136902, "logps/rejected": -6.034967422485352, "loss": 0.8724, "odds_ratio_loss": 0.13602833449840546, "rewards/accuracies": 1.0, "rewards/chosen": -0.08588350564241409, "rewards/margins": 0.5176132321357727, "rewards/rejected": -0.6034967303276062, "sft_loss": 0.8588349223136902, "step": 11350 }, { "epoch": 0.88, "grad_norm": 5.902450084686279, "learning_rate": 3.3737721118939637e-07, "logits/chosen": -1.2625263929367065, "logits/rejected": -1.2296942472457886, "logps/chosen": -1.0912928581237793, "logps/rejected": -8.46910572052002, "loss": 1.0968, "odds_ratio_loss": 0.054971031844615936, "rewards/accuracies": 1.0, "rewards/chosen": -0.10912929475307465, "rewards/margins": 0.7377813458442688, "rewards/rejected": -0.8469105958938599, "sft_loss": 1.0912928581237793, "step": 11355 }, { "epoch": 0.88, "grad_norm": 29.052635192871094, "learning_rate": 3.351572082366267e-07, "logits/chosen": -1.0655205249786377, "logits/rejected": -1.3246183395385742, "logps/chosen": -1.7015535831451416, "logps/rejected": -3.9158992767333984, "loss": 1.7642, "odds_ratio_loss": 0.6268733739852905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1701553612947464, "rewards/margins": 0.2214345484972, "rewards/rejected": -0.3915899395942688, "sft_loss": 1.7015535831451416, "step": 11360 }, { "epoch": 0.88, "grad_norm": 7.776186943054199, "learning_rate": 3.329442801102223e-07, "logits/chosen": -1.2450320720672607, "logits/rejected": -1.2915815114974976, "logps/chosen": -1.362554907798767, "logps/rejected": -8.15864086151123, "loss": 1.3835, "odds_ratio_loss": 0.20922088623046875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13625548779964447, "rewards/margins": 0.6796085238456726, "rewards/rejected": -0.815864086151123, "sft_loss": 1.362554907798767, "step": 11365 }, { "epoch": 0.88, "grad_norm": 102.14395904541016, "learning_rate": 3.3073843016636964e-07, "logits/chosen": -1.1582549810409546, "logits/rejected": -0.6190916299819946, "logps/chosen": -1.317775011062622, "logps/rejected": -6.590193271636963, "loss": 1.3365, "odds_ratio_loss": 0.18727946281433105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13177749514579773, "rewards/margins": 0.5272418260574341, "rewards/rejected": -0.6590193510055542, "sft_loss": 1.317775011062622, "step": 11370 }, { "epoch": 0.88, "grad_norm": 918.7132568359375, "learning_rate": 3.285396617505204e-07, "logits/chosen": -1.2746855020523071, "logits/rejected": -1.293368935585022, "logps/chosen": -1.331923007965088, "logps/rejected": -6.492204189300537, "loss": 1.3342, "odds_ratio_loss": 0.023071136325597763, "rewards/accuracies": 1.0, "rewards/chosen": -0.1331923007965088, "rewards/margins": 0.516028106212616, "rewards/rejected": -0.6492204070091248, "sft_loss": 1.331923007965088, "step": 11375 }, { "epoch": 0.89, "grad_norm": 10.767485618591309, "learning_rate": 3.263479781973855e-07, "logits/chosen": -1.3824785947799683, "logits/rejected": -1.3801668882369995, "logps/chosen": -0.8377906680107117, "logps/rejected": -6.220038414001465, "loss": 0.8751, "odds_ratio_loss": 0.37284550070762634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08377906680107117, "rewards/margins": 0.5382248163223267, "rewards/rejected": -0.6220038533210754, "sft_loss": 0.8377906680107117, "step": 11380 }, { "epoch": 0.89, "grad_norm": 6.239452838897705, "learning_rate": 3.2416338283093207e-07, "logits/chosen": -1.3986380100250244, "logits/rejected": -0.7916896343231201, "logps/chosen": -1.043060541152954, "logps/rejected": -12.193835258483887, "loss": 1.0588, "odds_ratio_loss": 0.15780356526374817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10430605709552765, "rewards/margins": 1.1150774955749512, "rewards/rejected": -1.2193834781646729, "sft_loss": 1.043060541152954, "step": 11385 }, { "epoch": 0.89, "grad_norm": 6.802426338195801, "learning_rate": 3.2198587896437593e-07, "logits/chosen": -1.327599287033081, "logits/rejected": -0.993168830871582, "logps/chosen": -0.9170148968696594, "logps/rejected": -2.109870433807373, "loss": 0.9439, "odds_ratio_loss": 0.2683650553226471, "rewards/accuracies": 1.0, "rewards/chosen": -0.09170148521661758, "rewards/margins": 0.11928558349609375, "rewards/rejected": -0.21098704636096954, "sft_loss": 0.9170148968696594, "step": 11390 }, { "epoch": 0.89, "grad_norm": 13.555157661437988, "learning_rate": 3.198154699001782e-07, "logits/chosen": -0.9407708048820496, "logits/rejected": -1.7492096424102783, "logps/chosen": -0.8097691535949707, "logps/rejected": -7.6386284828186035, "loss": 0.8157, "odds_ratio_loss": 0.0593709759414196, "rewards/accuracies": 1.0, "rewards/chosen": -0.08097691833972931, "rewards/margins": 0.6828858256340027, "rewards/rejected": -0.7638627886772156, "sft_loss": 0.8097691535949707, "step": 11395 }, { "epoch": 0.89, "grad_norm": 187.50027465820312, "learning_rate": 3.1765215893003967e-07, "logits/chosen": -1.0976942777633667, "logits/rejected": -1.2680959701538086, "logps/chosen": -1.2602325677871704, "logps/rejected": -6.308053970336914, "loss": 1.3144, "odds_ratio_loss": 0.5419572591781616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12602324783802032, "rewards/margins": 0.5047820806503296, "rewards/rejected": -0.6308053731918335, "sft_loss": 1.2602325677871704, "step": 11400 }, { "epoch": 0.89, "grad_norm": 35.68206787109375, "learning_rate": 3.154959493348958e-07, "logits/chosen": -1.3055843114852905, "logits/rejected": -1.199350118637085, "logps/chosen": -0.7830213308334351, "logps/rejected": -4.249791145324707, "loss": 0.8357, "odds_ratio_loss": 0.526390016078949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07830213010311127, "rewards/margins": 0.3466770648956299, "rewards/rejected": -0.42497915029525757, "sft_loss": 0.7830213308334351, "step": 11405 }, { "epoch": 0.89, "grad_norm": 12.375039100646973, "learning_rate": 3.133468443849125e-07, "logits/chosen": -1.3636467456817627, "logits/rejected": -0.858871340751648, "logps/chosen": -0.8683420419692993, "logps/rejected": -3.3262736797332764, "loss": 0.8901, "odds_ratio_loss": 0.21752576529979706, "rewards/accuracies": 1.0, "rewards/chosen": -0.08683420717716217, "rewards/margins": 0.2457931786775589, "rewards/rejected": -0.33262738585472107, "sft_loss": 0.8683420419692993, "step": 11410 }, { "epoch": 0.89, "grad_norm": 7.882063865661621, "learning_rate": 3.1120484733948073e-07, "logits/chosen": -1.2236177921295166, "logits/rejected": -1.404359221458435, "logps/chosen": -0.7861829996109009, "logps/rejected": -7.9908766746521, "loss": 0.7957, "odds_ratio_loss": 0.0947074443101883, "rewards/accuracies": 1.0, "rewards/chosen": -0.07861830294132233, "rewards/margins": 0.7204692959785461, "rewards/rejected": -0.799087643623352, "sft_loss": 0.7861829996109009, "step": 11415 }, { "epoch": 0.89, "grad_norm": 6.249751091003418, "learning_rate": 3.090699614472109e-07, "logits/chosen": -1.2637525796890259, "logits/rejected": -0.9572528600692749, "logps/chosen": -0.8270605206489563, "logps/rejected": -4.8529839515686035, "loss": 0.8522, "odds_ratio_loss": 0.2513573169708252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08270604908466339, "rewards/margins": 0.40259236097335815, "rewards/rejected": -0.48529839515686035, "sft_loss": 0.8270605206489563, "step": 11420 }, { "epoch": 0.89, "grad_norm": 26.97721290588379, "learning_rate": 3.0694218994592793e-07, "logits/chosen": -1.4073753356933594, "logits/rejected": -0.9085363149642944, "logps/chosen": -1.1127631664276123, "logps/rejected": -8.869595527648926, "loss": 1.1132, "odds_ratio_loss": 0.0041475556790828705, "rewards/accuracies": 1.0, "rewards/chosen": -0.11127632856369019, "rewards/margins": 0.7756832242012024, "rewards/rejected": -0.8869596719741821, "sft_loss": 1.1127631664276123, "step": 11425 }, { "epoch": 0.89, "grad_norm": 6.019398212432861, "learning_rate": 3.0482153606266716e-07, "logits/chosen": -1.211147427558899, "logits/rejected": -1.127651333808899, "logps/chosen": -1.1163737773895264, "logps/rejected": -7.680233001708984, "loss": 1.1226, "odds_ratio_loss": 0.06235046312212944, "rewards/accuracies": 1.0, "rewards/chosen": -0.11163737624883652, "rewards/margins": 0.6563860177993774, "rewards/rejected": -0.7680233120918274, "sft_loss": 1.1163737773895264, "step": 11430 }, { "epoch": 0.89, "grad_norm": 5.228885650634766, "learning_rate": 3.027080030136703e-07, "logits/chosen": -1.296543836593628, "logits/rejected": -1.1578459739685059, "logps/chosen": -0.8815022706985474, "logps/rejected": -9.213228225708008, "loss": 0.8854, "odds_ratio_loss": 0.038652561604976654, "rewards/accuracies": 1.0, "rewards/chosen": -0.08815022557973862, "rewards/margins": 0.833172619342804, "rewards/rejected": -0.9213228225708008, "sft_loss": 0.8815022706985474, "step": 11435 }, { "epoch": 0.89, "grad_norm": 9.065093040466309, "learning_rate": 3.0060159400437883e-07, "logits/chosen": -1.3202693462371826, "logits/rejected": -1.380171537399292, "logps/chosen": -0.9314519762992859, "logps/rejected": -5.277139663696289, "loss": 0.9535, "odds_ratio_loss": 0.2203933447599411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09314519912004471, "rewards/margins": 0.4345687925815582, "rewards/rejected": -0.5277139544487, "sft_loss": 0.9314519762992859, "step": 11440 }, { "epoch": 0.89, "grad_norm": 13.972500801086426, "learning_rate": 2.985023122294278e-07, "logits/chosen": -1.3101141452789307, "logits/rejected": -1.179225206375122, "logps/chosen": -1.0576988458633423, "logps/rejected": -7.225750923156738, "loss": 1.068, "odds_ratio_loss": 0.1029224544763565, "rewards/accuracies": 1.0, "rewards/chosen": -0.10576988756656647, "rewards/margins": 0.6168051958084106, "rewards/rejected": -0.7225750684738159, "sft_loss": 1.0576988458633423, "step": 11445 }, { "epoch": 0.89, "grad_norm": 50.68267059326172, "learning_rate": 2.9641016087264716e-07, "logits/chosen": -1.3971431255340576, "logits/rejected": -0.978039562702179, "logps/chosen": -0.9783371090888977, "logps/rejected": -14.972844123840332, "loss": 0.9813, "odds_ratio_loss": 0.03002532199025154, "rewards/accuracies": 1.0, "rewards/chosen": -0.09783372282981873, "rewards/margins": 1.3994505405426025, "rewards/rejected": -1.4972842931747437, "sft_loss": 0.9783371090888977, "step": 11450 }, { "epoch": 0.89, "grad_norm": 13.629805564880371, "learning_rate": 2.943251431070476e-07, "logits/chosen": -1.184397578239441, "logits/rejected": -1.0515797138214111, "logps/chosen": -0.9648865461349487, "logps/rejected": -4.7971930503845215, "loss": 0.9836, "odds_ratio_loss": 0.18691152334213257, "rewards/accuracies": 1.0, "rewards/chosen": -0.09648866951465607, "rewards/margins": 0.38323062658309937, "rewards/rejected": -0.4797193109989166, "sft_loss": 0.9648865461349487, "step": 11455 }, { "epoch": 0.89, "grad_norm": 7.995943546295166, "learning_rate": 2.9224726209482524e-07, "logits/chosen": -1.3726367950439453, "logits/rejected": -1.2525684833526611, "logps/chosen": -0.8402360677719116, "logps/rejected": -7.144128322601318, "loss": 0.8457, "odds_ratio_loss": 0.05475843697786331, "rewards/accuracies": 1.0, "rewards/chosen": -0.0840236097574234, "rewards/margins": 0.6303892731666565, "rewards/rejected": -0.7144128680229187, "sft_loss": 0.8402360677719116, "step": 11460 }, { "epoch": 0.89, "grad_norm": 7.930908679962158, "learning_rate": 2.901765209873486e-07, "logits/chosen": -1.2770615816116333, "logits/rejected": -1.3665797710418701, "logps/chosen": -0.9703947901725769, "logps/rejected": -7.10650634765625, "loss": 0.9743, "odds_ratio_loss": 0.03855733200907707, "rewards/accuracies": 1.0, "rewards/chosen": -0.09703948348760605, "rewards/margins": 0.6136112213134766, "rewards/rejected": -0.7106507420539856, "sft_loss": 0.9703947901725769, "step": 11465 }, { "epoch": 0.89, "grad_norm": 8.73951244354248, "learning_rate": 2.881129229251611e-07, "logits/chosen": -1.2204939126968384, "logits/rejected": -0.9099424481391907, "logps/chosen": -1.0175669193267822, "logps/rejected": -9.779008865356445, "loss": 1.0212, "odds_ratio_loss": 0.03633153438568115, "rewards/accuracies": 1.0, "rewards/chosen": -0.10175669193267822, "rewards/margins": 0.8761442303657532, "rewards/rejected": -0.9779008626937866, "sft_loss": 1.0175669193267822, "step": 11470 }, { "epoch": 0.89, "grad_norm": 8.854645729064941, "learning_rate": 2.860564710379693e-07, "logits/chosen": -1.334038496017456, "logits/rejected": -1.1563862562179565, "logps/chosen": -1.0889811515808105, "logps/rejected": -5.698180675506592, "loss": 1.1267, "odds_ratio_loss": 0.37702035903930664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1088981181383133, "rewards/margins": 0.46091994643211365, "rewards/rejected": -0.5698180794715881, "sft_loss": 1.0889811515808105, "step": 11475 }, { "epoch": 0.89, "grad_norm": 4.634920120239258, "learning_rate": 2.840071684446455e-07, "logits/chosen": -1.3449212312698364, "logits/rejected": -0.7937895059585571, "logps/chosen": -1.0932824611663818, "logps/rejected": -9.422958374023438, "loss": 1.0967, "odds_ratio_loss": 0.034093089401721954, "rewards/accuracies": 1.0, "rewards/chosen": -0.1093282550573349, "rewards/margins": 0.8329676389694214, "rewards/rejected": -0.9422958493232727, "sft_loss": 1.0932824611663818, "step": 11480 }, { "epoch": 0.89, "grad_norm": 301.2113037109375, "learning_rate": 2.8196501825321686e-07, "logits/chosen": -1.271220088005066, "logits/rejected": -0.934947669506073, "logps/chosen": -0.6947768330574036, "logps/rejected": -4.528558731079102, "loss": 0.6972, "odds_ratio_loss": 0.023960549384355545, "rewards/accuracies": 1.0, "rewards/chosen": -0.06947768479585648, "rewards/margins": 0.38337820768356323, "rewards/rejected": -0.4528558850288391, "sft_loss": 0.6947768330574036, "step": 11485 }, { "epoch": 0.89, "grad_norm": 43.53444290161133, "learning_rate": 2.799300235608626e-07, "logits/chosen": -1.4846608638763428, "logits/rejected": -1.35506010055542, "logps/chosen": -0.753076434135437, "logps/rejected": -8.12567138671875, "loss": 0.7927, "odds_ratio_loss": 0.3962857127189636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07530764490365982, "rewards/margins": 0.737259566783905, "rewards/rejected": -0.8125671148300171, "sft_loss": 0.753076434135437, "step": 11490 }, { "epoch": 0.89, "grad_norm": 7.579301357269287, "learning_rate": 2.779021874539106e-07, "logits/chosen": -1.3991349935531616, "logits/rejected": -1.2568550109863281, "logps/chosen": -1.0853722095489502, "logps/rejected": -5.782435417175293, "loss": 1.13, "odds_ratio_loss": 0.44617921113967896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1085372194647789, "rewards/margins": 0.46970629692077637, "rewards/rejected": -0.5782435536384583, "sft_loss": 1.0853722095489502, "step": 11495 }, { "epoch": 0.89, "grad_norm": 4.319366455078125, "learning_rate": 2.758815130078329e-07, "logits/chosen": -1.2385519742965698, "logits/rejected": -1.0318421125411987, "logps/chosen": -0.9813858270645142, "logps/rejected": -8.16090202331543, "loss": 0.9943, "odds_ratio_loss": 0.1292736828327179, "rewards/accuracies": 1.0, "rewards/chosen": -0.09813857823610306, "rewards/margins": 0.7179515361785889, "rewards/rejected": -0.8160901069641113, "sft_loss": 0.9813858270645142, "step": 11500 }, { "epoch": 0.89, "grad_norm": 24.704221725463867, "learning_rate": 2.7386800328723815e-07, "logits/chosen": -1.3197424411773682, "logits/rejected": -1.1460809707641602, "logps/chosen": -0.9466512799263, "logps/rejected": -11.379068374633789, "loss": 0.9551, "odds_ratio_loss": 0.08447456359863281, "rewards/accuracies": 1.0, "rewards/chosen": -0.09466513246297836, "rewards/margins": 1.0432417392730713, "rewards/rejected": -1.1379069089889526, "sft_loss": 0.9466512799263, "step": 11505 }, { "epoch": 0.9, "grad_norm": 24.183088302612305, "learning_rate": 2.7186166134586964e-07, "logits/chosen": -1.2981321811676025, "logits/rejected": -0.7681884765625, "logps/chosen": -1.0729738473892212, "logps/rejected": -5.5854268074035645, "loss": 1.0831, "odds_ratio_loss": 0.10156464576721191, "rewards/accuracies": 1.0, "rewards/chosen": -0.107297383248806, "rewards/margins": 0.4512453079223633, "rewards/rejected": -0.5585426688194275, "sft_loss": 1.0729738473892212, "step": 11510 }, { "epoch": 0.9, "grad_norm": 14.686610221862793, "learning_rate": 2.698624902265995e-07, "logits/chosen": -1.3769340515136719, "logits/rejected": -1.1684379577636719, "logps/chosen": -0.8651742935180664, "logps/rejected": -5.289219856262207, "loss": 0.8924, "odds_ratio_loss": 0.27234646677970886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08651743084192276, "rewards/margins": 0.442404568195343, "rewards/rejected": -0.5289219617843628, "sft_loss": 0.8651742935180664, "step": 11515 }, { "epoch": 0.9, "grad_norm": 16.206825256347656, "learning_rate": 2.6787049296142455e-07, "logits/chosen": -1.2861360311508179, "logits/rejected": -1.3285562992095947, "logps/chosen": -0.5772618055343628, "logps/rejected": -5.894922256469727, "loss": 0.5908, "odds_ratio_loss": 0.1356671303510666, "rewards/accuracies": 1.0, "rewards/chosen": -0.0577261820435524, "rewards/margins": 0.5317661166191101, "rewards/rejected": -0.5894922614097595, "sft_loss": 0.5772618055343628, "step": 11520 }, { "epoch": 0.9, "grad_norm": 23.26406478881836, "learning_rate": 2.658856725714609e-07, "logits/chosen": -1.4277960062026978, "logits/rejected": -1.2865487337112427, "logps/chosen": -0.6814032793045044, "logps/rejected": -5.886070251464844, "loss": 0.6937, "odds_ratio_loss": 0.12267593294382095, "rewards/accuracies": 1.0, "rewards/chosen": -0.06814032047986984, "rewards/margins": 0.520466685295105, "rewards/rejected": -0.5886070132255554, "sft_loss": 0.6814032793045044, "step": 11525 }, { "epoch": 0.9, "grad_norm": 28.9005069732666, "learning_rate": 2.639080320669424e-07, "logits/chosen": -1.3326749801635742, "logits/rejected": -1.0623148679733276, "logps/chosen": -0.8971265554428101, "logps/rejected": -6.110349655151367, "loss": 0.9131, "odds_ratio_loss": 0.15930424630641937, "rewards/accuracies": 1.0, "rewards/chosen": -0.08971264958381653, "rewards/margins": 0.5213223695755005, "rewards/rejected": -0.6110349893569946, "sft_loss": 0.8971265554428101, "step": 11530 }, { "epoch": 0.9, "grad_norm": 176.95278930664062, "learning_rate": 2.619375744472102e-07, "logits/chosen": -1.2410593032836914, "logits/rejected": -0.9277191162109375, "logps/chosen": -1.0788233280181885, "logps/rejected": -5.1116623878479, "loss": 1.1172, "odds_ratio_loss": 0.38364213705062866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10788233578205109, "rewards/margins": 0.4032839238643646, "rewards/rejected": -0.5111662149429321, "sft_loss": 1.0788233280181885, "step": 11535 }, { "epoch": 0.9, "grad_norm": 12.5931978225708, "learning_rate": 2.599743027007151e-07, "logits/chosen": -1.495727777481079, "logits/rejected": -0.9932398796081543, "logps/chosen": -2.0280823707580566, "logps/rejected": -5.652215957641602, "loss": 2.0938, "odds_ratio_loss": 0.6574202179908752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.20280826091766357, "rewards/margins": 0.36241334676742554, "rewards/rejected": -0.5652216672897339, "sft_loss": 2.0280823707580566, "step": 11540 }, { "epoch": 0.9, "grad_norm": 6.531797885894775, "learning_rate": 2.5801821980500574e-07, "logits/chosen": -1.5165177583694458, "logits/rejected": -1.4731080532073975, "logps/chosen": -0.9339659810066223, "logps/rejected": -9.524686813354492, "loss": 0.9376, "odds_ratio_loss": 0.03651661053299904, "rewards/accuracies": 1.0, "rewards/chosen": -0.09339660406112671, "rewards/margins": 0.8590720891952515, "rewards/rejected": -0.952468752861023, "sft_loss": 0.9339659810066223, "step": 11545 }, { "epoch": 0.9, "grad_norm": 8.201682090759277, "learning_rate": 2.560693287267324e-07, "logits/chosen": -1.4987179040908813, "logits/rejected": -1.4307103157043457, "logps/chosen": -0.5705666542053223, "logps/rejected": -10.991838455200195, "loss": 0.5793, "odds_ratio_loss": 0.08719013631343842, "rewards/accuracies": 1.0, "rewards/chosen": -0.05705666542053223, "rewards/margins": 1.0421271324157715, "rewards/rejected": -1.0991837978363037, "sft_loss": 0.5705666542053223, "step": 11550 }, { "epoch": 0.9, "grad_norm": 9.921911239624023, "learning_rate": 2.5412763242163463e-07, "logits/chosen": -1.3099538087844849, "logits/rejected": -1.3377288579940796, "logps/chosen": -1.147143006324768, "logps/rejected": -6.625881195068359, "loss": 1.1616, "odds_ratio_loss": 0.14414840936660767, "rewards/accuracies": 1.0, "rewards/chosen": -0.11471428722143173, "rewards/margins": 0.547873854637146, "rewards/rejected": -0.6625881195068359, "sft_loss": 1.147143006324768, "step": 11555 }, { "epoch": 0.9, "grad_norm": 13.260001182556152, "learning_rate": 2.5219313383454124e-07, "logits/chosen": -1.3728363513946533, "logits/rejected": -0.907455325126648, "logps/chosen": -1.068935751914978, "logps/rejected": -2.6968321800231934, "loss": 1.1036, "odds_ratio_loss": 0.3463651239871979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10689357668161392, "rewards/margins": 0.16278965771198273, "rewards/rejected": -0.26968321204185486, "sft_loss": 1.068935751914978, "step": 11560 }, { "epoch": 0.9, "grad_norm": 5.01524543762207, "learning_rate": 2.5026583589936646e-07, "logits/chosen": -1.2656458616256714, "logits/rejected": -1.123988389968872, "logps/chosen": -1.0185540914535522, "logps/rejected": -7.923954010009766, "loss": 1.0207, "odds_ratio_loss": 0.021134402602910995, "rewards/accuracies": 1.0, "rewards/chosen": -0.10185541957616806, "rewards/margins": 0.6905400156974792, "rewards/rejected": -0.7923954725265503, "sft_loss": 1.0185540914535522, "step": 11565 }, { "epoch": 0.9, "grad_norm": 27.029821395874023, "learning_rate": 2.483457415391005e-07, "logits/chosen": -1.4600824117660522, "logits/rejected": -1.0570296049118042, "logps/chosen": -0.9342508316040039, "logps/rejected": -8.031071662902832, "loss": 0.973, "odds_ratio_loss": 0.3876782953739166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09342508018016815, "rewards/margins": 0.7096821069717407, "rewards/rejected": -0.8031071424484253, "sft_loss": 0.9342508316040039, "step": 11570 }, { "epoch": 0.9, "grad_norm": 5.063310146331787, "learning_rate": 2.464328536658117e-07, "logits/chosen": -1.3322535753250122, "logits/rejected": -1.235365390777588, "logps/chosen": -0.9622739553451538, "logps/rejected": -11.337159156799316, "loss": 0.9671, "odds_ratio_loss": 0.047843582928180695, "rewards/accuracies": 1.0, "rewards/chosen": -0.09622739255428314, "rewards/margins": 1.0374884605407715, "rewards/rejected": -1.1337158679962158, "sft_loss": 0.9622739553451538, "step": 11575 }, { "epoch": 0.9, "grad_norm": 16.952817916870117, "learning_rate": 2.445271751806366e-07, "logits/chosen": -1.4535599946975708, "logits/rejected": -0.9216348528862, "logps/chosen": -0.8693429827690125, "logps/rejected": -5.077768802642822, "loss": 1.0057, "odds_ratio_loss": 1.363244652748108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08693429827690125, "rewards/margins": 0.42084255814552307, "rewards/rejected": -0.5077768564224243, "sft_loss": 0.8693429827690125, "step": 11580 }, { "epoch": 0.9, "grad_norm": 14.601397514343262, "learning_rate": 2.426287089737783e-07, "logits/chosen": -1.3307750225067139, "logits/rejected": -0.9158192873001099, "logps/chosen": -1.0141366720199585, "logps/rejected": -10.266256332397461, "loss": 1.0207, "odds_ratio_loss": 0.06609180569648743, "rewards/accuracies": 1.0, "rewards/chosen": -0.10141368210315704, "rewards/margins": 0.925212025642395, "rewards/rejected": -1.0266257524490356, "sft_loss": 1.0141366720199585, "step": 11585 }, { "epoch": 0.9, "grad_norm": 16.586641311645508, "learning_rate": 2.40737457924502e-07, "logits/chosen": -1.478084683418274, "logits/rejected": -1.2017533779144287, "logps/chosen": -1.712651014328003, "logps/rejected": -3.850841999053955, "loss": 1.7535, "odds_ratio_loss": 0.4086402952671051, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1712651252746582, "rewards/margins": 0.21381907165050507, "rewards/rejected": -0.3850841820240021, "sft_loss": 1.712651014328003, "step": 11590 }, { "epoch": 0.9, "grad_norm": 6.445002555847168, "learning_rate": 2.3885342490113096e-07, "logits/chosen": -1.1681509017944336, "logits/rejected": -0.9963008761405945, "logps/chosen": -1.4345595836639404, "logps/rejected": -1.9659076929092407, "loss": 1.5061, "odds_ratio_loss": 0.7152605056762695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14345595240592957, "rewards/margins": 0.05313481017947197, "rewards/rejected": -0.19659076631069183, "sft_loss": 1.4345595836639404, "step": 11595 }, { "epoch": 0.9, "grad_norm": 7.320664405822754, "learning_rate": 2.3697661276103956e-07, "logits/chosen": -0.9666527509689331, "logits/rejected": -1.5034465789794922, "logps/chosen": -0.8804365992546082, "logps/rejected": -4.796812534332275, "loss": 0.8856, "odds_ratio_loss": 0.05147816985845566, "rewards/accuracies": 1.0, "rewards/chosen": -0.08804366737604141, "rewards/margins": 0.39163756370544434, "rewards/rejected": -0.47968125343322754, "sft_loss": 0.8804365992546082, "step": 11600 }, { "epoch": 0.9, "grad_norm": 16.727750778198242, "learning_rate": 2.3510702435065202e-07, "logits/chosen": -1.3985929489135742, "logits/rejected": -0.8663476705551147, "logps/chosen": -0.91816246509552, "logps/rejected": -6.086346626281738, "loss": 0.9357, "odds_ratio_loss": 0.17579717934131622, "rewards/accuracies": 1.0, "rewards/chosen": -0.091816246509552, "rewards/margins": 0.5168184041976929, "rewards/rejected": -0.6086345911026001, "sft_loss": 0.91816246509552, "step": 11605 }, { "epoch": 0.9, "grad_norm": 7.229618072509766, "learning_rate": 2.3324466250543577e-07, "logits/chosen": -1.2521380186080933, "logits/rejected": -1.2696040868759155, "logps/chosen": -1.0962046384811401, "logps/rejected": -8.138738632202148, "loss": 1.1084, "odds_ratio_loss": 0.1215173751115799, "rewards/accuracies": 1.0, "rewards/chosen": -0.10962046682834625, "rewards/margins": 0.7042534947395325, "rewards/rejected": -0.8138739466667175, "sft_loss": 1.0962046384811401, "step": 11610 }, { "epoch": 0.9, "grad_norm": 25.551485061645508, "learning_rate": 2.3138953004990027e-07, "logits/chosen": -1.4428867101669312, "logits/rejected": -1.410568356513977, "logps/chosen": -0.6762143969535828, "logps/rejected": -4.25193977355957, "loss": 0.6972, "odds_ratio_loss": 0.21020916104316711, "rewards/accuracies": 1.0, "rewards/chosen": -0.06762143224477768, "rewards/margins": 0.3575725555419922, "rewards/rejected": -0.4251939654350281, "sft_loss": 0.6762143969535828, "step": 11615 }, { "epoch": 0.9, "grad_norm": 10.37377643585205, "learning_rate": 2.2954162979758886e-07, "logits/chosen": -1.334389567375183, "logits/rejected": -0.9677041172981262, "logps/chosen": -1.1673810482025146, "logps/rejected": -3.838226318359375, "loss": 1.1857, "odds_ratio_loss": 0.18330082297325134, "rewards/accuracies": 1.0, "rewards/chosen": -0.11673810333013535, "rewards/margins": 0.267084538936615, "rewards/rejected": -0.38382261991500854, "sft_loss": 1.1673810482025146, "step": 11620 }, { "epoch": 0.9, "grad_norm": 9.882246971130371, "learning_rate": 2.2770096455107692e-07, "logits/chosen": -1.306133508682251, "logits/rejected": -1.1123729944229126, "logps/chosen": -1.3784221410751343, "logps/rejected": -5.239972114562988, "loss": 1.3836, "odds_ratio_loss": 0.051394373178482056, "rewards/accuracies": 1.0, "rewards/chosen": -0.13784220814704895, "rewards/margins": 0.38615497946739197, "rewards/rejected": -0.5239971876144409, "sft_loss": 1.3784221410751343, "step": 11625 }, { "epoch": 0.9, "grad_norm": 96.8947982788086, "learning_rate": 2.2586753710196697e-07, "logits/chosen": -1.4276647567749023, "logits/rejected": -0.7415833473205566, "logps/chosen": -2.1991066932678223, "logps/rejected": -8.009876251220703, "loss": 2.2116, "odds_ratio_loss": 0.1253366321325302, "rewards/accuracies": 1.0, "rewards/chosen": -0.21991066634655, "rewards/margins": 0.581076979637146, "rewards/rejected": -0.8009876012802124, "sft_loss": 2.1991066932678223, "step": 11630 }, { "epoch": 0.91, "grad_norm": 12.267698287963867, "learning_rate": 2.2404135023088415e-07, "logits/chosen": -1.3456906080245972, "logits/rejected": -1.3257023096084595, "logps/chosen": -0.8446849584579468, "logps/rejected": -7.658097743988037, "loss": 0.8484, "odds_ratio_loss": 0.03717377409338951, "rewards/accuracies": 1.0, "rewards/chosen": -0.08446849882602692, "rewards/margins": 0.6813413500785828, "rewards/rejected": -0.7658098340034485, "sft_loss": 0.8446849584579468, "step": 11635 }, { "epoch": 0.91, "grad_norm": 20.532861709594727, "learning_rate": 2.2222240670747297e-07, "logits/chosen": -1.3424028158187866, "logits/rejected": -1.1350343227386475, "logps/chosen": -0.8087455630302429, "logps/rejected": -5.615707874298096, "loss": 0.8261, "odds_ratio_loss": 0.17381128668785095, "rewards/accuracies": 1.0, "rewards/chosen": -0.08087456226348877, "rewards/margins": 0.4806962013244629, "rewards/rejected": -0.5615707635879517, "sft_loss": 0.8087455630302429, "step": 11640 }, { "epoch": 0.91, "grad_norm": 21.549776077270508, "learning_rate": 2.2041070929039233e-07, "logits/chosen": -1.2935678958892822, "logits/rejected": -0.947970986366272, "logps/chosen": -0.965768039226532, "logps/rejected": -8.586902618408203, "loss": 0.9884, "odds_ratio_loss": 0.22596082091331482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09657682478427887, "rewards/margins": 0.7621134519577026, "rewards/rejected": -0.8586903810501099, "sft_loss": 0.965768039226532, "step": 11645 }, { "epoch": 0.91, "grad_norm": 7.439232349395752, "learning_rate": 2.186062607273115e-07, "logits/chosen": -1.5062514543533325, "logits/rejected": -1.2926065921783447, "logps/chosen": -0.9130525588989258, "logps/rejected": -7.870516777038574, "loss": 0.9295, "odds_ratio_loss": 0.16428081691265106, "rewards/accuracies": 1.0, "rewards/chosen": -0.09130525588989258, "rewards/margins": 0.6957464218139648, "rewards/rejected": -0.7870516777038574, "sft_loss": 0.9130525588989258, "step": 11650 }, { "epoch": 0.91, "grad_norm": 6.630489349365234, "learning_rate": 2.1680906375490529e-07, "logits/chosen": -1.2369283437728882, "logits/rejected": -1.2240400314331055, "logps/chosen": -1.8176237344741821, "logps/rejected": -5.021418571472168, "loss": 1.8622, "odds_ratio_loss": 0.4459160268306732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.18176238238811493, "rewards/margins": 0.3203795254230499, "rewards/rejected": -0.5021418333053589, "sft_loss": 1.8176237344741821, "step": 11655 }, { "epoch": 0.91, "grad_norm": 10.349519729614258, "learning_rate": 2.150191210988517e-07, "logits/chosen": -1.407854437828064, "logits/rejected": -1.0259262323379517, "logps/chosen": -1.0435562133789062, "logps/rejected": -6.068276405334473, "loss": 1.0683, "odds_ratio_loss": 0.24705128371715546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10435561835765839, "rewards/margins": 0.5024720430374146, "rewards/rejected": -0.6068276166915894, "sft_loss": 1.0435562133789062, "step": 11660 }, { "epoch": 0.91, "grad_norm": 7.212589263916016, "learning_rate": 2.1323643547382645e-07, "logits/chosen": -1.3311125040054321, "logits/rejected": -1.3817427158355713, "logps/chosen": -1.0456585884094238, "logps/rejected": -6.869471073150635, "loss": 1.0677, "odds_ratio_loss": 0.22031307220458984, "rewards/accuracies": 1.0, "rewards/chosen": -0.10456587374210358, "rewards/margins": 0.5823811888694763, "rewards/rejected": -0.6869471073150635, "sft_loss": 1.0456585884094238, "step": 11665 }, { "epoch": 0.91, "grad_norm": 8.162760734558105, "learning_rate": 2.1146100958349736e-07, "logits/chosen": -1.2604087591171265, "logits/rejected": -1.1009435653686523, "logps/chosen": -0.7238745093345642, "logps/rejected": -2.706557035446167, "loss": 0.7602, "odds_ratio_loss": 0.3634505569934845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0723874568939209, "rewards/margins": 0.19826826453208923, "rewards/rejected": -0.27065569162368774, "sft_loss": 0.7238745093345642, "step": 11670 }, { "epoch": 0.91, "grad_norm": 36.05037307739258, "learning_rate": 2.096928461205233e-07, "logits/chosen": -1.3813269138336182, "logits/rejected": -0.827099621295929, "logps/chosen": -1.0514827966690063, "logps/rejected": -4.236260414123535, "loss": 1.0595, "odds_ratio_loss": 0.07986799627542496, "rewards/accuracies": 1.0, "rewards/chosen": -0.10514827817678452, "rewards/margins": 0.3184778392314911, "rewards/rejected": -0.4236261248588562, "sft_loss": 1.0514827966690063, "step": 11675 }, { "epoch": 0.91, "grad_norm": 9.704886436462402, "learning_rate": 2.0793194776655034e-07, "logits/chosen": -1.2689664363861084, "logits/rejected": -1.1581445932388306, "logps/chosen": -0.964913010597229, "logps/rejected": -5.2943010330200195, "loss": 0.9863, "odds_ratio_loss": 0.21359559893608093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09649130702018738, "rewards/margins": 0.432938814163208, "rewards/rejected": -0.5294301509857178, "sft_loss": 0.964913010597229, "step": 11680 }, { "epoch": 0.91, "grad_norm": 26.077938079833984, "learning_rate": 2.0617831719220273e-07, "logits/chosen": -1.2547130584716797, "logits/rejected": -1.3344730138778687, "logps/chosen": -1.2342766523361206, "logps/rejected": -7.203073024749756, "loss": 1.2524, "odds_ratio_loss": 0.1816709190607071, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12342766672372818, "rewards/margins": 0.5968796610832214, "rewards/rejected": -0.7203073501586914, "sft_loss": 1.2342766523361206, "step": 11685 }, { "epoch": 0.91, "grad_norm": 6.128324031829834, "learning_rate": 2.0443195705708464e-07, "logits/chosen": -1.4026262760162354, "logits/rejected": -1.2854896783828735, "logps/chosen": -1.093899130821228, "logps/rejected": -7.937254905700684, "loss": 1.1075, "odds_ratio_loss": 0.13566581904888153, "rewards/accuracies": 1.0, "rewards/chosen": -0.10938992351293564, "rewards/margins": 0.6843355298042297, "rewards/rejected": -0.7937254905700684, "sft_loss": 1.093899130821228, "step": 11690 }, { "epoch": 0.91, "grad_norm": 8.042830467224121, "learning_rate": 2.0269287000977244e-07, "logits/chosen": -1.3604000806808472, "logits/rejected": -0.6384872198104858, "logps/chosen": -0.9876667857170105, "logps/rejected": -2.6367809772491455, "loss": 1.0193, "odds_ratio_loss": 0.31643930077552795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09876667708158493, "rewards/margins": 0.1649114191532135, "rewards/rejected": -0.26367810368537903, "sft_loss": 0.9876667857170105, "step": 11695 }, { "epoch": 0.91, "grad_norm": 11.282980918884277, "learning_rate": 2.00961058687813e-07, "logits/chosen": -1.3280357122421265, "logits/rejected": -1.0343616008758545, "logps/chosen": -1.097975492477417, "logps/rejected": -5.217041015625, "loss": 1.1192, "odds_ratio_loss": 0.21179921925067902, "rewards/accuracies": 1.0, "rewards/chosen": -0.10979755222797394, "rewards/margins": 0.41190654039382935, "rewards/rejected": -0.5217040777206421, "sft_loss": 1.097975492477417, "step": 11700 }, { "epoch": 0.91, "grad_norm": 29.81282615661621, "learning_rate": 1.99236525717717e-07, "logits/chosen": -1.4915571212768555, "logits/rejected": -1.2468748092651367, "logps/chosen": -0.6215002536773682, "logps/rejected": -9.786310195922852, "loss": 0.6416, "odds_ratio_loss": 0.2014237642288208, "rewards/accuracies": 1.0, "rewards/chosen": -0.0621500238776207, "rewards/margins": 0.9164810180664062, "rewards/rejected": -0.9786310195922852, "sft_loss": 0.6215002536773682, "step": 11705 }, { "epoch": 0.91, "grad_norm": 7.58364725112915, "learning_rate": 1.975192737149595e-07, "logits/chosen": -1.4052196741104126, "logits/rejected": -1.3716819286346436, "logps/chosen": -0.9670282602310181, "logps/rejected": -9.287376403808594, "loss": 0.9827, "odds_ratio_loss": 0.15622581541538239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09670282900333405, "rewards/margins": 0.8320348858833313, "rewards/rejected": -0.9287376403808594, "sft_loss": 0.9670282602310181, "step": 11710 }, { "epoch": 0.91, "grad_norm": 13.548325538635254, "learning_rate": 1.9580930528396936e-07, "logits/chosen": -1.393760323524475, "logits/rejected": -1.1954675912857056, "logps/chosen": -0.9646106958389282, "logps/rejected": -8.41152572631836, "loss": 0.9693, "odds_ratio_loss": 0.046464212238788605, "rewards/accuracies": 1.0, "rewards/chosen": -0.09646106511354446, "rewards/margins": 0.7446915507316589, "rewards/rejected": -0.8411526679992676, "sft_loss": 0.9646106958389282, "step": 11715 }, { "epoch": 0.91, "grad_norm": 21.84276008605957, "learning_rate": 1.9410662301813155e-07, "logits/chosen": -1.3173141479492188, "logits/rejected": -0.8565092086791992, "logps/chosen": -0.9831641912460327, "logps/rejected": -4.476618766784668, "loss": 1.0125, "odds_ratio_loss": 0.2932348847389221, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09831643104553223, "rewards/margins": 0.34934547543525696, "rewards/rejected": -0.4476618766784668, "sft_loss": 0.9831641912460327, "step": 11720 }, { "epoch": 0.91, "grad_norm": 9.901228904724121, "learning_rate": 1.924112294997804e-07, "logits/chosen": -1.4346562623977661, "logits/rejected": -1.0577850341796875, "logps/chosen": -0.8075051307678223, "logps/rejected": -7.850630283355713, "loss": 0.8338, "odds_ratio_loss": 0.2629486918449402, "rewards/accuracies": 1.0, "rewards/chosen": -0.08075051009654999, "rewards/margins": 0.7043125629425049, "rewards/rejected": -0.7850630879402161, "sft_loss": 0.8075051307678223, "step": 11725 }, { "epoch": 0.91, "grad_norm": 6.227826118469238, "learning_rate": 1.9072312730019471e-07, "logits/chosen": -1.2854779958724976, "logits/rejected": -1.1545909643173218, "logps/chosen": -1.0335874557495117, "logps/rejected": -4.049464702606201, "loss": 1.041, "odds_ratio_loss": 0.07380737364292145, "rewards/accuracies": 1.0, "rewards/chosen": -0.10335874557495117, "rewards/margins": 0.3015877902507782, "rewards/rejected": -0.404946506023407, "sft_loss": 1.0335874557495117, "step": 11730 }, { "epoch": 0.91, "grad_norm": 5.330779552459717, "learning_rate": 1.8904231897959646e-07, "logits/chosen": -1.3925060033798218, "logits/rejected": -1.1478973627090454, "logps/chosen": -0.9691879153251648, "logps/rejected": -6.747610569000244, "loss": 0.97, "odds_ratio_loss": 0.007857967168092728, "rewards/accuracies": 1.0, "rewards/chosen": -0.09691879153251648, "rewards/margins": 0.5778422355651855, "rewards/rejected": -0.6747610569000244, "sft_loss": 0.9691879153251648, "step": 11735 }, { "epoch": 0.91, "grad_norm": 3.6288249492645264, "learning_rate": 1.8736880708714434e-07, "logits/chosen": -1.3915761709213257, "logits/rejected": -1.0295811891555786, "logps/chosen": -1.1151198148727417, "logps/rejected": -9.233034133911133, "loss": 1.1344, "odds_ratio_loss": 0.19316698610782623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11151199042797089, "rewards/margins": 0.8117914199829102, "rewards/rejected": -0.9233034253120422, "sft_loss": 1.1151198148727417, "step": 11740 }, { "epoch": 0.91, "grad_norm": 34.54227066040039, "learning_rate": 1.857025941609314e-07, "logits/chosen": -1.496463418006897, "logits/rejected": -1.3544741868972778, "logps/chosen": -0.9142772555351257, "logps/rejected": -13.582223892211914, "loss": 0.9159, "odds_ratio_loss": 0.016292279586195946, "rewards/accuracies": 1.0, "rewards/chosen": -0.09142772853374481, "rewards/margins": 1.2667948007583618, "rewards/rejected": -1.358222484588623, "sft_loss": 0.9142772555351257, "step": 11745 }, { "epoch": 0.91, "grad_norm": 12.939098358154297, "learning_rate": 1.840436827279818e-07, "logits/chosen": -1.2878293991088867, "logits/rejected": -1.8434340953826904, "logps/chosen": -0.953102707862854, "logps/rejected": -16.294424057006836, "loss": 0.9532, "odds_ratio_loss": 0.0005960009293630719, "rewards/accuracies": 1.0, "rewards/chosen": -0.0953102558851242, "rewards/margins": 1.5341323614120483, "rewards/rejected": -1.629442572593689, "sft_loss": 0.953102707862854, "step": 11750 }, { "epoch": 0.91, "grad_norm": 65.51850128173828, "learning_rate": 1.823920753042441e-07, "logits/chosen": -1.3374836444854736, "logits/rejected": -0.9872108697891235, "logps/chosen": -1.0601967573165894, "logps/rejected": -6.977286338806152, "loss": 1.0673, "odds_ratio_loss": 0.07123270630836487, "rewards/accuracies": 1.0, "rewards/chosen": -0.10601967573165894, "rewards/margins": 0.5917090177536011, "rewards/rejected": -0.6977287530899048, "sft_loss": 1.0601967573165894, "step": 11755 }, { "epoch": 0.91, "grad_norm": 16.67684555053711, "learning_rate": 1.8074777439459234e-07, "logits/chosen": -1.4826557636260986, "logits/rejected": -1.2680238485336304, "logps/chosen": -1.2555855512619019, "logps/rejected": -12.583868026733398, "loss": 1.2584, "odds_ratio_loss": 0.028480147942900658, "rewards/accuracies": 1.0, "rewards/chosen": -0.12555855512619019, "rewards/margins": 1.1328282356262207, "rewards/rejected": -1.2583868503570557, "sft_loss": 1.2555855512619019, "step": 11760 }, { "epoch": 0.92, "grad_norm": 11.102761268615723, "learning_rate": 1.7911078249281676e-07, "logits/chosen": -1.3503711223602295, "logits/rejected": -1.167409896850586, "logps/chosen": -0.8589099645614624, "logps/rejected": -7.199994087219238, "loss": 0.8662, "odds_ratio_loss": 0.07283125072717667, "rewards/accuracies": 1.0, "rewards/chosen": -0.085890993475914, "rewards/margins": 0.6341084241867065, "rewards/rejected": -0.7199994325637817, "sft_loss": 0.8589099645614624, "step": 11765 }, { "epoch": 0.92, "grad_norm": 7.360010147094727, "learning_rate": 1.7748110208162306e-07, "logits/chosen": -1.4154503345489502, "logits/rejected": -1.0518730878829956, "logps/chosen": -1.017219066619873, "logps/rejected": -5.722550392150879, "loss": 1.0484, "odds_ratio_loss": 0.311401903629303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10172190517187119, "rewards/margins": 0.4705330729484558, "rewards/rejected": -0.57225501537323, "sft_loss": 1.017219066619873, "step": 11770 }, { "epoch": 0.92, "grad_norm": 15.793622970581055, "learning_rate": 1.7585873563262911e-07, "logits/chosen": -1.444927453994751, "logits/rejected": -1.3852720260620117, "logps/chosen": -1.205545425415039, "logps/rejected": -3.7909297943115234, "loss": 1.2489, "odds_ratio_loss": 0.43350714445114136, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12055455148220062, "rewards/margins": 0.2585384249687195, "rewards/rejected": -0.3790929615497589, "sft_loss": 1.205545425415039, "step": 11775 }, { "epoch": 0.92, "grad_norm": 14.353672981262207, "learning_rate": 1.7424368560635952e-07, "logits/chosen": -1.3821165561676025, "logits/rejected": -0.9330111742019653, "logps/chosen": -0.8571823239326477, "logps/rejected": -6.622941017150879, "loss": 0.8578, "odds_ratio_loss": 0.006473424378782511, "rewards/accuracies": 1.0, "rewards/chosen": -0.08571822941303253, "rewards/margins": 0.5765758752822876, "rewards/rejected": -0.6622940897941589, "sft_loss": 0.8571823239326477, "step": 11780 }, { "epoch": 0.92, "grad_norm": 6.85159158706665, "learning_rate": 1.7263595445224267e-07, "logits/chosen": -1.3977632522583008, "logits/rejected": -0.8465273976325989, "logps/chosen": -0.9446234703063965, "logps/rejected": -6.264396667480469, "loss": 0.9558, "odds_ratio_loss": 0.11224206537008286, "rewards/accuracies": 1.0, "rewards/chosen": -0.09446235001087189, "rewards/margins": 0.5319773554801941, "rewards/rejected": -0.6264396905899048, "sft_loss": 0.9446234703063965, "step": 11785 }, { "epoch": 0.92, "grad_norm": 9.203495979309082, "learning_rate": 1.710355446086065e-07, "logits/chosen": -1.3460767269134521, "logits/rejected": -1.2761690616607666, "logps/chosen": -0.73606938123703, "logps/rejected": -4.091338157653809, "loss": 0.7445, "odds_ratio_loss": 0.08461825549602509, "rewards/accuracies": 1.0, "rewards/chosen": -0.0736069455742836, "rewards/margins": 0.33552688360214233, "rewards/rejected": -0.40913382172584534, "sft_loss": 0.73606938123703, "step": 11790 }, { "epoch": 0.92, "grad_norm": 22.562210083007812, "learning_rate": 1.694424585026766e-07, "logits/chosen": -1.2790594100952148, "logits/rejected": -1.4777812957763672, "logps/chosen": -0.8364423513412476, "logps/rejected": -8.052239418029785, "loss": 0.8528, "odds_ratio_loss": 0.16366323828697205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08364423364400864, "rewards/margins": 0.7215796709060669, "rewards/rejected": -0.8052239418029785, "sft_loss": 0.8364423513412476, "step": 11795 }, { "epoch": 0.92, "grad_norm": 5.110595703125, "learning_rate": 1.6785669855056973e-07, "logits/chosen": -1.2552369832992554, "logits/rejected": -1.2118430137634277, "logps/chosen": -0.8571056127548218, "logps/rejected": -8.143725395202637, "loss": 0.8688, "odds_ratio_loss": 0.11663065105676651, "rewards/accuracies": 1.0, "rewards/chosen": -0.0857105553150177, "rewards/margins": 0.7286620736122131, "rewards/rejected": -0.8143726587295532, "sft_loss": 0.8571056127548218, "step": 11800 }, { "epoch": 0.92, "grad_norm": 0.8979963064193726, "learning_rate": 1.6627826715729266e-07, "logits/chosen": -1.3609285354614258, "logits/rejected": -1.329521894454956, "logps/chosen": -0.5881875157356262, "logps/rejected": -6.507943630218506, "loss": 0.5898, "odds_ratio_loss": 0.015659797936677933, "rewards/accuracies": 1.0, "rewards/chosen": -0.058818746358156204, "rewards/margins": 0.5919756889343262, "rewards/rejected": -0.6507944464683533, "sft_loss": 0.5881875157356262, "step": 11805 }, { "epoch": 0.92, "grad_norm": 8.27506160736084, "learning_rate": 1.6470716671673603e-07, "logits/chosen": -1.2430107593536377, "logits/rejected": -0.9508243799209595, "logps/chosen": -0.7933021783828735, "logps/rejected": -6.344875812530518, "loss": 0.7937, "odds_ratio_loss": 0.003900631098076701, "rewards/accuracies": 1.0, "rewards/chosen": -0.07933022081851959, "rewards/margins": 0.5551573038101196, "rewards/rejected": -0.6344875693321228, "sft_loss": 0.7933021783828735, "step": 11810 }, { "epoch": 0.92, "grad_norm": 62.29372024536133, "learning_rate": 1.6314339961167435e-07, "logits/chosen": -1.3524234294891357, "logits/rejected": -1.283733606338501, "logps/chosen": -0.9083568453788757, "logps/rejected": -6.802338600158691, "loss": 0.9266, "odds_ratio_loss": 0.18251758813858032, "rewards/accuracies": 1.0, "rewards/chosen": -0.09083569049835205, "rewards/margins": 0.589398205280304, "rewards/rejected": -0.6802338361740112, "sft_loss": 0.9083568453788757, "step": 11815 }, { "epoch": 0.92, "grad_norm": 7.945556640625, "learning_rate": 1.6158696821375776e-07, "logits/chosen": -1.3798269033432007, "logits/rejected": -0.9315776824951172, "logps/chosen": -1.0558592081069946, "logps/rejected": -4.040333271026611, "loss": 1.1, "odds_ratio_loss": 0.44100436568260193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10558591037988663, "rewards/margins": 0.29844745993614197, "rewards/rejected": -0.404033362865448, "sft_loss": 1.0558592081069946, "step": 11820 }, { "epoch": 0.92, "grad_norm": 7.206352710723877, "learning_rate": 1.6003787488351298e-07, "logits/chosen": -1.2845613956451416, "logits/rejected": -1.0599045753479004, "logps/chosen": -1.1459189653396606, "logps/rejected": -7.448056697845459, "loss": 1.1656, "odds_ratio_loss": 0.1967560052871704, "rewards/accuracies": 1.0, "rewards/chosen": -0.11459188163280487, "rewards/margins": 0.6302137970924377, "rewards/rejected": -0.7448056936264038, "sft_loss": 1.1459189653396606, "step": 11825 }, { "epoch": 0.92, "grad_norm": 9.853588104248047, "learning_rate": 1.584961219703368e-07, "logits/chosen": -1.4143199920654297, "logits/rejected": -1.3166183233261108, "logps/chosen": -0.7544041872024536, "logps/rejected": -4.371842384338379, "loss": 0.7706, "odds_ratio_loss": 0.1623033583164215, "rewards/accuracies": 1.0, "rewards/chosen": -0.0754404217004776, "rewards/margins": 0.3617438077926636, "rewards/rejected": -0.43718424439430237, "sft_loss": 0.7544041872024536, "step": 11830 }, { "epoch": 0.92, "grad_norm": 31.727481842041016, "learning_rate": 1.569617118124922e-07, "logits/chosen": -1.4243693351745605, "logits/rejected": -1.200060248374939, "logps/chosen": -1.0224988460540771, "logps/rejected": -3.4999337196350098, "loss": 1.1675, "odds_ratio_loss": 1.4497023820877075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1022498831152916, "rewards/margins": 0.24774348735809326, "rewards/rejected": -0.34999337792396545, "sft_loss": 1.0224988460540771, "step": 11835 }, { "epoch": 0.92, "grad_norm": 9.608902931213379, "learning_rate": 1.5543464673710816e-07, "logits/chosen": -1.4377899169921875, "logits/rejected": -0.9992468953132629, "logps/chosen": -0.9196946024894714, "logps/rejected": -5.83154821395874, "loss": 0.9301, "odds_ratio_loss": 0.10370359569787979, "rewards/accuracies": 1.0, "rewards/chosen": -0.09196946769952774, "rewards/margins": 0.49118536710739136, "rewards/rejected": -0.5831547975540161, "sft_loss": 0.9196946024894714, "step": 11840 }, { "epoch": 0.92, "grad_norm": 7.982198715209961, "learning_rate": 1.5391492906017268e-07, "logits/chosen": -1.3677937984466553, "logits/rejected": -1.2269057035446167, "logps/chosen": -0.8073514103889465, "logps/rejected": -7.5669379234313965, "loss": 0.8105, "odds_ratio_loss": 0.03112083114683628, "rewards/accuracies": 1.0, "rewards/chosen": -0.08073513954877853, "rewards/margins": 0.6759586334228516, "rewards/rejected": -0.7566937208175659, "sft_loss": 0.8073514103889465, "step": 11845 }, { "epoch": 0.92, "grad_norm": 18.15665626525879, "learning_rate": 1.5240256108652986e-07, "logits/chosen": -1.446323037147522, "logits/rejected": -1.472093939781189, "logps/chosen": -1.1066490411758423, "logps/rejected": -9.583824157714844, "loss": 1.1195, "odds_ratio_loss": 0.12807539105415344, "rewards/accuracies": 1.0, "rewards/chosen": -0.11066490411758423, "rewards/margins": 0.8477176427841187, "rewards/rejected": -0.9583824872970581, "sft_loss": 1.1066490411758423, "step": 11850 }, { "epoch": 0.92, "grad_norm": 6.199967384338379, "learning_rate": 1.5089754510987875e-07, "logits/chosen": -1.482166051864624, "logits/rejected": -1.2701305150985718, "logps/chosen": -1.2697360515594482, "logps/rejected": -9.479783058166504, "loss": 1.295, "odds_ratio_loss": 0.2527654767036438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12697359919548035, "rewards/margins": 0.8210046887397766, "rewards/rejected": -0.9479783177375793, "sft_loss": 1.2697360515594482, "step": 11855 }, { "epoch": 0.92, "grad_norm": 205.06369018554688, "learning_rate": 1.493998834127658e-07, "logits/chosen": -1.33724844455719, "logits/rejected": -1.0690397024154663, "logps/chosen": -1.4901643991470337, "logps/rejected": -6.617243766784668, "loss": 1.5093, "odds_ratio_loss": 0.19102993607521057, "rewards/accuracies": 1.0, "rewards/chosen": -0.14901643991470337, "rewards/margins": 0.5127079486846924, "rewards/rejected": -0.6617244482040405, "sft_loss": 1.4901643991470337, "step": 11860 }, { "epoch": 0.92, "grad_norm": 5.183155536651611, "learning_rate": 1.4790957826658624e-07, "logits/chosen": -1.3647806644439697, "logits/rejected": -0.6501811742782593, "logps/chosen": -1.1898683309555054, "logps/rejected": -7.271353244781494, "loss": 1.2328, "odds_ratio_loss": 0.4295937418937683, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11898684501647949, "rewards/margins": 0.608148455619812, "rewards/rejected": -0.7271353006362915, "sft_loss": 1.1898683309555054, "step": 11865 }, { "epoch": 0.92, "grad_norm": 7.028154373168945, "learning_rate": 1.4642663193157602e-07, "logits/chosen": -1.2401490211486816, "logits/rejected": -1.5572097301483154, "logps/chosen": -0.8862310647964478, "logps/rejected": -8.077818870544434, "loss": 0.8873, "odds_ratio_loss": 0.010762016288936138, "rewards/accuracies": 1.0, "rewards/chosen": -0.08862310647964478, "rewards/margins": 0.7191587686538696, "rewards/rejected": -0.8077818751335144, "sft_loss": 0.8862310647964478, "step": 11870 }, { "epoch": 0.92, "grad_norm": 13.864395141601562, "learning_rate": 1.449510466568127e-07, "logits/chosen": -1.3671717643737793, "logits/rejected": -1.4441094398498535, "logps/chosen": -0.9278122782707214, "logps/rejected": -5.194039344787598, "loss": 0.9475, "odds_ratio_loss": 0.19683992862701416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09278122335672379, "rewards/margins": 0.4266226887702942, "rewards/rejected": -0.519403874874115, "sft_loss": 0.9278122782707214, "step": 11875 }, { "epoch": 0.92, "grad_norm": 12.953536033630371, "learning_rate": 1.434828246802078e-07, "logits/chosen": -1.3513822555541992, "logits/rejected": -1.2330682277679443, "logps/chosen": -0.8415525555610657, "logps/rejected": -3.681126832962036, "loss": 0.8629, "odds_ratio_loss": 0.21303188800811768, "rewards/accuracies": 1.0, "rewards/chosen": -0.08415525406599045, "rewards/margins": 0.28395745158195496, "rewards/rejected": -0.368112713098526, "sft_loss": 0.8415525555610657, "step": 11880 }, { "epoch": 0.92, "grad_norm": 29.238855361938477, "learning_rate": 1.420219682285062e-07, "logits/chosen": -1.2675743103027344, "logits/rejected": -1.1817070245742798, "logps/chosen": -1.044460654258728, "logps/rejected": -2.4189651012420654, "loss": 1.0879, "odds_ratio_loss": 0.4343862533569336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10444607585668564, "rewards/margins": 0.1374504566192627, "rewards/rejected": -0.24189653992652893, "sft_loss": 1.044460654258728, "step": 11885 }, { "epoch": 0.92, "grad_norm": 5.407149314880371, "learning_rate": 1.4056847951728404e-07, "logits/chosen": -1.2687222957611084, "logits/rejected": -1.027491569519043, "logps/chosen": -0.9911456108093262, "logps/rejected": -4.098135948181152, "loss": 1.0073, "odds_ratio_loss": 0.16173891723155975, "rewards/accuracies": 1.0, "rewards/chosen": -0.0991145670413971, "rewards/margins": 0.31069907546043396, "rewards/rejected": -0.40981364250183105, "sft_loss": 0.9911456108093262, "step": 11890 }, { "epoch": 0.93, "grad_norm": 9.799278259277344, "learning_rate": 1.3912236075093955e-07, "logits/chosen": -1.4086112976074219, "logits/rejected": -0.8215053677558899, "logps/chosen": -1.0313136577606201, "logps/rejected": -4.323803901672363, "loss": 1.0426, "odds_ratio_loss": 0.1132875308394432, "rewards/accuracies": 1.0, "rewards/chosen": -0.10313136875629425, "rewards/margins": 0.3292490243911743, "rewards/rejected": -0.4323803782463074, "sft_loss": 1.0313136577606201, "step": 11895 }, { "epoch": 0.93, "grad_norm": 5.346977233886719, "learning_rate": 1.376836141226956e-07, "logits/chosen": -1.3889329433441162, "logits/rejected": -0.7270419001579285, "logps/chosen": -1.3125561475753784, "logps/rejected": -11.502326011657715, "loss": 1.3134, "odds_ratio_loss": 0.008127482607960701, "rewards/accuracies": 1.0, "rewards/chosen": -0.1312556117773056, "rewards/margins": 1.0189769268035889, "rewards/rejected": -1.1502325534820557, "sft_loss": 1.3125561475753784, "step": 11900 }, { "epoch": 0.93, "grad_norm": 7.43550968170166, "learning_rate": 1.3625224181459507e-07, "logits/chosen": -1.271698236465454, "logits/rejected": -1.1763298511505127, "logps/chosen": -1.30789053440094, "logps/rejected": -4.681346893310547, "loss": 1.3176, "odds_ratio_loss": 0.09740128368139267, "rewards/accuracies": 1.0, "rewards/chosen": -0.13078904151916504, "rewards/margins": 0.3373456597328186, "rewards/rejected": -0.46813470125198364, "sft_loss": 1.30789053440094, "step": 11905 }, { "epoch": 0.93, "grad_norm": 6.522105693817139, "learning_rate": 1.3482824599749534e-07, "logits/chosen": -1.300004005432129, "logits/rejected": -1.1260687112808228, "logps/chosen": -0.82489013671875, "logps/rejected": -7.8272504806518555, "loss": 0.8343, "odds_ratio_loss": 0.0938338190317154, "rewards/accuracies": 1.0, "rewards/chosen": -0.0824890211224556, "rewards/margins": 0.7002360820770264, "rewards/rejected": -0.7827251553535461, "sft_loss": 0.82489013671875, "step": 11910 }, { "epoch": 0.93, "grad_norm": 25.62769889831543, "learning_rate": 1.3341162883106662e-07, "logits/chosen": -1.531531572341919, "logits/rejected": -1.0012396574020386, "logps/chosen": -1.1209322214126587, "logps/rejected": -6.971850395202637, "loss": 1.1345, "odds_ratio_loss": 0.13544395565986633, "rewards/accuracies": 1.0, "rewards/chosen": -0.1120932325720787, "rewards/margins": 0.5850918292999268, "rewards/rejected": -0.6971850395202637, "sft_loss": 1.1209322214126587, "step": 11915 }, { "epoch": 0.93, "grad_norm": 991.5822143554688, "learning_rate": 1.320023924637892e-07, "logits/chosen": -1.2943589687347412, "logits/rejected": -1.1523224115371704, "logps/chosen": -1.4653940200805664, "logps/rejected": -9.920454978942871, "loss": 1.467, "odds_ratio_loss": 0.016162164509296417, "rewards/accuracies": 1.0, "rewards/chosen": -0.14653940498828888, "rewards/margins": 0.8455060720443726, "rewards/rejected": -0.9920455813407898, "sft_loss": 1.4653940200805664, "step": 11920 }, { "epoch": 0.93, "grad_norm": 20.709386825561523, "learning_rate": 1.3060053903294846e-07, "logits/chosen": -1.2096668481826782, "logits/rejected": -0.8821192979812622, "logps/chosen": -0.8613446950912476, "logps/rejected": -7.154592990875244, "loss": 0.8665, "odds_ratio_loss": 0.05132795497775078, "rewards/accuracies": 1.0, "rewards/chosen": -0.08613447099924088, "rewards/margins": 0.6293249130249023, "rewards/rejected": -0.7154593467712402, "sft_loss": 0.8613446950912476, "step": 11925 }, { "epoch": 0.93, "grad_norm": 5.921295642852783, "learning_rate": 1.2920607066463365e-07, "logits/chosen": -1.434833288192749, "logits/rejected": -1.0353233814239502, "logps/chosen": -1.550927758216858, "logps/rejected": -9.426887512207031, "loss": 1.5513, "odds_ratio_loss": 0.00328804855234921, "rewards/accuracies": 1.0, "rewards/chosen": -0.15509279072284698, "rewards/margins": 0.7875960469245911, "rewards/rejected": -0.9426888227462769, "sft_loss": 1.550927758216858, "step": 11930 }, { "epoch": 0.93, "grad_norm": 393.1823425292969, "learning_rate": 1.2781898947373195e-07, "logits/chosen": -1.4327385425567627, "logits/rejected": -1.2880324125289917, "logps/chosen": -1.134372353553772, "logps/rejected": -8.846219062805176, "loss": 1.1626, "odds_ratio_loss": 0.2817860245704651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1134372353553772, "rewards/margins": 0.7711846232414246, "rewards/rejected": -0.8846219182014465, "sft_loss": 1.134372353553772, "step": 11935 }, { "epoch": 0.93, "grad_norm": 3.882423162460327, "learning_rate": 1.2643929756392947e-07, "logits/chosen": -1.3471843004226685, "logits/rejected": -0.8193763494491577, "logps/chosen": -1.3053816556930542, "logps/rejected": -6.060335636138916, "loss": 1.3227, "odds_ratio_loss": 0.17290352284908295, "rewards/accuracies": 1.0, "rewards/chosen": -0.1305381804704666, "rewards/margins": 0.4754953980445862, "rewards/rejected": -0.6060335636138916, "sft_loss": 1.3053816556930542, "step": 11940 }, { "epoch": 0.93, "grad_norm": 80.9352798461914, "learning_rate": 1.2506699702770354e-07, "logits/chosen": -1.4275892972946167, "logits/rejected": -0.9729791879653931, "logps/chosen": -1.1546369791030884, "logps/rejected": -10.712817192077637, "loss": 1.1667, "odds_ratio_loss": 0.12075658142566681, "rewards/accuracies": 1.0, "rewards/chosen": -0.11546371132135391, "rewards/margins": 0.9558179974555969, "rewards/rejected": -1.0712816715240479, "sft_loss": 1.1546369791030884, "step": 11945 }, { "epoch": 0.93, "grad_norm": 9.213315963745117, "learning_rate": 1.2370208994632205e-07, "logits/chosen": -1.321332573890686, "logits/rejected": -0.8984807729721069, "logps/chosen": -0.9224896430969238, "logps/rejected": -4.597050666809082, "loss": 0.9979, "odds_ratio_loss": 0.754401683807373, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09224896878004074, "rewards/margins": 0.3674561083316803, "rewards/rejected": -0.45970505475997925, "sft_loss": 0.9224896430969238, "step": 11950 }, { "epoch": 0.93, "grad_norm": 644.9072875976562, "learning_rate": 1.2234457838984028e-07, "logits/chosen": -1.2838728427886963, "logits/rejected": -1.2300163507461548, "logps/chosen": -1.6868120431900024, "logps/rejected": -8.237083435058594, "loss": 1.6892, "odds_ratio_loss": 0.02369215339422226, "rewards/accuracies": 1.0, "rewards/chosen": -0.16868120431900024, "rewards/margins": 0.6550270915031433, "rewards/rejected": -0.8237083554267883, "sft_loss": 1.6868120431900024, "step": 11955 }, { "epoch": 0.93, "grad_norm": 32.0411376953125, "learning_rate": 1.2099446441709628e-07, "logits/chosen": -1.110761284828186, "logits/rejected": -1.6298236846923828, "logps/chosen": -0.7495521306991577, "logps/rejected": -11.462425231933594, "loss": 0.7499, "odds_ratio_loss": 0.0036002404522150755, "rewards/accuracies": 1.0, "rewards/chosen": -0.07495521008968353, "rewards/margins": 1.0712873935699463, "rewards/rejected": -1.1462424993515015, "sft_loss": 0.7495521306991577, "step": 11960 }, { "epoch": 0.93, "grad_norm": 5.815232276916504, "learning_rate": 1.1965175007571052e-07, "logits/chosen": -1.33306086063385, "logits/rejected": -1.0084116458892822, "logps/chosen": -1.0075197219848633, "logps/rejected": -7.959680080413818, "loss": 1.0209, "odds_ratio_loss": 0.133327916264534, "rewards/accuracies": 1.0, "rewards/chosen": -0.10075198113918304, "rewards/margins": 0.6952160596847534, "rewards/rejected": -0.7959679365158081, "sft_loss": 1.0075197219848633, "step": 11965 }, { "epoch": 0.93, "grad_norm": 8.307555198669434, "learning_rate": 1.1831643740207844e-07, "logits/chosen": -1.2692220211029053, "logits/rejected": -0.9586740732192993, "logps/chosen": -0.9765459895133972, "logps/rejected": -3.8916027545928955, "loss": 0.9861, "odds_ratio_loss": 0.09511784464120865, "rewards/accuracies": 1.0, "rewards/chosen": -0.09765460342168808, "rewards/margins": 0.2915056645870209, "rewards/rejected": -0.38916027545928955, "sft_loss": 0.9765459895133972, "step": 11970 }, { "epoch": 0.93, "grad_norm": 26.505313873291016, "learning_rate": 1.1698852842137176e-07, "logits/chosen": -1.3430640697479248, "logits/rejected": -1.318647027015686, "logps/chosen": -0.8846755027770996, "logps/rejected": -11.83702278137207, "loss": 0.9585, "odds_ratio_loss": 0.7379862666130066, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0884675532579422, "rewards/margins": 1.095234751701355, "rewards/rejected": -1.1837023496627808, "sft_loss": 0.8846755027770996, "step": 11975 }, { "epoch": 0.93, "grad_norm": 8.90914535522461, "learning_rate": 1.1566802514753284e-07, "logits/chosen": -1.4078346490859985, "logits/rejected": -1.1965572834014893, "logps/chosen": -0.8551615476608276, "logps/rejected": -5.059370994567871, "loss": 0.8636, "odds_ratio_loss": 0.08399681746959686, "rewards/accuracies": 1.0, "rewards/chosen": -0.08551616221666336, "rewards/margins": 0.42042097449302673, "rewards/rejected": -0.5059371590614319, "sft_loss": 0.8551615476608276, "step": 11980 }, { "epoch": 0.93, "grad_norm": 8.81097412109375, "learning_rate": 1.1435492958327243e-07, "logits/chosen": -1.3314521312713623, "logits/rejected": -1.2199690341949463, "logps/chosen": -0.9615247845649719, "logps/rejected": -7.449499607086182, "loss": 0.9763, "odds_ratio_loss": 0.1473933905363083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09615248441696167, "rewards/margins": 0.6487975120544434, "rewards/rejected": -0.744949996471405, "sft_loss": 0.9615247845649719, "step": 11985 }, { "epoch": 0.93, "grad_norm": 5.545321941375732, "learning_rate": 1.1304924372006754e-07, "logits/chosen": -1.4224355220794678, "logits/rejected": -0.9253866076469421, "logps/chosen": -1.4553496837615967, "logps/rejected": -7.184508323669434, "loss": 1.4743, "odds_ratio_loss": 0.18964393436908722, "rewards/accuracies": 1.0, "rewards/chosen": -0.1455349624156952, "rewards/margins": 0.5729159116744995, "rewards/rejected": -0.7184508442878723, "sft_loss": 1.4553496837615967, "step": 11990 }, { "epoch": 0.93, "grad_norm": 4.908974647521973, "learning_rate": 1.1175096953815578e-07, "logits/chosen": -1.211637020111084, "logits/rejected": -1.2191622257232666, "logps/chosen": -1.2892272472381592, "logps/rejected": -13.501127243041992, "loss": 1.337, "odds_ratio_loss": 0.4774898588657379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1289227306842804, "rewards/margins": 1.221190094947815, "rewards/rejected": -1.350112795829773, "sft_loss": 1.2892272472381592, "step": 11995 }, { "epoch": 0.93, "grad_norm": 12.566999435424805, "learning_rate": 1.1046010900653492e-07, "logits/chosen": -1.3955318927764893, "logits/rejected": -0.9571416974067688, "logps/chosen": -0.9366915822029114, "logps/rejected": -7.75982141494751, "loss": 0.9803, "odds_ratio_loss": 0.43617886304855347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09366915374994278, "rewards/margins": 0.6823130249977112, "rewards/rejected": -0.775982141494751, "sft_loss": 0.9366915822029114, "step": 12000 }, { "epoch": 0.93, "grad_norm": 35.92390441894531, "learning_rate": 1.0917666408295891e-07, "logits/chosen": -1.4041101932525635, "logits/rejected": -1.0649272203445435, "logps/chosen": -0.9326528310775757, "logps/rejected": -2.555864095687866, "loss": 0.9705, "odds_ratio_loss": 0.37830302119255066, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09326529502868652, "rewards/margins": 0.16232113540172577, "rewards/rejected": -0.2555864453315735, "sft_loss": 0.9326528310775757, "step": 12005 }, { "epoch": 0.93, "grad_norm": 7.222818374633789, "learning_rate": 1.0790063671393514e-07, "logits/chosen": -1.2876554727554321, "logits/rejected": -1.0530130863189697, "logps/chosen": -0.8531826138496399, "logps/rejected": -6.352944850921631, "loss": 0.8805, "odds_ratio_loss": 0.2727716565132141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08531826734542847, "rewards/margins": 0.5499762296676636, "rewards/rejected": -0.6352945566177368, "sft_loss": 0.8531826138496399, "step": 12010 }, { "epoch": 0.93, "grad_norm": 78.87431335449219, "learning_rate": 1.0663202883472056e-07, "logits/chosen": -1.2207971811294556, "logits/rejected": -1.0271674394607544, "logps/chosen": -0.8058179616928101, "logps/rejected": -8.2181396484375, "loss": 0.8072, "odds_ratio_loss": 0.013833925127983093, "rewards/accuracies": 1.0, "rewards/chosen": -0.08058180660009384, "rewards/margins": 0.74123215675354, "rewards/rejected": -0.8218139410018921, "sft_loss": 0.8058179616928101, "step": 12015 }, { "epoch": 0.94, "grad_norm": 5.772078990936279, "learning_rate": 1.0537084236932116e-07, "logits/chosen": -1.1963145732879639, "logits/rejected": -0.8109409213066101, "logps/chosen": -0.7392154335975647, "logps/rejected": -6.433381080627441, "loss": 0.7564, "odds_ratio_loss": 0.171526238322258, "rewards/accuracies": 1.0, "rewards/chosen": -0.07392154633998871, "rewards/margins": 0.5694166421890259, "rewards/rejected": -0.6433380842208862, "sft_loss": 0.7392154335975647, "step": 12020 }, { "epoch": 0.94, "grad_norm": 28.807552337646484, "learning_rate": 1.041170792304852e-07, "logits/chosen": -1.1929874420166016, "logits/rejected": -1.4887889623641968, "logps/chosen": -1.1478734016418457, "logps/rejected": -8.757756233215332, "loss": 1.1484, "odds_ratio_loss": 0.0055931126698851585, "rewards/accuracies": 1.0, "rewards/chosen": -0.11478734016418457, "rewards/margins": 0.7609882354736328, "rewards/rejected": -0.8757756352424622, "sft_loss": 1.1478734016418457, "step": 12025 }, { "epoch": 0.94, "grad_norm": 6.5143585205078125, "learning_rate": 1.0287074131970387e-07, "logits/chosen": -1.3817237615585327, "logits/rejected": -1.1475965976715088, "logps/chosen": -1.1112757921218872, "logps/rejected": -5.863193035125732, "loss": 1.118, "odds_ratio_loss": 0.06756018102169037, "rewards/accuracies": 1.0, "rewards/chosen": -0.1111275777220726, "rewards/margins": 0.47519174218177795, "rewards/rejected": -0.5863193273544312, "sft_loss": 1.1112757921218872, "step": 12030 }, { "epoch": 0.94, "grad_norm": 4.472811698913574, "learning_rate": 1.0163183052720793e-07, "logits/chosen": -1.0868616104125977, "logits/rejected": -1.0630944967269897, "logps/chosen": -0.9792786836624146, "logps/rejected": -5.954896450042725, "loss": 1.0019, "odds_ratio_loss": 0.22617638111114502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09792786091566086, "rewards/margins": 0.4975617825984955, "rewards/rejected": -0.5954896807670593, "sft_loss": 0.9792786836624146, "step": 12035 }, { "epoch": 0.94, "grad_norm": 15.370599746704102, "learning_rate": 1.0040034873196158e-07, "logits/chosen": -1.3447648286819458, "logits/rejected": -1.0325714349746704, "logps/chosen": -0.956787109375, "logps/rejected": -2.4064972400665283, "loss": 0.9877, "odds_ratio_loss": 0.3093990385532379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09567871689796448, "rewards/margins": 0.14497099816799164, "rewards/rejected": -0.2406497299671173, "sft_loss": 0.956787109375, "step": 12040 }, { "epoch": 0.94, "grad_norm": 9.366061210632324, "learning_rate": 9.91762978016636e-08, "logits/chosen": -1.3803406953811646, "logits/rejected": -0.5240996479988098, "logps/chosen": -1.0151456594467163, "logps/rejected": -7.575808525085449, "loss": 1.0166, "odds_ratio_loss": 0.014451740309596062, "rewards/accuracies": 1.0, "rewards/chosen": -0.10151456296443939, "rewards/margins": 0.6560662984848022, "rewards/rejected": -0.7575808763504028, "sft_loss": 1.0151456594467163, "step": 12045 }, { "epoch": 0.94, "grad_norm": 6.190619945526123, "learning_rate": 9.795967959274233e-08, "logits/chosen": -1.1493083238601685, "logits/rejected": -1.1575744152069092, "logps/chosen": -0.8835114240646362, "logps/rejected": -9.733977317810059, "loss": 0.898, "odds_ratio_loss": 0.1443968117237091, "rewards/accuracies": 1.0, "rewards/chosen": -0.08835114538669586, "rewards/margins": 0.8850466012954712, "rewards/rejected": -0.9733978509902954, "sft_loss": 0.8835114240646362, "step": 12050 }, { "epoch": 0.94, "grad_norm": 5.491006374359131, "learning_rate": 9.675049595035512e-08, "logits/chosen": -1.180174708366394, "logits/rejected": -0.6385637521743774, "logps/chosen": -1.3686999082565308, "logps/rejected": -13.49418830871582, "loss": 1.3689, "odds_ratio_loss": 0.0024213658180087805, "rewards/accuracies": 1.0, "rewards/chosen": -0.13686999678611755, "rewards/margins": 1.2125487327575684, "rewards/rejected": -1.3494187593460083, "sft_loss": 1.3686999082565308, "step": 12055 }, { "epoch": 0.94, "grad_norm": 10.283293724060059, "learning_rate": 9.554874870838116e-08, "logits/chosen": -1.3981066942214966, "logits/rejected": -1.540945291519165, "logps/chosen": -0.962783932685852, "logps/rejected": -6.7717413902282715, "loss": 0.9941, "odds_ratio_loss": 0.31289738416671753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09627839177846909, "rewards/margins": 0.5808957815170288, "rewards/rejected": -0.6771741509437561, "sft_loss": 0.962783932685852, "step": 12060 }, { "epoch": 0.94, "grad_norm": 17.973472595214844, "learning_rate": 9.435443968942304e-08, "logits/chosen": -1.2509690523147583, "logits/rejected": -0.672872006893158, "logps/chosen": -1.6948953866958618, "logps/rejected": -6.384387969970703, "loss": 1.7539, "odds_ratio_loss": 0.5902327299118042, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1694895476102829, "rewards/margins": 0.46894925832748413, "rewards/rejected": -0.6384388208389282, "sft_loss": 1.6948953866958618, "step": 12065 }, { "epoch": 0.94, "grad_norm": 46.34778594970703, "learning_rate": 9.316757070480242e-08, "logits/chosen": -1.2243728637695312, "logits/rejected": -0.6550508737564087, "logps/chosen": -1.2963842153549194, "logps/rejected": -4.396199703216553, "loss": 1.3026, "odds_ratio_loss": 0.0616978295147419, "rewards/accuracies": 1.0, "rewards/chosen": -0.1296384334564209, "rewards/margins": 0.3099815547466278, "rewards/rejected": -0.4396200180053711, "sft_loss": 1.2963842153549194, "step": 12070 }, { "epoch": 0.94, "grad_norm": 14.527528762817383, "learning_rate": 9.198814355455666e-08, "logits/chosen": -1.4315307140350342, "logits/rejected": -1.2783674001693726, "logps/chosen": -0.5937285423278809, "logps/rejected": -5.994349002838135, "loss": 0.6022, "odds_ratio_loss": 0.08455837517976761, "rewards/accuracies": 1.0, "rewards/chosen": -0.059372853487730026, "rewards/margins": 0.5400620698928833, "rewards/rejected": -0.5994349718093872, "sft_loss": 0.5937285423278809, "step": 12075 }, { "epoch": 0.94, "grad_norm": 14.392807006835938, "learning_rate": 9.08161600274371e-08, "logits/chosen": -1.3758456707000732, "logits/rejected": -0.9627407789230347, "logps/chosen": -1.0425665378570557, "logps/rejected": -3.557486057281494, "loss": 1.1027, "odds_ratio_loss": 0.6015771627426147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10425666719675064, "rewards/margins": 0.2514919340610504, "rewards/rejected": -0.35574859380722046, "sft_loss": 1.0425665378570557, "step": 12080 }, { "epoch": 0.94, "grad_norm": 13.73567008972168, "learning_rate": 8.965162190090415e-08, "logits/chosen": -1.4193708896636963, "logits/rejected": -0.9948325157165527, "logps/chosen": -1.0957233905792236, "logps/rejected": -4.032468795776367, "loss": 1.1318, "odds_ratio_loss": 0.361260324716568, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10957233607769012, "rewards/margins": 0.293674498796463, "rewards/rejected": -0.4032468795776367, "sft_loss": 1.0957233905792236, "step": 12085 }, { "epoch": 0.94, "grad_norm": 221.73533630371094, "learning_rate": 8.849453094112947e-08, "logits/chosen": -1.3586000204086304, "logits/rejected": -0.698357343673706, "logps/chosen": -1.1419107913970947, "logps/rejected": -3.3975303173065186, "loss": 1.1667, "odds_ratio_loss": 0.24787549674510956, "rewards/accuracies": 1.0, "rewards/chosen": -0.11419107764959335, "rewards/margins": 0.2255619764328003, "rewards/rejected": -0.33975309133529663, "sft_loss": 1.1419107913970947, "step": 12090 }, { "epoch": 0.94, "grad_norm": 8.304841995239258, "learning_rate": 8.734488890298765e-08, "logits/chosen": -1.3388170003890991, "logits/rejected": -0.8920143842697144, "logps/chosen": -0.9899235963821411, "logps/rejected": -5.051036834716797, "loss": 1.0105, "odds_ratio_loss": 0.20586714148521423, "rewards/accuracies": 1.0, "rewards/chosen": -0.09899236261844635, "rewards/margins": 0.40611138939857483, "rewards/rejected": -0.5051037073135376, "sft_loss": 0.9899235963821411, "step": 12095 }, { "epoch": 0.94, "grad_norm": 61.597381591796875, "learning_rate": 8.620269753005617e-08, "logits/chosen": -1.508922815322876, "logits/rejected": -0.8960739374160767, "logps/chosen": -1.2426116466522217, "logps/rejected": -8.366430282592773, "loss": 1.2661, "odds_ratio_loss": 0.23516330122947693, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12426116317510605, "rewards/margins": 0.7123818397521973, "rewards/rejected": -0.8366430401802063, "sft_loss": 1.2426116466522217, "step": 12100 }, { "epoch": 0.94, "grad_norm": 14.718682289123535, "learning_rate": 8.506795855461381e-08, "logits/chosen": -1.3858963251113892, "logits/rejected": -1.4522991180419922, "logps/chosen": -1.1231282949447632, "logps/rejected": -5.925871849060059, "loss": 1.1419, "odds_ratio_loss": 0.18807430565357208, "rewards/accuracies": 1.0, "rewards/chosen": -0.11231283843517303, "rewards/margins": 0.4802742898464203, "rewards/rejected": -0.5925871729850769, "sft_loss": 1.1231282949447632, "step": 12105 }, { "epoch": 0.94, "grad_norm": 27.117656707763672, "learning_rate": 8.394067369763725e-08, "logits/chosen": -1.331491470336914, "logits/rejected": -1.3383785486221313, "logps/chosen": -0.893126368522644, "logps/rejected": -4.682895183563232, "loss": 0.908, "odds_ratio_loss": 0.14834722876548767, "rewards/accuracies": 1.0, "rewards/chosen": -0.08931264281272888, "rewards/margins": 0.37897688150405884, "rewards/rejected": -0.4682895541191101, "sft_loss": 0.893126368522644, "step": 12110 }, { "epoch": 0.94, "grad_norm": 10.441815376281738, "learning_rate": 8.282084466879503e-08, "logits/chosen": -1.4479217529296875, "logits/rejected": -1.075995683670044, "logps/chosen": -0.9278053045272827, "logps/rejected": -6.618409633636475, "loss": 0.9594, "odds_ratio_loss": 0.3158452808856964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09278053045272827, "rewards/margins": 0.5690604448318481, "rewards/rejected": -0.6618409156799316, "sft_loss": 0.9278053045272827, "step": 12115 }, { "epoch": 0.94, "grad_norm": 11.735318183898926, "learning_rate": 8.170847316645247e-08, "logits/chosen": -1.4747486114501953, "logits/rejected": -1.2501027584075928, "logps/chosen": -0.8782304525375366, "logps/rejected": -5.13653039932251, "loss": 0.8838, "odds_ratio_loss": 0.05616645887494087, "rewards/accuracies": 1.0, "rewards/chosen": -0.0878230482339859, "rewards/margins": 0.4258299767971039, "rewards/rejected": -0.5136530995368958, "sft_loss": 0.8782304525375366, "step": 12120 }, { "epoch": 0.94, "grad_norm": 11.163629531860352, "learning_rate": 8.060356087766063e-08, "logits/chosen": -1.396041989326477, "logits/rejected": -1.07887601852417, "logps/chosen": -1.0224263668060303, "logps/rejected": -2.5870959758758545, "loss": 1.062, "odds_ratio_loss": 0.3955123722553253, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10224263370037079, "rewards/margins": 0.1564669907093048, "rewards/rejected": -0.2587096095085144, "sft_loss": 1.0224263668060303, "step": 12125 }, { "epoch": 0.94, "grad_norm": 9.195205688476562, "learning_rate": 7.950610947815907e-08, "logits/chosen": -1.3456220626831055, "logits/rejected": -0.5792462229728699, "logps/chosen": -0.9115538597106934, "logps/rejected": -1.8077294826507568, "loss": 0.9513, "odds_ratio_loss": 0.39775362610816956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09115538746118546, "rewards/margins": 0.08961758762598038, "rewards/rejected": -0.18077296018600464, "sft_loss": 0.9115538597106934, "step": 12130 }, { "epoch": 0.94, "grad_norm": 194.4700469970703, "learning_rate": 7.841612063237303e-08, "logits/chosen": -1.3239562511444092, "logits/rejected": -1.120084285736084, "logps/chosen": -0.8774365186691284, "logps/rejected": -9.447786331176758, "loss": 0.9098, "odds_ratio_loss": 0.3234691321849823, "rewards/accuracies": 1.0, "rewards/chosen": -0.08774365484714508, "rewards/margins": 0.8570348620414734, "rewards/rejected": -0.944778561592102, "sft_loss": 0.8774365186691284, "step": 12135 }, { "epoch": 0.94, "grad_norm": 9.07659912109375, "learning_rate": 7.733359599340906e-08, "logits/chosen": -1.4422779083251953, "logits/rejected": -1.4027847051620483, "logps/chosen": -1.2413756847381592, "logps/rejected": -4.607026100158691, "loss": 1.2533, "odds_ratio_loss": 0.1192028746008873, "rewards/accuracies": 1.0, "rewards/chosen": -0.12413756549358368, "rewards/margins": 0.3365650475025177, "rewards/rejected": -0.4607025980949402, "sft_loss": 1.2413756847381592, "step": 12140 }, { "epoch": 0.94, "grad_norm": 158.15708923339844, "learning_rate": 7.625853720305276e-08, "logits/chosen": -1.3283551931381226, "logits/rejected": -1.3912734985351562, "logps/chosen": -0.695563018321991, "logps/rejected": -8.136407852172852, "loss": 0.6959, "odds_ratio_loss": 0.0031926899682730436, "rewards/accuracies": 1.0, "rewards/chosen": -0.06955631077289581, "rewards/margins": 0.7440845370292664, "rewards/rejected": -0.8136407732963562, "sft_loss": 0.695563018321991, "step": 12145 }, { "epoch": 0.95, "grad_norm": 10.927029609680176, "learning_rate": 7.519094589176711e-08, "logits/chosen": -1.3552180528640747, "logits/rejected": -0.7659333944320679, "logps/chosen": -0.9943283200263977, "logps/rejected": -6.943314552307129, "loss": 0.999, "odds_ratio_loss": 0.04680733382701874, "rewards/accuracies": 1.0, "rewards/chosen": -0.09943283349275589, "rewards/margins": 0.5948985815048218, "rewards/rejected": -0.6943314671516418, "sft_loss": 0.9943283200263977, "step": 12150 }, { "epoch": 0.95, "grad_norm": 12.59920883178711, "learning_rate": 7.41308236786903e-08, "logits/chosen": -1.3633660078048706, "logits/rejected": -1.2992112636566162, "logps/chosen": -1.0420236587524414, "logps/rejected": -10.714728355407715, "loss": 1.0427, "odds_ratio_loss": 0.006580235902220011, "rewards/accuracies": 1.0, "rewards/chosen": -0.10420236736536026, "rewards/margins": 0.9672704935073853, "rewards/rejected": -1.0714728832244873, "sft_loss": 1.0420236587524414, "step": 12155 }, { "epoch": 0.95, "grad_norm": 4.964155673980713, "learning_rate": 7.307817217163226e-08, "logits/chosen": -1.3251888751983643, "logits/rejected": -0.5417923927307129, "logps/chosen": -0.9761824607849121, "logps/rejected": -8.093366622924805, "loss": 0.9885, "odds_ratio_loss": 0.12305097281932831, "rewards/accuracies": 1.0, "rewards/chosen": -0.09761824458837509, "rewards/margins": 0.7117183804512024, "rewards/rejected": -0.8093365430831909, "sft_loss": 0.9761824607849121, "step": 12160 }, { "epoch": 0.95, "grad_norm": 104.98523712158203, "learning_rate": 7.203299296707156e-08, "logits/chosen": -1.3231998682022095, "logits/rejected": -1.129874348640442, "logps/chosen": -1.044856071472168, "logps/rejected": -5.618155479431152, "loss": 1.0523, "odds_ratio_loss": 0.07447656989097595, "rewards/accuracies": 1.0, "rewards/chosen": -0.10448561608791351, "rewards/margins": 0.45732998847961426, "rewards/rejected": -0.5618155598640442, "sft_loss": 1.044856071472168, "step": 12165 }, { "epoch": 0.95, "grad_norm": 5.787536144256592, "learning_rate": 7.099528765015684e-08, "logits/chosen": -1.4294707775115967, "logits/rejected": -1.1909528970718384, "logps/chosen": -1.0964525938034058, "logps/rejected": -2.062704563140869, "loss": 1.1325, "odds_ratio_loss": 0.360903263092041, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10964526236057281, "rewards/margins": 0.09662520885467529, "rewards/rejected": -0.20627045631408691, "sft_loss": 1.0964525938034058, "step": 12170 }, { "epoch": 0.95, "grad_norm": 10.634527206420898, "learning_rate": 6.996505779469976e-08, "logits/chosen": -1.3393810987472534, "logits/rejected": -1.0690622329711914, "logps/chosen": -0.9409956932067871, "logps/rejected": -6.221067905426025, "loss": 0.9422, "odds_ratio_loss": 0.011605637148022652, "rewards/accuracies": 1.0, "rewards/chosen": -0.09409955888986588, "rewards/margins": 0.5280072689056396, "rewards/rejected": -0.6221068501472473, "sft_loss": 0.9409956932067871, "step": 12175 }, { "epoch": 0.95, "grad_norm": 6.429897785186768, "learning_rate": 6.894230496317322e-08, "logits/chosen": -1.4484320878982544, "logits/rejected": -0.8284847140312195, "logps/chosen": -1.3208070993423462, "logps/rejected": -7.3359880447387695, "loss": 1.335, "odds_ratio_loss": 0.1418505609035492, "rewards/accuracies": 1.0, "rewards/chosen": -0.13208071887493134, "rewards/margins": 0.6015180349349976, "rewards/rejected": -0.7335987687110901, "sft_loss": 1.3208070993423462, "step": 12180 }, { "epoch": 0.95, "grad_norm": 7.727414131164551, "learning_rate": 6.792703070671258e-08, "logits/chosen": -1.229412317276001, "logits/rejected": -1.1320011615753174, "logps/chosen": -0.8664442896842957, "logps/rejected": -10.342384338378906, "loss": 0.8741, "odds_ratio_loss": 0.07685581594705582, "rewards/accuracies": 1.0, "rewards/chosen": -0.08664443343877792, "rewards/margins": 0.9475939869880676, "rewards/rejected": -1.0342384576797485, "sft_loss": 0.8664442896842957, "step": 12185 }, { "epoch": 0.95, "grad_norm": 8.296463012695312, "learning_rate": 6.691923656511112e-08, "logits/chosen": -1.295880913734436, "logits/rejected": -1.4863216876983643, "logps/chosen": -1.0527490377426147, "logps/rejected": -10.10430908203125, "loss": 1.0625, "odds_ratio_loss": 0.09733770787715912, "rewards/accuracies": 1.0, "rewards/chosen": -0.10527490079402924, "rewards/margins": 0.9051558375358582, "rewards/rejected": -1.0104308128356934, "sft_loss": 1.0527490377426147, "step": 12190 }, { "epoch": 0.95, "grad_norm": 395.551513671875, "learning_rate": 6.591892406681511e-08, "logits/chosen": -1.233620285987854, "logits/rejected": -1.1905428171157837, "logps/chosen": -1.7282718420028687, "logps/rejected": -2.440072536468506, "loss": 1.7949, "odds_ratio_loss": 0.6666890978813171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17282718420028687, "rewards/margins": 0.07118009030818939, "rewards/rejected": -0.24400727450847626, "sft_loss": 1.7282718420028687, "step": 12195 }, { "epoch": 0.95, "grad_norm": 6.149363040924072, "learning_rate": 6.492609472892653e-08, "logits/chosen": -1.2824854850769043, "logits/rejected": -1.3274608850479126, "logps/chosen": -1.017652153968811, "logps/rejected": -10.595731735229492, "loss": 1.0283, "odds_ratio_loss": 0.10598143190145493, "rewards/accuracies": 1.0, "rewards/chosen": -0.1017652153968811, "rewards/margins": 0.9578080177307129, "rewards/rejected": -1.0595732927322388, "sft_loss": 1.017652153968811, "step": 12200 }, { "epoch": 0.95, "grad_norm": 8.832098007202148, "learning_rate": 6.394075005719647e-08, "logits/chosen": -1.3609025478363037, "logits/rejected": -1.0700979232788086, "logps/chosen": -0.8298861384391785, "logps/rejected": -4.105466365814209, "loss": 0.8455, "odds_ratio_loss": 0.1559799760580063, "rewards/accuracies": 1.0, "rewards/chosen": -0.08298861235380173, "rewards/margins": 0.3275580406188965, "rewards/rejected": -0.4105466902256012, "sft_loss": 0.8298861384391785, "step": 12205 }, { "epoch": 0.95, "grad_norm": 7.1576247215271, "learning_rate": 6.296289154602508e-08, "logits/chosen": -1.4908638000488281, "logits/rejected": -1.1165975332260132, "logps/chosen": -0.8523378372192383, "logps/rejected": -3.153052568435669, "loss": 0.8723, "odds_ratio_loss": 0.19982339441776276, "rewards/accuracies": 1.0, "rewards/chosen": -0.08523379266262054, "rewards/margins": 0.23007145524024963, "rewards/rejected": -0.315305233001709, "sft_loss": 0.8523378372192383, "step": 12210 }, { "epoch": 0.95, "grad_norm": 5.327239990234375, "learning_rate": 6.199252067845995e-08, "logits/chosen": -1.408676266670227, "logits/rejected": -1.2627909183502197, "logps/chosen": -1.008329153060913, "logps/rejected": -7.339837074279785, "loss": 1.0153, "odds_ratio_loss": 0.06966123729944229, "rewards/accuracies": 1.0, "rewards/chosen": -0.10083291679620743, "rewards/margins": 0.6331508159637451, "rewards/rejected": -0.7339836955070496, "sft_loss": 1.008329153060913, "step": 12215 }, { "epoch": 0.95, "grad_norm": 331.38897705078125, "learning_rate": 6.102963892619107e-08, "logits/chosen": -1.4222207069396973, "logits/rejected": -1.1397409439086914, "logps/chosen": -1.2288148403167725, "logps/rejected": -9.242072105407715, "loss": 1.2508, "odds_ratio_loss": 0.21948783099651337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12288150936365128, "rewards/margins": 0.801325798034668, "rewards/rejected": -0.924207329750061, "sft_loss": 1.2288148403167725, "step": 12220 }, { "epoch": 0.95, "grad_norm": 16.810434341430664, "learning_rate": 6.007424774955029e-08, "logits/chosen": -1.2084866762161255, "logits/rejected": -1.2693732976913452, "logps/chosen": -0.8313711285591125, "logps/rejected": -3.6703248023986816, "loss": 0.8445, "odds_ratio_loss": 0.13097386062145233, "rewards/accuracies": 1.0, "rewards/chosen": -0.08313710987567902, "rewards/margins": 0.2838953733444214, "rewards/rejected": -0.3670324683189392, "sft_loss": 0.8313711285591125, "step": 12225 }, { "epoch": 0.95, "grad_norm": 26.042701721191406, "learning_rate": 5.912634859751021e-08, "logits/chosen": -1.2757648229599, "logits/rejected": -1.3488729000091553, "logps/chosen": -0.9625293016433716, "logps/rejected": -2.616014003753662, "loss": 1.0387, "odds_ratio_loss": 0.7618128657341003, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0962529331445694, "rewards/margins": 0.16534848511219025, "rewards/rejected": -0.26160138845443726, "sft_loss": 0.9625293016433716, "step": 12230 }, { "epoch": 0.95, "grad_norm": 9.009185791015625, "learning_rate": 5.818594290768087e-08, "logits/chosen": -1.4181190729141235, "logits/rejected": -1.1298072338104248, "logps/chosen": -0.9780190587043762, "logps/rejected": -5.396805286407471, "loss": 1.0051, "odds_ratio_loss": 0.2709425091743469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09780190140008926, "rewards/margins": 0.44187861680984497, "rewards/rejected": -0.5396804809570312, "sft_loss": 0.9780190587043762, "step": 12235 }, { "epoch": 0.95, "grad_norm": 33.64002990722656, "learning_rate": 5.725303210630584e-08, "logits/chosen": -1.25346839427948, "logits/rejected": -0.8549555540084839, "logps/chosen": -0.9498344659805298, "logps/rejected": -3.4202792644500732, "loss": 0.9662, "odds_ratio_loss": 0.16371150314807892, "rewards/accuracies": 1.0, "rewards/chosen": -0.09498345106840134, "rewards/margins": 0.24704448878765106, "rewards/rejected": -0.3420279622077942, "sft_loss": 0.9498344659805298, "step": 12240 }, { "epoch": 0.95, "grad_norm": 15.04161548614502, "learning_rate": 5.632761760826333e-08, "logits/chosen": -1.1823722124099731, "logits/rejected": -1.113013505935669, "logps/chosen": -0.8765622973442078, "logps/rejected": -2.5976319313049316, "loss": 0.9287, "odds_ratio_loss": 0.5214719772338867, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.08765622973442078, "rewards/margins": 0.17210696637630463, "rewards/rejected": -0.2597631812095642, "sft_loss": 0.8765622973442078, "step": 12245 }, { "epoch": 0.95, "grad_norm": 6.058566570281982, "learning_rate": 5.540970081706176e-08, "logits/chosen": -1.280879259109497, "logits/rejected": -0.4485422968864441, "logps/chosen": -1.225573182106018, "logps/rejected": -9.1913480758667, "loss": 1.2274, "odds_ratio_loss": 0.01814914494752884, "rewards/accuracies": 1.0, "rewards/chosen": -0.12255732715129852, "rewards/margins": 0.796577513217926, "rewards/rejected": -0.919134795665741, "sft_loss": 1.225573182106018, "step": 12250 }, { "epoch": 0.95, "grad_norm": 31.721729278564453, "learning_rate": 5.449928312483865e-08, "logits/chosen": -1.1671860218048096, "logits/rejected": -1.2594536542892456, "logps/chosen": -1.1395418643951416, "logps/rejected": -4.042827129364014, "loss": 1.1664, "odds_ratio_loss": 0.26880699396133423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11395418643951416, "rewards/margins": 0.2903285324573517, "rewards/rejected": -0.40428265929222107, "sft_loss": 1.1395418643951416, "step": 12255 }, { "epoch": 0.95, "grad_norm": 618.9825439453125, "learning_rate": 5.359636591235784e-08, "logits/chosen": -1.311244249343872, "logits/rejected": -1.2541598081588745, "logps/chosen": -1.1040689945220947, "logps/rejected": -5.051757335662842, "loss": 1.1462, "odds_ratio_loss": 0.42150840163230896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11040691286325455, "rewards/margins": 0.3947688341140747, "rewards/rejected": -0.505175769329071, "sft_loss": 1.1040689945220947, "step": 12260 }, { "epoch": 0.95, "grad_norm": 18.62577247619629, "learning_rate": 5.270095054900781e-08, "logits/chosen": -1.414301872253418, "logits/rejected": -1.191956639289856, "logps/chosen": -1.1366630792617798, "logps/rejected": -9.936189651489258, "loss": 1.1743, "odds_ratio_loss": 0.37666797637939453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11366631090641022, "rewards/margins": 0.8799527287483215, "rewards/rejected": -0.9936189651489258, "sft_loss": 1.1366630792617798, "step": 12265 }, { "epoch": 0.95, "grad_norm": 8.314948081970215, "learning_rate": 5.1813038392800056e-08, "logits/chosen": -1.1787521839141846, "logits/rejected": -1.5429284572601318, "logps/chosen": -1.432403802871704, "logps/rejected": -10.608168601989746, "loss": 1.4325, "odds_ratio_loss": 0.0005790928844362497, "rewards/accuracies": 1.0, "rewards/chosen": -0.14324037730693817, "rewards/margins": 0.9175764918327332, "rewards/rejected": -1.060817003250122, "sft_loss": 1.432403802871704, "step": 12270 }, { "epoch": 0.95, "grad_norm": 10.948424339294434, "learning_rate": 5.0932630790366256e-08, "logits/chosen": -1.2694766521453857, "logits/rejected": -1.3637231588363647, "logps/chosen": -0.8769696950912476, "logps/rejected": -7.397462368011475, "loss": 0.8855, "odds_ratio_loss": 0.08508679270744324, "rewards/accuracies": 1.0, "rewards/chosen": -0.08769698441028595, "rewards/margins": 0.6520493030548096, "rewards/rejected": -0.7397462725639343, "sft_loss": 0.8769696950912476, "step": 12275 }, { "epoch": 0.96, "grad_norm": 31.869291305541992, "learning_rate": 5.0059729076955e-08, "logits/chosen": -1.192575216293335, "logits/rejected": -1.6547155380249023, "logps/chosen": -1.0308337211608887, "logps/rejected": -12.254032135009766, "loss": 1.0309, "odds_ratio_loss": 0.0003266182611696422, "rewards/accuracies": 1.0, "rewards/chosen": -0.10308338701725006, "rewards/margins": 1.1223198175430298, "rewards/rejected": -1.2254031896591187, "sft_loss": 1.0308337211608887, "step": 12280 }, { "epoch": 0.96, "grad_norm": 24.861595153808594, "learning_rate": 4.919433457643452e-08, "logits/chosen": -1.0728105306625366, "logits/rejected": -1.2247686386108398, "logps/chosen": -1.1914570331573486, "logps/rejected": -6.046353340148926, "loss": 1.2243, "odds_ratio_loss": 0.32865655422210693, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1191457062959671, "rewards/margins": 0.4854896664619446, "rewards/rejected": -0.6046353578567505, "sft_loss": 1.1914570331573486, "step": 12285 }, { "epoch": 0.96, "grad_norm": 33.83651351928711, "learning_rate": 4.8336448601283835e-08, "logits/chosen": -1.2199666500091553, "logits/rejected": -1.2750102281570435, "logps/chosen": -1.2813136577606201, "logps/rejected": -5.206510066986084, "loss": 1.3284, "odds_ratio_loss": 0.47062787413597107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12813135981559753, "rewards/margins": 0.39251965284347534, "rewards/rejected": -0.5206509828567505, "sft_loss": 1.2813136577606201, "step": 12290 }, { "epoch": 0.96, "grad_norm": 6.180647373199463, "learning_rate": 4.748607245259606e-08, "logits/chosen": -1.3814548254013062, "logits/rejected": -0.8651509284973145, "logps/chosen": -0.760599672794342, "logps/rejected": -11.416925430297852, "loss": 0.7704, "odds_ratio_loss": 0.09805931150913239, "rewards/accuracies": 1.0, "rewards/chosen": -0.0760599672794342, "rewards/margins": 1.0656325817108154, "rewards/rejected": -1.1416925191879272, "sft_loss": 0.760599672794342, "step": 12295 }, { "epoch": 0.96, "grad_norm": 208.45420837402344, "learning_rate": 4.664320742007622e-08, "logits/chosen": -1.432703971862793, "logits/rejected": -1.4251139163970947, "logps/chosen": -1.8889224529266357, "logps/rejected": -6.2691545486450195, "loss": 1.9051, "odds_ratio_loss": 0.16187646985054016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18889224529266357, "rewards/margins": 0.438023179769516, "rewards/rejected": -0.6269153356552124, "sft_loss": 1.8889224529266357, "step": 12300 }, { "epoch": 0.96, "grad_norm": 8.938104629516602, "learning_rate": 4.580785478203453e-08, "logits/chosen": -1.2809088230133057, "logits/rejected": -1.1958422660827637, "logps/chosen": -1.5119210481643677, "logps/rejected": -7.5544843673706055, "loss": 1.5902, "odds_ratio_loss": 0.7823742628097534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1511920988559723, "rewards/margins": 0.6042563915252686, "rewards/rejected": -0.7554485201835632, "sft_loss": 1.5119210481643677, "step": 12305 }, { "epoch": 0.96, "grad_norm": 28.944644927978516, "learning_rate": 4.49800158053898e-08, "logits/chosen": -1.3874715566635132, "logits/rejected": -1.027039885520935, "logps/chosen": -1.0698702335357666, "logps/rejected": -11.652139663696289, "loss": 1.0706, "odds_ratio_loss": 0.007088521029800177, "rewards/accuracies": 1.0, "rewards/chosen": -0.10698702186346054, "rewards/margins": 1.0582268238067627, "rewards/rejected": -1.165213942527771, "sft_loss": 1.0698702335357666, "step": 12310 }, { "epoch": 0.96, "grad_norm": 14.06658935546875, "learning_rate": 4.4159691745664925e-08, "logits/chosen": -1.4516162872314453, "logits/rejected": -1.3610146045684814, "logps/chosen": -1.4354077577590942, "logps/rejected": -4.483978271484375, "loss": 1.4797, "odds_ratio_loss": 0.4433773458003998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.14354076981544495, "rewards/margins": 0.304857075214386, "rewards/rejected": -0.44839781522750854, "sft_loss": 1.4354077577590942, "step": 12315 }, { "epoch": 0.96, "grad_norm": 13.312085151672363, "learning_rate": 4.3346883846985265e-08, "logits/chosen": -1.4321238994598389, "logits/rejected": -0.7923226356506348, "logps/chosen": -1.0463868379592896, "logps/rejected": -9.889020919799805, "loss": 1.0495, "odds_ratio_loss": 0.03073304519057274, "rewards/accuracies": 1.0, "rewards/chosen": -0.10463868081569672, "rewards/margins": 0.8842633962631226, "rewards/rejected": -0.9889020919799805, "sft_loss": 1.0463868379592896, "step": 12320 }, { "epoch": 0.96, "grad_norm": 123.09896087646484, "learning_rate": 4.254159334207752e-08, "logits/chosen": -1.0274879932403564, "logits/rejected": -1.069379448890686, "logps/chosen": -1.4011691808700562, "logps/rejected": -12.469643592834473, "loss": 1.4012, "odds_ratio_loss": 0.0007433110731653869, "rewards/accuracies": 1.0, "rewards/chosen": -0.14011691510677338, "rewards/margins": 1.1068474054336548, "rewards/rejected": -1.2469643354415894, "sft_loss": 1.4011691808700562, "step": 12325 }, { "epoch": 0.96, "grad_norm": 6.3734283447265625, "learning_rate": 4.174382145226696e-08, "logits/chosen": -1.2176523208618164, "logits/rejected": -1.2900688648223877, "logps/chosen": -0.5494667887687683, "logps/rejected": -6.461459159851074, "loss": 0.554, "odds_ratio_loss": 0.04508071392774582, "rewards/accuracies": 1.0, "rewards/chosen": -0.05494668334722519, "rewards/margins": 0.5911992788314819, "rewards/rejected": -0.6461459398269653, "sft_loss": 0.5494667887687683, "step": 12330 }, { "epoch": 0.96, "grad_norm": 19.052757263183594, "learning_rate": 4.09535693874763e-08, "logits/chosen": -1.4111024141311646, "logits/rejected": -1.15664803981781, "logps/chosen": -0.7811176180839539, "logps/rejected": -4.984771728515625, "loss": 0.7908, "odds_ratio_loss": 0.09680463373661041, "rewards/accuracies": 1.0, "rewards/chosen": -0.07811176776885986, "rewards/margins": 0.4203653931617737, "rewards/rejected": -0.49847716093063354, "sft_loss": 0.7811176180839539, "step": 12335 }, { "epoch": 0.96, "grad_norm": 23.22859001159668, "learning_rate": 4.017083834622237e-08, "logits/chosen": -1.2132599353790283, "logits/rejected": -1.0425660610198975, "logps/chosen": -0.9321387410163879, "logps/rejected": -3.740152359008789, "loss": 0.9503, "odds_ratio_loss": 0.18153466284275055, "rewards/accuracies": 1.0, "rewards/chosen": -0.09321387857198715, "rewards/margins": 0.28080135583877563, "rewards/rejected": -0.3740152418613434, "sft_loss": 0.9321387410163879, "step": 12340 }, { "epoch": 0.96, "grad_norm": 12.329089164733887, "learning_rate": 3.9395629515616154e-08, "logits/chosen": -1.3570067882537842, "logits/rejected": -1.0711150169372559, "logps/chosen": -0.8609020113945007, "logps/rejected": -13.920463562011719, "loss": 0.8757, "odds_ratio_loss": 0.14800772070884705, "rewards/accuracies": 1.0, "rewards/chosen": -0.08609020709991455, "rewards/margins": 1.3059561252593994, "rewards/rejected": -1.392046332359314, "sft_loss": 0.8609020113945007, "step": 12345 }, { "epoch": 0.96, "grad_norm": 18.560041427612305, "learning_rate": 3.862794407136106e-08, "logits/chosen": -1.2778164148330688, "logits/rejected": -1.038604736328125, "logps/chosen": -1.2508759498596191, "logps/rejected": -8.479180335998535, "loss": 1.2699, "odds_ratio_loss": 0.18985147774219513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12508758902549744, "rewards/margins": 0.7228304147720337, "rewards/rejected": -0.8479180335998535, "sft_loss": 1.2508759498596191, "step": 12350 }, { "epoch": 0.96, "grad_norm": 61.77004623413086, "learning_rate": 3.786778317774964e-08, "logits/chosen": -1.0532138347625732, "logits/rejected": -1.1754436492919922, "logps/chosen": -0.764786422252655, "logps/rejected": -3.444678544998169, "loss": 0.7887, "odds_ratio_loss": 0.23928098380565643, "rewards/accuracies": 1.0, "rewards/chosen": -0.07647864520549774, "rewards/margins": 0.26798921823501587, "rewards/rejected": -0.3444678485393524, "sft_loss": 0.764786422252655, "step": 12355 }, { "epoch": 0.96, "grad_norm": 21.275634765625, "learning_rate": 3.711514798766081e-08, "logits/chosen": -1.2819563150405884, "logits/rejected": -0.8122344017028809, "logps/chosen": -1.0787274837493896, "logps/rejected": -2.4202353954315186, "loss": 1.1184, "odds_ratio_loss": 0.39671677350997925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10787274688482285, "rewards/margins": 0.13415075838565826, "rewards/rejected": -0.2420235425233841, "sft_loss": 1.0787274837493896, "step": 12360 }, { "epoch": 0.96, "grad_norm": 62.028438568115234, "learning_rate": 3.6370039642563134e-08, "logits/chosen": -1.3025977611541748, "logits/rejected": -1.6483367681503296, "logps/chosen": -0.8876152038574219, "logps/rejected": -7.4527907371521, "loss": 0.9039, "odds_ratio_loss": 0.16312487423419952, "rewards/accuracies": 1.0, "rewards/chosen": -0.08876152336597443, "rewards/margins": 0.6565175652503967, "rewards/rejected": -0.7452791333198547, "sft_loss": 0.8876152038574219, "step": 12365 }, { "epoch": 0.96, "grad_norm": 18.61726951599121, "learning_rate": 3.563245927250714e-08, "logits/chosen": -1.4143457412719727, "logits/rejected": -1.4389795064926147, "logps/chosen": -0.9123845100402832, "logps/rejected": -9.375925064086914, "loss": 0.9339, "odds_ratio_loss": 0.21503356099128723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09123845398426056, "rewards/margins": 0.8463540077209473, "rewards/rejected": -0.9375923871994019, "sft_loss": 0.9123845100402832, "step": 12370 }, { "epoch": 0.96, "grad_norm": 13.771241188049316, "learning_rate": 3.490240799612743e-08, "logits/chosen": -1.2913004159927368, "logits/rejected": -1.2106720209121704, "logps/chosen": -1.0773588418960571, "logps/rejected": -5.933541297912598, "loss": 1.0871, "odds_ratio_loss": 0.09785932302474976, "rewards/accuracies": 1.0, "rewards/chosen": -0.10773588716983795, "rewards/margins": 0.4856182634830475, "rewards/rejected": -0.5933541059494019, "sft_loss": 1.0773588418960571, "step": 12375 }, { "epoch": 0.96, "grad_norm": 165.80172729492188, "learning_rate": 3.417988692063945e-08, "logits/chosen": -1.4670937061309814, "logits/rejected": -1.0459048748016357, "logps/chosen": -0.7823060750961304, "logps/rejected": -2.592039108276367, "loss": 0.8162, "odds_ratio_loss": 0.3391726016998291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07823061943054199, "rewards/margins": 0.18097327649593353, "rewards/rejected": -0.25920388102531433, "sft_loss": 0.7823060750961304, "step": 12380 }, { "epoch": 0.96, "grad_norm": 12.715102195739746, "learning_rate": 3.346489714183831e-08, "logits/chosen": -1.2532730102539062, "logits/rejected": -0.949592113494873, "logps/chosen": -0.8318389654159546, "logps/rejected": -4.871085166931152, "loss": 0.8441, "odds_ratio_loss": 0.12211360782384872, "rewards/accuracies": 1.0, "rewards/chosen": -0.0831838995218277, "rewards/margins": 0.4039246141910553, "rewards/rejected": -0.4871085286140442, "sft_loss": 0.8318389654159546, "step": 12385 }, { "epoch": 0.96, "grad_norm": 271.7589111328125, "learning_rate": 3.275743974409606e-08, "logits/chosen": -1.3090143203735352, "logits/rejected": -1.1848065853118896, "logps/chosen": -0.8043609857559204, "logps/rejected": -11.306020736694336, "loss": 0.8107, "odds_ratio_loss": 0.06342881172895432, "rewards/accuracies": 1.0, "rewards/chosen": -0.0804360955953598, "rewards/margins": 1.050166130065918, "rewards/rejected": -1.1306021213531494, "sft_loss": 0.8043609857559204, "step": 12390 }, { "epoch": 0.96, "grad_norm": 80.97168731689453, "learning_rate": 3.20575158003622e-08, "logits/chosen": -1.466761827468872, "logits/rejected": -0.7243001461029053, "logps/chosen": -1.1298373937606812, "logps/rejected": -4.683624744415283, "loss": 1.1529, "odds_ratio_loss": 0.23061838746070862, "rewards/accuracies": 1.0, "rewards/chosen": -0.11298374831676483, "rewards/margins": 0.3553787171840668, "rewards/rejected": -0.468362420797348, "sft_loss": 1.1298373937606812, "step": 12395 }, { "epoch": 0.96, "grad_norm": 45.660179138183594, "learning_rate": 3.1365126372159824e-08, "logits/chosen": -1.3778924942016602, "logits/rejected": -1.3838534355163574, "logps/chosen": -0.791498601436615, "logps/rejected": -5.726881980895996, "loss": 0.8549, "odds_ratio_loss": 0.6341503262519836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07914986461400986, "rewards/margins": 0.4935383200645447, "rewards/rejected": -0.5726882219314575, "sft_loss": 0.791498601436615, "step": 12400 }, { "epoch": 0.96, "grad_norm": 19.550127029418945, "learning_rate": 3.068027250958616e-08, "logits/chosen": -1.4337947368621826, "logits/rejected": -1.0507450103759766, "logps/chosen": -0.8702392578125, "logps/rejected": -4.506911277770996, "loss": 0.9357, "odds_ratio_loss": 0.6542928814888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08702392876148224, "rewards/margins": 0.36366719007492065, "rewards/rejected": -0.4506911337375641, "sft_loss": 0.8702392578125, "step": 12405 }, { "epoch": 0.97, "grad_norm": 33.817935943603516, "learning_rate": 3.0002955251308696e-08, "logits/chosen": -1.412219762802124, "logits/rejected": -1.1537320613861084, "logps/chosen": -0.9598878026008606, "logps/rejected": -1.8418006896972656, "loss": 1.0055, "odds_ratio_loss": 0.4561440348625183, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09598878771066666, "rewards/margins": 0.08819130063056946, "rewards/rejected": -0.18418008089065552, "sft_loss": 0.9598878026008606, "step": 12410 }, { "epoch": 0.97, "grad_norm": 17.341407775878906, "learning_rate": 2.9333175624565168e-08, "logits/chosen": -1.4794400930404663, "logits/rejected": -1.7852928638458252, "logps/chosen": -0.536201000213623, "logps/rejected": -7.498734951019287, "loss": 0.5399, "odds_ratio_loss": 0.037034954875707626, "rewards/accuracies": 1.0, "rewards/chosen": -0.053620100021362305, "rewards/margins": 0.6962534189224243, "rewards/rejected": -0.7498735189437866, "sft_loss": 0.536201000213623, "step": 12415 }, { "epoch": 0.97, "grad_norm": 3.3973031044006348, "learning_rate": 2.8670934645160797e-08, "logits/chosen": -1.4210028648376465, "logits/rejected": -1.1338955163955688, "logps/chosen": -1.047786831855774, "logps/rejected": -5.595246315002441, "loss": 1.0528, "odds_ratio_loss": 0.04966248571872711, "rewards/accuracies": 1.0, "rewards/chosen": -0.10477868467569351, "rewards/margins": 0.45474591851234436, "rewards/rejected": -0.5595245957374573, "sft_loss": 1.047786831855774, "step": 12420 }, { "epoch": 0.97, "grad_norm": 5.257033824920654, "learning_rate": 2.8016233317468834e-08, "logits/chosen": -1.3157680034637451, "logits/rejected": -0.8841627240180969, "logps/chosen": -0.8723659515380859, "logps/rejected": -5.532425880432129, "loss": 0.8923, "odds_ratio_loss": 0.1988508403301239, "rewards/accuracies": 1.0, "rewards/chosen": -0.08723659813404083, "rewards/margins": 0.46600595116615295, "rewards/rejected": -0.553242564201355, "sft_loss": 0.8723659515380859, "step": 12425 }, { "epoch": 0.97, "grad_norm": 66.54361724853516, "learning_rate": 2.7369072634426673e-08, "logits/chosen": -1.225722074508667, "logits/rejected": -1.364844560623169, "logps/chosen": -1.2202316522598267, "logps/rejected": -5.396240711212158, "loss": 1.2492, "odds_ratio_loss": 0.2900741696357727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12202316522598267, "rewards/margins": 0.4176008701324463, "rewards/rejected": -0.5396240949630737, "sft_loss": 1.2202316522598267, "step": 12430 }, { "epoch": 0.97, "grad_norm": 7.29395055770874, "learning_rate": 2.672945357753587e-08, "logits/chosen": -1.2899141311645508, "logits/rejected": -1.6844837665557861, "logps/chosen": -0.9342865943908691, "logps/rejected": -15.596992492675781, "loss": 0.9355, "odds_ratio_loss": 0.012498864904046059, "rewards/accuracies": 1.0, "rewards/chosen": -0.09342865645885468, "rewards/margins": 1.4662706851959229, "rewards/rejected": -1.559699296951294, "sft_loss": 0.9342865943908691, "step": 12435 }, { "epoch": 0.97, "grad_norm": 8.378362655639648, "learning_rate": 2.6097377116859335e-08, "logits/chosen": -1.3625600337982178, "logits/rejected": -1.1308603286743164, "logps/chosen": -1.2010387182235718, "logps/rejected": -9.78742504119873, "loss": 1.2424, "odds_ratio_loss": 0.4139278829097748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1201038733124733, "rewards/margins": 0.8586385846138, "rewards/rejected": -0.9787424802780151, "sft_loss": 1.2010387182235718, "step": 12440 }, { "epoch": 0.97, "grad_norm": 12.528746604919434, "learning_rate": 2.547284421102192e-08, "logits/chosen": -1.3684858083724976, "logits/rejected": -0.7313657999038696, "logps/chosen": -1.1039519309997559, "logps/rejected": -5.332610607147217, "loss": 1.1141, "odds_ratio_loss": 0.10197708755731583, "rewards/accuracies": 1.0, "rewards/chosen": -0.11039519309997559, "rewards/margins": 0.4228658676147461, "rewards/rejected": -0.5332610607147217, "sft_loss": 1.1039519309997559, "step": 12445 }, { "epoch": 0.97, "grad_norm": 104.84793090820312, "learning_rate": 2.4855855807206508e-08, "logits/chosen": -1.4106018543243408, "logits/rejected": -1.2714979648590088, "logps/chosen": -1.1574375629425049, "logps/rejected": -4.188596725463867, "loss": 1.1862, "odds_ratio_loss": 0.28758615255355835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11574375629425049, "rewards/margins": 0.3031159043312073, "rewards/rejected": -0.41885966062545776, "sft_loss": 1.1574375629425049, "step": 12450 }, { "epoch": 0.97, "grad_norm": 17.179094314575195, "learning_rate": 2.4246412841155144e-08, "logits/chosen": -1.2052602767944336, "logits/rejected": -1.1856670379638672, "logps/chosen": -1.4439436197280884, "logps/rejected": -5.836577892303467, "loss": 1.4997, "odds_ratio_loss": 0.5576989650726318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14439436793327332, "rewards/margins": 0.4392634332180023, "rewards/rejected": -0.5836578607559204, "sft_loss": 1.4439436197280884, "step": 12455 }, { "epoch": 0.97, "grad_norm": 20.688180923461914, "learning_rate": 2.3644516237164572e-08, "logits/chosen": -1.226061224937439, "logits/rejected": -1.5945135354995728, "logps/chosen": -0.8379164934158325, "logps/rejected": -7.608580112457275, "loss": 0.8551, "odds_ratio_loss": 0.17142853140830994, "rewards/accuracies": 1.0, "rewards/chosen": -0.08379164338111877, "rewards/margins": 0.677066445350647, "rewards/rejected": -0.7608579993247986, "sft_loss": 0.8379164934158325, "step": 12460 }, { "epoch": 0.97, "grad_norm": 11.189611434936523, "learning_rate": 2.305016690808848e-08, "logits/chosen": -1.3278387784957886, "logits/rejected": -1.1817538738250732, "logps/chosen": -0.8746849298477173, "logps/rejected": -3.890235185623169, "loss": 0.9015, "odds_ratio_loss": 0.26853257417678833, "rewards/accuracies": 1.0, "rewards/chosen": -0.08746849000453949, "rewards/margins": 0.3015550374984741, "rewards/rejected": -0.3890235424041748, "sft_loss": 0.8746849298477173, "step": 12465 }, { "epoch": 0.97, "grad_norm": 28.143857955932617, "learning_rate": 2.2463365755331924e-08, "logits/chosen": -1.3462903499603271, "logits/rejected": -0.7961365580558777, "logps/chosen": -1.0623729228973389, "logps/rejected": -5.807260513305664, "loss": 1.0717, "odds_ratio_loss": 0.09347637742757797, "rewards/accuracies": 1.0, "rewards/chosen": -0.10623729228973389, "rewards/margins": 0.4744887351989746, "rewards/rejected": -0.5807260870933533, "sft_loss": 1.0623729228973389, "step": 12470 }, { "epoch": 0.97, "grad_norm": 5.191647052764893, "learning_rate": 2.1884113668853567e-08, "logits/chosen": -1.3941482305526733, "logits/rejected": -0.8888921737670898, "logps/chosen": -1.0999834537506104, "logps/rejected": -5.673184871673584, "loss": 1.1023, "odds_ratio_loss": 0.02334499917924404, "rewards/accuracies": 1.0, "rewards/chosen": -0.10999833047389984, "rewards/margins": 0.4573201537132263, "rewards/rejected": -0.5673185586929321, "sft_loss": 1.0999834537506104, "step": 12475 }, { "epoch": 0.97, "grad_norm": 8.231232643127441, "learning_rate": 2.1312411527164012e-08, "logits/chosen": -1.2910339832305908, "logits/rejected": -1.4623374938964844, "logps/chosen": -0.9894932508468628, "logps/rejected": -9.08910083770752, "loss": 0.9928, "odds_ratio_loss": 0.033546727150678635, "rewards/accuracies": 1.0, "rewards/chosen": -0.09894932806491852, "rewards/margins": 0.8099607229232788, "rewards/rejected": -0.9089100956916809, "sft_loss": 0.9894932508468628, "step": 12480 }, { "epoch": 0.97, "grad_norm": 4.641633033752441, "learning_rate": 2.0748260197320234e-08, "logits/chosen": -1.130858063697815, "logits/rejected": -1.4460694789886475, "logps/chosen": -0.6807447671890259, "logps/rejected": -5.400501251220703, "loss": 0.6967, "odds_ratio_loss": 0.15972675383090973, "rewards/accuracies": 1.0, "rewards/chosen": -0.06807447969913483, "rewards/margins": 0.471975713968277, "rewards/rejected": -0.5400501489639282, "sft_loss": 0.6807447671890259, "step": 12485 }, { "epoch": 0.97, "grad_norm": 5.488857269287109, "learning_rate": 2.0191660534931158e-08, "logits/chosen": -1.1198769807815552, "logits/rejected": -1.112151861190796, "logps/chosen": -0.9378183484077454, "logps/rejected": -1.6302454471588135, "loss": 0.9778, "odds_ratio_loss": 0.3997967541217804, "rewards/accuracies": 1.0, "rewards/chosen": -0.09378183633089066, "rewards/margins": 0.0692427009344101, "rewards/rejected": -0.16302451491355896, "sft_loss": 0.9378183484077454, "step": 12490 }, { "epoch": 0.97, "grad_norm": 15.16747760772705, "learning_rate": 1.9642613384149302e-08, "logits/chosen": -1.2848796844482422, "logits/rejected": -1.223474383354187, "logps/chosen": -1.5398253202438354, "logps/rejected": -5.566136360168457, "loss": 1.555, "odds_ratio_loss": 0.1515004187822342, "rewards/accuracies": 1.0, "rewards/chosen": -0.15398254990577698, "rewards/margins": 0.40263113379478455, "rewards/rejected": -0.5566136240959167, "sft_loss": 1.5398253202438354, "step": 12495 }, { "epoch": 0.97, "grad_norm": 19.638164520263672, "learning_rate": 1.910111957767524e-08, "logits/chosen": -1.4474903345108032, "logits/rejected": -1.3258614540100098, "logps/chosen": -0.6606062054634094, "logps/rejected": -5.556430816650391, "loss": 0.6655, "odds_ratio_loss": 0.04882761836051941, "rewards/accuracies": 1.0, "rewards/chosen": -0.0660606175661087, "rewards/margins": 0.48958244919776917, "rewards/rejected": -0.5556430816650391, "sft_loss": 0.6606062054634094, "step": 12500 }, { "epoch": 0.97, "grad_norm": 5.311905384063721, "learning_rate": 1.856717993675261e-08, "logits/chosen": -1.2535361051559448, "logits/rejected": -1.0046923160552979, "logps/chosen": -0.9040173292160034, "logps/rejected": -11.26391315460205, "loss": 0.9184, "odds_ratio_loss": 0.14369972050189972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09040174633264542, "rewards/margins": 1.03598952293396, "rewards/rejected": -1.1263912916183472, "sft_loss": 0.9040173292160034, "step": 12505 }, { "epoch": 0.97, "grad_norm": 10.375931739807129, "learning_rate": 1.8040795271169753e-08, "logits/chosen": -1.3679031133651733, "logits/rejected": -0.9070374369621277, "logps/chosen": -1.0435655117034912, "logps/rejected": -5.098996639251709, "loss": 1.0562, "odds_ratio_loss": 0.1265898197889328, "rewards/accuracies": 1.0, "rewards/chosen": -0.1043565645813942, "rewards/margins": 0.40554314851760864, "rewards/rejected": -0.5098997354507446, "sft_loss": 1.0435655117034912, "step": 12510 }, { "epoch": 0.97, "grad_norm": 4.680568695068359, "learning_rate": 1.752196637925474e-08, "logits/chosen": -1.3530833721160889, "logits/rejected": -1.097294569015503, "logps/chosen": -0.7488092184066772, "logps/rejected": -8.338506698608398, "loss": 0.7525, "odds_ratio_loss": 0.0365881472826004, "rewards/accuracies": 1.0, "rewards/chosen": -0.0748809203505516, "rewards/margins": 0.7589698433876038, "rewards/rejected": -0.8338507413864136, "sft_loss": 0.7488092184066772, "step": 12515 }, { "epoch": 0.97, "grad_norm": 6.964818477630615, "learning_rate": 1.7010694047877585e-08, "logits/chosen": -1.293218970298767, "logits/rejected": -0.6837180256843567, "logps/chosen": -1.0612825155258179, "logps/rejected": -6.177232265472412, "loss": 1.0979, "odds_ratio_loss": 0.36641591787338257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1061282530426979, "rewards/margins": 0.5115949511528015, "rewards/rejected": -0.6177231669425964, "sft_loss": 1.0612825155258179, "step": 12520 }, { "epoch": 0.97, "grad_norm": 15.219218254089355, "learning_rate": 1.650697905244747e-08, "logits/chosen": -1.2578482627868652, "logits/rejected": -1.422055959701538, "logps/chosen": -0.8639196157455444, "logps/rejected": -6.498141288757324, "loss": 0.8792, "odds_ratio_loss": 0.15299847722053528, "rewards/accuracies": 1.0, "rewards/chosen": -0.08639196306467056, "rewards/margins": 0.5634222030639648, "rewards/rejected": -0.6498141288757324, "sft_loss": 0.8639196157455444, "step": 12525 }, { "epoch": 0.97, "grad_norm": 4.948149681091309, "learning_rate": 1.6010822156913297e-08, "logits/chosen": -1.411399483680725, "logits/rejected": -0.9298642873764038, "logps/chosen": -1.153228521347046, "logps/rejected": -7.6135687828063965, "loss": 1.1708, "odds_ratio_loss": 0.17584821581840515, "rewards/accuracies": 1.0, "rewards/chosen": -0.11532286554574966, "rewards/margins": 0.6460340619087219, "rewards/rejected": -0.7613569498062134, "sft_loss": 1.153228521347046, "step": 12530 }, { "epoch": 0.98, "grad_norm": 15.22192668914795, "learning_rate": 1.55222241137587e-08, "logits/chosen": -1.3863394260406494, "logits/rejected": -1.158898115158081, "logps/chosen": -1.0210561752319336, "logps/rejected": -6.618653774261475, "loss": 1.0234, "odds_ratio_loss": 0.023359347134828568, "rewards/accuracies": 1.0, "rewards/chosen": -0.10210561752319336, "rewards/margins": 0.559759795665741, "rewards/rejected": -0.6618653535842896, "sft_loss": 1.0210561752319336, "step": 12535 }, { "epoch": 0.98, "grad_norm": 16.71721076965332, "learning_rate": 1.5041185664005365e-08, "logits/chosen": -1.3702729940414429, "logits/rejected": -1.153377890586853, "logps/chosen": -0.9755480885505676, "logps/rejected": -5.225513458251953, "loss": 0.9837, "odds_ratio_loss": 0.08182965219020844, "rewards/accuracies": 1.0, "rewards/chosen": -0.09755481034517288, "rewards/margins": 0.424996554851532, "rewards/rejected": -0.5225513577461243, "sft_loss": 0.9755480885505676, "step": 12540 }, { "epoch": 0.98, "grad_norm": 97.78411865234375, "learning_rate": 1.4567707537209153e-08, "logits/chosen": -1.2511688470840454, "logits/rejected": -1.3687814474105835, "logps/chosen": -0.8645130395889282, "logps/rejected": -6.271538257598877, "loss": 0.8895, "odds_ratio_loss": 0.2500821650028229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08645130693912506, "rewards/margins": 0.5407025218009949, "rewards/rejected": -0.6271538138389587, "sft_loss": 0.8645130395889282, "step": 12545 }, { "epoch": 0.98, "grad_norm": 183.43133544921875, "learning_rate": 1.410179045145954e-08, "logits/chosen": -1.3008298873901367, "logits/rejected": -1.3373351097106934, "logps/chosen": -1.174229383468628, "logps/rejected": -5.585914134979248, "loss": 1.1844, "odds_ratio_loss": 0.10214630514383316, "rewards/accuracies": 1.0, "rewards/chosen": -0.11742293834686279, "rewards/margins": 0.44116848707199097, "rewards/rejected": -0.5585914850234985, "sft_loss": 1.174229383468628, "step": 12550 }, { "epoch": 0.98, "grad_norm": 5.683286666870117, "learning_rate": 1.3643435113379067e-08, "logits/chosen": -1.3199999332427979, "logits/rejected": -0.7393957376480103, "logps/chosen": -0.8424164652824402, "logps/rejected": -11.82227611541748, "loss": 0.8517, "odds_ratio_loss": 0.09303996711969376, "rewards/accuracies": 1.0, "rewards/chosen": -0.08424165099859238, "rewards/margins": 1.0979859828948975, "rewards/rejected": -1.182227611541748, "sft_loss": 0.8424164652824402, "step": 12555 }, { "epoch": 0.98, "grad_norm": 5.736454010009766, "learning_rate": 1.3192642218121666e-08, "logits/chosen": -1.4242204427719116, "logits/rejected": -1.298656702041626, "logps/chosen": -0.6155864000320435, "logps/rejected": -9.867898941040039, "loss": 0.621, "odds_ratio_loss": 0.0539126992225647, "rewards/accuracies": 1.0, "rewards/chosen": -0.061558641493320465, "rewards/margins": 0.9252313375473022, "rewards/rejected": -0.986789882183075, "sft_loss": 0.6155864000320435, "step": 12560 }, { "epoch": 0.98, "grad_norm": 17.89364242553711, "learning_rate": 1.2749412449372111e-08, "logits/chosen": -1.3865082263946533, "logits/rejected": -1.0207258462905884, "logps/chosen": -0.852857768535614, "logps/rejected": -5.261617183685303, "loss": 0.9491, "odds_ratio_loss": 0.9626294374465942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08528578281402588, "rewards/margins": 0.4408760070800781, "rewards/rejected": -0.526161789894104, "sft_loss": 0.852857768535614, "step": 12565 }, { "epoch": 0.98, "grad_norm": 27.80198097229004, "learning_rate": 1.2313746479344358e-08, "logits/chosen": -1.2341769933700562, "logits/rejected": -1.2669892311096191, "logps/chosen": -1.1279561519622803, "logps/rejected": -8.634511947631836, "loss": 1.1319, "odds_ratio_loss": 0.03956901282072067, "rewards/accuracies": 1.0, "rewards/chosen": -0.11279561370611191, "rewards/margins": 0.7506555318832397, "rewards/rejected": -0.8634511828422546, "sft_loss": 1.1279561519622803, "step": 12570 }, { "epoch": 0.98, "grad_norm": 9.365230560302734, "learning_rate": 1.188564496878153e-08, "logits/chosen": -1.3226608037948608, "logits/rejected": -0.809215247631073, "logps/chosen": -1.0910683870315552, "logps/rejected": -3.9763412475585938, "loss": 1.1081, "odds_ratio_loss": 0.1704411506652832, "rewards/accuracies": 1.0, "rewards/chosen": -0.10910683870315552, "rewards/margins": 0.2885272800922394, "rewards/rejected": -0.3976341187953949, "sft_loss": 1.0910683870315552, "step": 12575 }, { "epoch": 0.98, "grad_norm": 6.049681186676025, "learning_rate": 1.1465108566953708e-08, "logits/chosen": -1.1451175212860107, "logits/rejected": -1.6655000448226929, "logps/chosen": -1.2132195234298706, "logps/rejected": -15.503191947937012, "loss": 1.2136, "odds_ratio_loss": 0.0037333047948777676, "rewards/accuracies": 1.0, "rewards/chosen": -0.12132195383310318, "rewards/margins": 1.428997278213501, "rewards/rejected": -1.5503193140029907, "sft_loss": 1.2132195234298706, "step": 12580 }, { "epoch": 0.98, "grad_norm": 7.754752159118652, "learning_rate": 1.1052137911657934e-08, "logits/chosen": -1.3875739574432373, "logits/rejected": -0.9447689056396484, "logps/chosen": -0.9363592267036438, "logps/rejected": -4.744017601013184, "loss": 0.998, "odds_ratio_loss": 0.61687171459198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0936359241604805, "rewards/margins": 0.3807658553123474, "rewards/rejected": -0.4744018018245697, "sft_loss": 0.9363592267036438, "step": 12585 }, { "epoch": 0.98, "grad_norm": 10.098403930664062, "learning_rate": 1.0646733629216533e-08, "logits/chosen": -1.3224453926086426, "logits/rejected": -1.1574100255966187, "logps/chosen": -1.0401006937026978, "logps/rejected": -7.8945112228393555, "loss": 1.0577, "odds_ratio_loss": 0.17600440979003906, "rewards/accuracies": 1.0, "rewards/chosen": -0.10401006788015366, "rewards/margins": 0.6854410171508789, "rewards/rejected": -0.7894511222839355, "sft_loss": 1.0401006937026978, "step": 12590 }, { "epoch": 0.98, "grad_norm": 5.171893119812012, "learning_rate": 1.0248896334476565e-08, "logits/chosen": -1.3351364135742188, "logits/rejected": -1.343653917312622, "logps/chosen": -0.8911256790161133, "logps/rejected": -6.915536403656006, "loss": 0.893, "odds_ratio_loss": 0.0184454545378685, "rewards/accuracies": 1.0, "rewards/chosen": -0.08911257237195969, "rewards/margins": 0.602441132068634, "rewards/rejected": -0.6915537118911743, "sft_loss": 0.8911256790161133, "step": 12595 }, { "epoch": 0.98, "grad_norm": 4.757512092590332, "learning_rate": 9.858626630808722e-09, "logits/chosen": -1.536574125289917, "logits/rejected": -1.043745756149292, "logps/chosen": -1.218515396118164, "logps/rejected": -4.443150520324707, "loss": 1.248, "odds_ratio_loss": 0.2948893904685974, "rewards/accuracies": 1.0, "rewards/chosen": -0.12185152620077133, "rewards/margins": 0.3224635720252991, "rewards/rejected": -0.444315105676651, "sft_loss": 1.218515396118164, "step": 12600 }, { "epoch": 0.98, "grad_norm": 6.375965595245361, "learning_rate": 9.475925110106753e-09, "logits/chosen": -1.3484649658203125, "logits/rejected": -0.6500853300094604, "logps/chosen": -1.0133591890335083, "logps/rejected": -5.055027961730957, "loss": 1.0315, "odds_ratio_loss": 0.181620255112648, "rewards/accuracies": 1.0, "rewards/chosen": -0.10133592039346695, "rewards/margins": 0.4041668474674225, "rewards/rejected": -0.5055028200149536, "sft_loss": 1.0133591890335083, "step": 12605 }, { "epoch": 0.98, "grad_norm": 14.441625595092773, "learning_rate": 9.100792352785826e-09, "logits/chosen": -1.3859080076217651, "logits/rejected": -1.3279926776885986, "logps/chosen": -1.1327625513076782, "logps/rejected": -9.13421630859375, "loss": 1.1406, "odds_ratio_loss": 0.07846628874540329, "rewards/accuracies": 1.0, "rewards/chosen": -0.11327625811100006, "rewards/margins": 0.8001454472541809, "rewards/rejected": -0.9134217500686646, "sft_loss": 1.1327625513076782, "step": 12610 }, { "epoch": 0.98, "grad_norm": 79.08858489990234, "learning_rate": 8.7332289277825e-09, "logits/chosen": -1.3899104595184326, "logits/rejected": -1.2215381860733032, "logps/chosen": -1.0673812627792358, "logps/rejected": -6.102982044219971, "loss": 1.0804, "odds_ratio_loss": 0.13017579913139343, "rewards/accuracies": 1.0, "rewards/chosen": -0.10673811286687851, "rewards/margins": 0.5035600662231445, "rewards/rejected": -0.610298216342926, "sft_loss": 1.0673812627792358, "step": 12615 }, { "epoch": 0.98, "grad_norm": 5.262150287628174, "learning_rate": 8.373235392553636e-09, "logits/chosen": -1.3551607131958008, "logits/rejected": -0.987968921661377, "logps/chosen": -1.1806728839874268, "logps/rejected": -8.183259963989258, "loss": 1.2099, "odds_ratio_loss": 0.292438268661499, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11806728690862656, "rewards/margins": 0.7002586722373962, "rewards/rejected": -0.8183259963989258, "sft_loss": 1.1806728839874268, "step": 12620 }, { "epoch": 0.98, "grad_norm": 9.846895217895508, "learning_rate": 8.02081229307472e-09, "logits/chosen": -1.3026286363601685, "logits/rejected": -0.7004513144493103, "logps/chosen": -0.8980560302734375, "logps/rejected": -7.850504398345947, "loss": 0.9003, "odds_ratio_loss": 0.022868018597364426, "rewards/accuracies": 1.0, "rewards/chosen": -0.08980560302734375, "rewards/margins": 0.6952449083328247, "rewards/rejected": -0.7850505113601685, "sft_loss": 0.8980560302734375, "step": 12625 }, { "epoch": 0.98, "grad_norm": 9.495035171508789, "learning_rate": 7.675960163840424e-09, "logits/chosen": -1.2690961360931396, "logits/rejected": -1.247907042503357, "logps/chosen": -1.029133915901184, "logps/rejected": -13.74725341796875, "loss": 1.0295, "odds_ratio_loss": 0.0034622892271727324, "rewards/accuracies": 1.0, "rewards/chosen": -0.10291339457035065, "rewards/margins": 1.271812081336975, "rewards/rejected": -1.3747254610061646, "sft_loss": 1.029133915901184, "step": 12630 }, { "epoch": 0.98, "grad_norm": 12.372653007507324, "learning_rate": 7.33867952786238e-09, "logits/chosen": -1.395240068435669, "logits/rejected": -0.935256838798523, "logps/chosen": -1.8347313404083252, "logps/rejected": -2.4637718200683594, "loss": 1.9064, "odds_ratio_loss": 0.7163336277008057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1834731101989746, "rewards/margins": 0.06290406733751297, "rewards/rejected": -0.24637719988822937, "sft_loss": 1.8347313404083252, "step": 12635 }, { "epoch": 0.98, "grad_norm": 5.233946323394775, "learning_rate": 7.008970896670298e-09, "logits/chosen": -1.3666355609893799, "logits/rejected": -0.8786395788192749, "logps/chosen": -1.2400833368301392, "logps/rejected": -15.411317825317383, "loss": 1.2401, "odds_ratio_loss": 0.0001732175296638161, "rewards/accuracies": 1.0, "rewards/chosen": -0.12400834262371063, "rewards/margins": 1.4171233177185059, "rewards/rejected": -1.5411317348480225, "sft_loss": 1.2400833368301392, "step": 12640 }, { "epoch": 0.98, "grad_norm": 18.897790908813477, "learning_rate": 6.686834770308626e-09, "logits/chosen": -1.421450138092041, "logits/rejected": -1.2205473184585571, "logps/chosen": -0.8802854418754578, "logps/rejected": -4.678915023803711, "loss": 0.9042, "odds_ratio_loss": 0.23880383372306824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08802854269742966, "rewards/margins": 0.3798629939556122, "rewards/rejected": -0.46789151430130005, "sft_loss": 0.8802854418754578, "step": 12645 }, { "epoch": 0.98, "grad_norm": 7.433767795562744, "learning_rate": 6.372271637337668e-09, "logits/chosen": -1.4291326999664307, "logits/rejected": -0.8617070317268372, "logps/chosen": -0.9746553301811218, "logps/rejected": -4.9322404861450195, "loss": 1.0212, "odds_ratio_loss": 0.464984655380249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09746553003787994, "rewards/margins": 0.39575856924057007, "rewards/rejected": -0.49322405457496643, "sft_loss": 0.9746553301811218, "step": 12650 }, { "epoch": 0.98, "grad_norm": 6.346502304077148, "learning_rate": 6.065281974832471e-09, "logits/chosen": -1.351488709449768, "logits/rejected": -1.3281567096710205, "logps/chosen": -0.6839212775230408, "logps/rejected": -6.7796502113342285, "loss": 0.685, "odds_ratio_loss": 0.010401845909655094, "rewards/accuracies": 1.0, "rewards/chosen": -0.06839212775230408, "rewards/margins": 0.6095728278160095, "rewards/rejected": -0.6779649257659912, "sft_loss": 0.6839212775230408, "step": 12655 }, { "epoch": 0.98, "grad_norm": 46.9267692565918, "learning_rate": 5.765866248381713e-09, "logits/chosen": -1.465703010559082, "logits/rejected": -1.3380768299102783, "logps/chosen": -1.0040488243103027, "logps/rejected": -4.003195762634277, "loss": 1.0089, "odds_ratio_loss": 0.048082135617733, "rewards/accuracies": 1.0, "rewards/chosen": -0.10040488094091415, "rewards/margins": 0.2999146580696106, "rewards/rejected": -0.40031957626342773, "sft_loss": 1.0040488243103027, "step": 12660 }, { "epoch": 0.99, "grad_norm": 16.431318283081055, "learning_rate": 5.474024912087151e-09, "logits/chosen": -1.4906480312347412, "logits/rejected": -0.8143842816352844, "logps/chosen": -0.7565047144889832, "logps/rejected": -2.544574022293091, "loss": 0.876, "odds_ratio_loss": 1.194966197013855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07565047591924667, "rewards/margins": 0.17880688607692719, "rewards/rejected": -0.25445738434791565, "sft_loss": 0.7565047144889832, "step": 12665 }, { "epoch": 0.99, "grad_norm": 10.983074188232422, "learning_rate": 5.189758408564172e-09, "logits/chosen": -1.4593555927276611, "logits/rejected": -0.6856383681297302, "logps/chosen": -0.977257251739502, "logps/rejected": -6.1070733070373535, "loss": 1.0098, "odds_ratio_loss": 0.32559916377067566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09772571921348572, "rewards/margins": 0.5129815936088562, "rewards/rejected": -0.6107073426246643, "sft_loss": 0.977257251739502, "step": 12670 }, { "epoch": 0.99, "grad_norm": 9.633749961853027, "learning_rate": 4.913067168937913e-09, "logits/chosen": -1.3621352910995483, "logits/rejected": -1.0428334474563599, "logps/chosen": -0.9668213725090027, "logps/rejected": -9.47398853302002, "loss": 0.9695, "odds_ratio_loss": 0.02649303898215294, "rewards/accuracies": 1.0, "rewards/chosen": -0.09668214619159698, "rewards/margins": 0.850716769695282, "rewards/rejected": -0.947398841381073, "sft_loss": 0.9668213725090027, "step": 12675 }, { "epoch": 0.99, "grad_norm": 6.018378257751465, "learning_rate": 4.643951612846587e-09, "logits/chosen": -1.4125245809555054, "logits/rejected": -0.9760688543319702, "logps/chosen": -1.1332448720932007, "logps/rejected": -8.689168930053711, "loss": 1.1377, "odds_ratio_loss": 0.04467242211103439, "rewards/accuracies": 1.0, "rewards/chosen": -0.11332448571920395, "rewards/margins": 0.7555924654006958, "rewards/rejected": -0.8689168691635132, "sft_loss": 1.1332448720932007, "step": 12680 }, { "epoch": 0.99, "grad_norm": 9.102438926696777, "learning_rate": 4.382412148437598e-09, "logits/chosen": -1.4320725202560425, "logits/rejected": -0.9469528198242188, "logps/chosen": -0.852150559425354, "logps/rejected": -6.599529266357422, "loss": 0.8543, "odds_ratio_loss": 0.021591413766145706, "rewards/accuracies": 1.0, "rewards/chosen": -0.08521505445241928, "rewards/margins": 0.5747378468513489, "rewards/rejected": -0.6599529385566711, "sft_loss": 0.852150559425354, "step": 12685 }, { "epoch": 0.99, "grad_norm": 102.65242767333984, "learning_rate": 4.1284491723686536e-09, "logits/chosen": -1.406354308128357, "logits/rejected": -0.9787223935127258, "logps/chosen": -1.1560405492782593, "logps/rejected": -7.624680519104004, "loss": 1.1582, "odds_ratio_loss": 0.021217485889792442, "rewards/accuracies": 1.0, "rewards/chosen": -0.11560405790805817, "rewards/margins": 0.6468639969825745, "rewards/rejected": -0.7624679803848267, "sft_loss": 1.1560405492782593, "step": 12690 }, { "epoch": 0.99, "grad_norm": 15.876679420471191, "learning_rate": 3.882063069807762e-09, "logits/chosen": -1.4680473804473877, "logits/rejected": -1.2769049406051636, "logps/chosen": -0.6791022419929504, "logps/rejected": -4.149896621704102, "loss": 0.6847, "odds_ratio_loss": 0.05620497465133667, "rewards/accuracies": 1.0, "rewards/chosen": -0.06791022419929504, "rewards/margins": 0.34707945585250854, "rewards/rejected": -0.4149896502494812, "sft_loss": 0.6791022419929504, "step": 12695 }, { "epoch": 0.99, "grad_norm": 9.385997772216797, "learning_rate": 3.643254214429348e-09, "logits/chosen": -1.3954238891601562, "logits/rejected": -1.044518232345581, "logps/chosen": -0.9343746900558472, "logps/rejected": -6.008805751800537, "loss": 0.9483, "odds_ratio_loss": 0.13876792788505554, "rewards/accuracies": 1.0, "rewards/chosen": -0.09343747794628143, "rewards/margins": 0.5074431300163269, "rewards/rejected": -0.6008806228637695, "sft_loss": 0.9343746900558472, "step": 12700 }, { "epoch": 0.99, "grad_norm": 13.203768730163574, "learning_rate": 3.4120229684181384e-09, "logits/chosen": -1.3236767053604126, "logits/rejected": -0.831290602684021, "logps/chosen": -0.8365150690078735, "logps/rejected": -3.617612838745117, "loss": 0.8571, "odds_ratio_loss": 0.20549385249614716, "rewards/accuracies": 1.0, "rewards/chosen": -0.08365149796009064, "rewards/margins": 0.2781098484992981, "rewards/rejected": -0.36176127195358276, "sft_loss": 0.8365150690078735, "step": 12705 }, { "epoch": 0.99, "grad_norm": 11.924532890319824, "learning_rate": 3.188369682466386e-09, "logits/chosen": -1.4278162717819214, "logits/rejected": -1.0526517629623413, "logps/chosen": -1.2153667211532593, "logps/rejected": -5.400894641876221, "loss": 1.2309, "odds_ratio_loss": 0.15487821400165558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12153668701648712, "rewards/margins": 0.41855278611183167, "rewards/rejected": -0.54008948802948, "sft_loss": 1.2153667211532593, "step": 12710 }, { "epoch": 0.99, "grad_norm": 8.210156440734863, "learning_rate": 2.9722946957710943e-09, "logits/chosen": -1.2760294675827026, "logits/rejected": -0.657126247882843, "logps/chosen": -1.1431710720062256, "logps/rejected": -6.945553779602051, "loss": 1.1633, "odds_ratio_loss": 0.20087885856628418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11431711912155151, "rewards/margins": 0.5802382230758667, "rewards/rejected": -0.6945552825927734, "sft_loss": 1.1431710720062256, "step": 12715 }, { "epoch": 0.99, "grad_norm": 8.233572006225586, "learning_rate": 2.763798336039014e-09, "logits/chosen": -1.2289212942123413, "logits/rejected": -1.1867921352386475, "logps/chosen": -0.7025913000106812, "logps/rejected": -5.2682671546936035, "loss": 0.711, "odds_ratio_loss": 0.08458174765110016, "rewards/accuracies": 1.0, "rewards/chosen": -0.07025913894176483, "rewards/margins": 0.45656758546829224, "rewards/rejected": -0.5268267393112183, "sft_loss": 0.7025913000106812, "step": 12720 }, { "epoch": 0.99, "grad_norm": 6.575437545776367, "learning_rate": 2.562880919479982e-09, "logits/chosen": -1.336478590965271, "logits/rejected": -1.242974042892456, "logps/chosen": -0.9741020202636719, "logps/rejected": -3.7598636150360107, "loss": 1.0127, "odds_ratio_loss": 0.3857436180114746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09741020947694778, "rewards/margins": 0.27857616543769836, "rewards/rejected": -0.37598639726638794, "sft_loss": 0.9741020202636719, "step": 12725 }, { "epoch": 0.99, "grad_norm": 7.393919467926025, "learning_rate": 2.36954275081136e-09, "logits/chosen": -1.261437177658081, "logits/rejected": -1.0915019512176514, "logps/chosen": -0.8143970370292664, "logps/rejected": -1.4711533784866333, "loss": 0.8482, "odds_ratio_loss": 0.3381730616092682, "rewards/accuracies": 1.0, "rewards/chosen": -0.08143971115350723, "rewards/margins": 0.06567564606666565, "rewards/rejected": -0.14711534976959229, "sft_loss": 0.8143970370292664, "step": 12730 }, { "epoch": 0.99, "grad_norm": 4.636721611022949, "learning_rate": 2.1837841232552616e-09, "logits/chosen": -1.4686650037765503, "logits/rejected": -1.1045680046081543, "logps/chosen": -1.0473394393920898, "logps/rejected": -7.3457441329956055, "loss": 1.0561, "odds_ratio_loss": 0.08712659776210785, "rewards/accuracies": 1.0, "rewards/chosen": -0.10473394393920898, "rewards/margins": 0.6298404932022095, "rewards/rejected": -0.7345744371414185, "sft_loss": 1.0473394393920898, "step": 12735 }, { "epoch": 0.99, "grad_norm": 16.038543701171875, "learning_rate": 2.0056053185379954e-09, "logits/chosen": -1.3290282487869263, "logits/rejected": -0.8544355630874634, "logps/chosen": -1.1234945058822632, "logps/rejected": -5.49116325378418, "loss": 1.1377, "odds_ratio_loss": 0.1421329826116562, "rewards/accuracies": 1.0, "rewards/chosen": -0.11234945058822632, "rewards/margins": 0.4367668628692627, "rewards/rejected": -0.549116313457489, "sft_loss": 1.1234945058822632, "step": 12740 }, { "epoch": 0.99, "grad_norm": 6.807204246520996, "learning_rate": 1.8350066068906213e-09, "logits/chosen": -1.2573984861373901, "logits/rejected": -0.9827863574028015, "logps/chosen": -0.9654420614242554, "logps/rejected": -21.103622436523438, "loss": 0.9654, "odds_ratio_loss": 6.937227590242401e-05, "rewards/accuracies": 1.0, "rewards/chosen": -0.09654419869184494, "rewards/margins": 2.0138182640075684, "rewards/rejected": -2.1103625297546387, "sft_loss": 0.9654420614242554, "step": 12745 }, { "epoch": 0.99, "grad_norm": 8.133843421936035, "learning_rate": 1.6719882470467297e-09, "logits/chosen": -1.4547202587127686, "logits/rejected": -1.3134218454360962, "logps/chosen": -1.2230987548828125, "logps/rejected": -8.175860404968262, "loss": 1.2578, "odds_ratio_loss": 0.34715619683265686, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12230987846851349, "rewards/margins": 0.695276141166687, "rewards/rejected": -0.8175861239433289, "sft_loss": 1.2230987548828125, "step": 12750 }, { "epoch": 0.99, "grad_norm": 22.136442184448242, "learning_rate": 1.5165504862457713e-09, "logits/chosen": -1.3403151035308838, "logits/rejected": -0.894049346446991, "logps/chosen": -1.0238540172576904, "logps/rejected": -2.9002327919006348, "loss": 1.0599, "odds_ratio_loss": 0.36095088720321655, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10238540172576904, "rewards/margins": 0.18763785064220428, "rewards/rejected": -0.2900232672691345, "sft_loss": 1.0238540172576904, "step": 12755 }, { "epoch": 0.99, "grad_norm": 9.113795280456543, "learning_rate": 1.3686935602280627e-09, "logits/chosen": -1.173903226852417, "logits/rejected": -1.0834901332855225, "logps/chosen": -1.094962239265442, "logps/rejected": -6.69094181060791, "loss": 1.0959, "odds_ratio_loss": 0.009845694527029991, "rewards/accuracies": 1.0, "rewards/chosen": -0.10949622094631195, "rewards/margins": 0.559597909450531, "rewards/rejected": -0.6690941452980042, "sft_loss": 1.094962239265442, "step": 12760 }, { "epoch": 0.99, "grad_norm": 9.03683853149414, "learning_rate": 1.2284176932375601e-09, "logits/chosen": -1.5034894943237305, "logits/rejected": -1.175966501235962, "logps/chosen": -0.694218635559082, "logps/rejected": -5.125706672668457, "loss": 0.7015, "odds_ratio_loss": 0.0728861540555954, "rewards/accuracies": 1.0, "rewards/chosen": -0.06942186504602432, "rewards/margins": 0.44314879179000854, "rewards/rejected": -0.5125706791877747, "sft_loss": 0.694218635559082, "step": 12765 }, { "epoch": 0.99, "grad_norm": 6.062671184539795, "learning_rate": 1.0957230980201961e-09, "logits/chosen": -1.3591234683990479, "logits/rejected": -0.9866539835929871, "logps/chosen": -0.8312653303146362, "logps/rejected": -9.86632251739502, "loss": 0.8442, "odds_ratio_loss": 0.129344180226326, "rewards/accuracies": 1.0, "rewards/chosen": -0.08312653750181198, "rewards/margins": 0.9035056829452515, "rewards/rejected": -0.986632227897644, "sft_loss": 0.8312653303146362, "step": 12770 }, { "epoch": 0.99, "grad_norm": 5.2164225578308105, "learning_rate": 9.706099758244325e-10, "logits/chosen": -1.3940870761871338, "logits/rejected": -0.8618147969245911, "logps/chosen": -1.2448606491088867, "logps/rejected": -6.871438503265381, "loss": 1.3133, "odds_ratio_loss": 0.6842848062515259, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12448606640100479, "rewards/margins": 0.5626577734947205, "rewards/rejected": -0.6871439218521118, "sft_loss": 1.2448606491088867, "step": 12775 }, { "epoch": 0.99, "grad_norm": 3.143425226211548, "learning_rate": 8.530785164001521e-10, "logits/chosen": -1.4610567092895508, "logits/rejected": -1.3560374975204468, "logps/chosen": -1.2909905910491943, "logps/rejected": -9.755168914794922, "loss": 1.3604, "odds_ratio_loss": 0.6936588883399963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1290990561246872, "rewards/margins": 0.8464177846908569, "rewards/rejected": -0.9755169153213501, "sft_loss": 1.2909905910491943, "step": 12780 }, { "epoch": 0.99, "grad_norm": 162.7854461669922, "learning_rate": 7.431288979986572e-10, "logits/chosen": -1.3986408710479736, "logits/rejected": -1.0258430242538452, "logps/chosen": -1.276390552520752, "logps/rejected": -8.889037132263184, "loss": 1.2909, "odds_ratio_loss": 0.1447000950574875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1276390552520752, "rewards/margins": 0.7612647414207458, "rewards/rejected": -0.888903796672821, "sft_loss": 1.276390552520752, "step": 12785 }, { "epoch": 0.99, "grad_norm": 5.574721336364746, "learning_rate": 6.407612873726709e-10, "logits/chosen": -1.325803518295288, "logits/rejected": -1.1525557041168213, "logps/chosen": -0.9420539140701294, "logps/rejected": -5.841198444366455, "loss": 0.9465, "odds_ratio_loss": 0.044614277780056, "rewards/accuracies": 1.0, "rewards/chosen": -0.09420539438724518, "rewards/margins": 0.4899144768714905, "rewards/rejected": -0.5841198563575745, "sft_loss": 0.9420539140701294, "step": 12790 }, { "epoch": 1.0, "grad_norm": 6.682177543640137, "learning_rate": 5.459758397757808e-10, "logits/chosen": -1.3960912227630615, "logits/rejected": -0.8625108599662781, "logps/chosen": -0.9628890752792358, "logps/rejected": -6.6562066078186035, "loss": 1.0017, "odds_ratio_loss": 0.3881911039352417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09628890454769135, "rewards/margins": 0.5693317651748657, "rewards/rejected": -0.6656206846237183, "sft_loss": 0.9628890752792358, "step": 12795 }, { "epoch": 1.0, "grad_norm": 4.425645351409912, "learning_rate": 4.5877269896132946e-10, "logits/chosen": -1.4052798748016357, "logits/rejected": -0.8202239274978638, "logps/chosen": -1.0506846904754639, "logps/rejected": -6.205660820007324, "loss": 1.0597, "odds_ratio_loss": 0.09048835933208466, "rewards/accuracies": 1.0, "rewards/chosen": -0.10506846755743027, "rewards/margins": 0.5154975652694702, "rewards/rejected": -0.6205660700798035, "sft_loss": 1.0506846904754639, "step": 12800 }, { "epoch": 1.0, "grad_norm": 6.5811614990234375, "learning_rate": 3.791519971851898e-10, "logits/chosen": -1.3399574756622314, "logits/rejected": -1.0786248445510864, "logps/chosen": -1.1442134380340576, "logps/rejected": -7.063590049743652, "loss": 1.1558, "odds_ratio_loss": 0.11629381030797958, "rewards/accuracies": 1.0, "rewards/chosen": -0.11442134529352188, "rewards/margins": 0.5919376611709595, "rewards/rejected": -0.7063590288162231, "sft_loss": 1.1442134380340576, "step": 12805 }, { "epoch": 1.0, "grad_norm": 4.901683807373047, "learning_rate": 3.071138552013242e-10, "logits/chosen": -1.3089653253555298, "logits/rejected": -0.7390233278274536, "logps/chosen": -0.903420090675354, "logps/rejected": -5.574804306030273, "loss": 0.9193, "odds_ratio_loss": 0.15836270153522491, "rewards/accuracies": 1.0, "rewards/chosen": -0.09034201502799988, "rewards/margins": 0.4671383798122406, "rewards/rejected": -0.5574804544448853, "sft_loss": 0.903420090675354, "step": 12810 }, { "epoch": 1.0, "grad_norm": 11.829771041870117, "learning_rate": 2.426583822651152e-10, "logits/chosen": -1.369470238685608, "logits/rejected": -1.4664558172225952, "logps/chosen": -0.9986074566841125, "logps/rejected": -8.506292343139648, "loss": 1.0125, "odds_ratio_loss": 0.13853073120117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.09986075758934021, "rewards/margins": 0.7507684826850891, "rewards/rejected": -0.8506291508674622, "sft_loss": 0.9986074566841125, "step": 12815 }, { "epoch": 1.0, "grad_norm": 10.773012161254883, "learning_rate": 1.8578567613114496e-10, "logits/chosen": -1.3900362253189087, "logits/rejected": -1.2793715000152588, "logps/chosen": -0.8403071165084839, "logps/rejected": -7.705872535705566, "loss": 0.8411, "odds_ratio_loss": 0.0077665760181844234, "rewards/accuracies": 1.0, "rewards/chosen": -0.08403071761131287, "rewards/margins": 0.6865564584732056, "rewards/rejected": -0.7705872058868408, "sft_loss": 0.8403071165084839, "step": 12820 }, { "epoch": 1.0, "grad_norm": 6.064254283905029, "learning_rate": 1.3649582305486076e-10, "logits/chosen": -1.2912412881851196, "logits/rejected": -0.9192991256713867, "logps/chosen": -1.165502905845642, "logps/rejected": -9.035804748535156, "loss": 1.1662, "odds_ratio_loss": 0.0065170153975486755, "rewards/accuracies": 1.0, "rewards/chosen": -0.11655030399560928, "rewards/margins": 0.7870301008224487, "rewards/rejected": -0.9035804867744446, "sft_loss": 1.165502905845642, "step": 12825 }, { "epoch": 1.0, "grad_norm": 14.42724609375, "learning_rate": 9.478889778979927e-11, "logits/chosen": -1.3533092737197876, "logits/rejected": -1.2197463512420654, "logps/chosen": -1.057451605796814, "logps/rejected": -6.306844234466553, "loss": 1.0679, "odds_ratio_loss": 0.10441489517688751, "rewards/accuracies": 1.0, "rewards/chosen": -0.10574515908956528, "rewards/margins": 0.5249393582344055, "rewards/rejected": -0.6306844353675842, "sft_loss": 1.057451605796814, "step": 12830 }, { "epoch": 1.0, "grad_norm": 16.650434494018555, "learning_rate": 6.066496358980712e-11, "logits/chosen": -1.327243447303772, "logits/rejected": -0.8279932141304016, "logps/chosen": -0.8257233500480652, "logps/rejected": -4.297021865844727, "loss": 0.9023, "odds_ratio_loss": 0.7656275033950806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0825723335146904, "rewards/margins": 0.34712985157966614, "rewards/rejected": -0.42970219254493713, "sft_loss": 0.8257233500480652, "step": 12835 }, { "epoch": 1.0, "grad_norm": 219.4456024169922, "learning_rate": 3.412407220904079e-11, "logits/chosen": -1.287316918373108, "logits/rejected": -1.2117503881454468, "logps/chosen": -1.8838069438934326, "logps/rejected": -7.961875915527344, "loss": 1.9317, "odds_ratio_loss": 0.47844308614730835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18838071823120117, "rewards/margins": 0.6078068614006042, "rewards/rejected": -0.7961876392364502, "sft_loss": 1.8838069438934326, "step": 12840 }, { "epoch": 1.0, "grad_norm": 6.5753984451293945, "learning_rate": 1.5166263899191182e-11, "logits/chosen": -1.3301522731781006, "logits/rejected": -1.3508708477020264, "logps/chosen": -0.8103002309799194, "logps/rejected": -11.35803508758545, "loss": 0.8209, "odds_ratio_loss": 0.10593117773532867, "rewards/accuracies": 1.0, "rewards/chosen": -0.08103003352880478, "rewards/margins": 1.0547735691070557, "rewards/rejected": -1.135803461074829, "sft_loss": 0.8103002309799194, "step": 12845 }, { "epoch": 1.0, "grad_norm": 5.922697067260742, "learning_rate": 3.7915674122590565e-12, "logits/chosen": -1.4197337627410889, "logits/rejected": -1.1119892597198486, "logps/chosen": -0.8883382678031921, "logps/rejected": -6.8627800941467285, "loss": 0.9022, "odds_ratio_loss": 0.1383415162563324, "rewards/accuracies": 1.0, "rewards/chosen": -0.08883383125066757, "rewards/margins": 0.5974441766738892, "rewards/rejected": -0.6862779855728149, "sft_loss": 0.8883382678031921, "step": 12850 }, { "epoch": 1.0, "grad_norm": 9.195427894592285, "learning_rate": 0.0, "logits/chosen": -1.2286322116851807, "logits/rejected": -1.5984668731689453, "logps/chosen": -1.4172906875610352, "logps/rejected": -7.929937839508057, "loss": 1.4268, "odds_ratio_loss": 0.09547214210033417, "rewards/accuracies": 1.0, "rewards/chosen": -0.14172907173633575, "rewards/margins": 0.6512646675109863, "rewards/rejected": -0.7929937243461609, "sft_loss": 1.4172906875610352, "step": 12855 }, { "epoch": 1.0, "step": 12855, "total_flos": 2.8081461696357335e+18, "train_loss": 1.0734075052920875, "train_runtime": 27866.917, "train_samples_per_second": 0.461, "train_steps_per_second": 0.461 } ], "logging_steps": 5, "max_steps": 12855, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "total_flos": 2.8081461696357335e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }