{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.8125, "learning_rate": 1.3054830287206268e-08, "logits/chosen": -2.377302885055542, "logits/rejected": -2.2193148136138916, "logps/chosen": -290.4185485839844, "logps/rejected": -374.6668701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 2.40625, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.2492425441741943, "logits/rejected": -2.0517687797546387, "logps/chosen": -279.6344909667969, "logps/rejected": -245.47564697265625, "loss": 0.6928, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": 0.0005959311965852976, "rewards/margins": 0.000615339376963675, "rewards/rejected": -1.9408274965826422e-05, "step": 10 }, { "epoch": 0.01, "grad_norm": 2.5, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.244947671890259, "logits/rejected": -1.943969964981079, "logps/chosen": -305.4734802246094, "logps/rejected": -237.70083618164062, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004091521259397268, "rewards/margins": 0.000647729029878974, "rewards/rejected": 0.0034437919966876507, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.3125, "learning_rate": 3.9164490861618804e-07, "logits/chosen": -2.205514907836914, "logits/rejected": -2.1370320320129395, "logps/chosen": -251.25662231445312, "logps/rejected": -251.41213989257812, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": 0.011662699282169342, "rewards/margins": 0.0018940108129754663, "rewards/rejected": 0.00976868998259306, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.9453125, "learning_rate": 5.221932114882506e-07, "logits/chosen": -2.0618391036987305, "logits/rejected": -2.0241973400115967, "logps/chosen": -216.21438598632812, "logps/rejected": -221.6951141357422, "loss": 0.6915, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.019298259168863297, "rewards/margins": 0.0034350629430264235, "rewards/rejected": 0.015863195061683655, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.078125, "learning_rate": 6.527415143603135e-07, "logits/chosen": -2.1124298572540283, "logits/rejected": -2.1008057594299316, "logps/chosen": -266.8966064453125, "logps/rejected": -234.32998657226562, "loss": 0.6906, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.029909158125519753, "rewards/margins": 0.005118774715811014, "rewards/rejected": 0.024790380150079727, "step": 50 }, { "epoch": 0.02, "grad_norm": 2.125, "learning_rate": 7.832898172323761e-07, "logits/chosen": -2.099602460861206, "logits/rejected": -1.9424635171890259, "logps/chosen": -252.27310180664062, "logps/rejected": -226.72030639648438, "loss": 0.6897, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.03226853534579277, "rewards/margins": 0.007134293206036091, "rewards/rejected": 0.025134241208434105, "step": 60 }, { "epoch": 0.02, "grad_norm": 2.03125, "learning_rate": 9.138381201044387e-07, "logits/chosen": -2.2440109252929688, "logits/rejected": -2.036339282989502, "logps/chosen": -272.09234619140625, "logps/rejected": -246.6947784423828, "loss": 0.6881, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.04063863307237625, "rewards/margins": 0.010386193171143532, "rewards/rejected": 0.03025243617594242, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.359375, "learning_rate": 1.0443864229765013e-06, "logits/chosen": -2.1541717052459717, "logits/rejected": -1.9777501821517944, "logps/chosen": -257.61871337890625, "logps/rejected": -246.86483764648438, "loss": 0.6874, "rewards/accuracies": 0.65625, "rewards/chosen": 0.038099195808172226, "rewards/margins": 0.011846454814076424, "rewards/rejected": 0.026252740994095802, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.1875, "learning_rate": 1.1749347258485642e-06, "logits/chosen": -2.1348958015441895, "logits/rejected": -1.998792290687561, "logps/chosen": -250.1610107421875, "logps/rejected": -234.56787109375, "loss": 0.6846, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.04175186529755592, "rewards/margins": 0.01758204773068428, "rewards/rejected": 0.024169817566871643, "step": 90 }, { "epoch": 0.03, "grad_norm": 2.125, "learning_rate": 1.305483028720627e-06, "logits/chosen": -2.1793951988220215, "logits/rejected": -2.0686168670654297, "logps/chosen": -247.0215301513672, "logps/rejected": -230.79537963867188, "loss": 0.6821, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04747994989156723, "rewards/margins": 0.0229250006377697, "rewards/rejected": 0.024554943665862083, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -2.0950841903686523, "eval_logits/rejected": -1.9557065963745117, "eval_logps/chosen": -259.6705627441406, "eval_logps/rejected": -241.93917846679688, "eval_loss": 0.6820979714393616, "eval_rewards/accuracies": 0.656499981880188, "eval_rewards/chosen": 0.04981444031000137, "eval_rewards/margins": 0.02312026545405388, "eval_rewards/rejected": 0.026694171130657196, "eval_runtime": 385.815, "eval_samples_per_second": 5.184, "eval_steps_per_second": 0.648, "step": 100 }, { "epoch": 0.03, "grad_norm": 2.3125, "learning_rate": 1.4360313315926894e-06, "logits/chosen": -2.146080493927002, "logits/rejected": -2.002453327178955, "logps/chosen": -284.4079895019531, "logps/rejected": -238.9375457763672, "loss": 0.6791, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04955831170082092, "rewards/margins": 0.029538575559854507, "rewards/rejected": 0.020019738003611565, "step": 110 }, { "epoch": 0.03, "grad_norm": 2.15625, "learning_rate": 1.5665796344647521e-06, "logits/chosen": -2.1928741931915283, "logits/rejected": -2.0533928871154785, "logps/chosen": -287.5110778808594, "logps/rejected": -271.9446716308594, "loss": 0.6728, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05533873289823532, "rewards/margins": 0.04249165579676628, "rewards/rejected": 0.01284707523882389, "step": 120 }, { "epoch": 0.03, "grad_norm": 2.671875, "learning_rate": 1.6971279373368146e-06, "logits/chosen": -2.2082314491271973, "logits/rejected": -2.118213653564453, "logps/chosen": -250.14013671875, "logps/rejected": -252.6034393310547, "loss": 0.6701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05011880397796631, "rewards/margins": 0.048879969865083694, "rewards/rejected": 0.001238831551745534, "step": 130 }, { "epoch": 0.04, "grad_norm": 2.5, "learning_rate": 1.8276762402088774e-06, "logits/chosen": -2.24537992477417, "logits/rejected": -1.9110206365585327, "logps/chosen": -270.5356750488281, "logps/rejected": -226.2827606201172, "loss": 0.6686, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04202268272638321, "rewards/margins": 0.052745603024959564, "rewards/rejected": -0.010722924955189228, "step": 140 }, { "epoch": 0.04, "grad_norm": 2.65625, "learning_rate": 1.9582245430809403e-06, "logits/chosen": -2.264875888824463, "logits/rejected": -2.0387892723083496, "logps/chosen": -280.36077880859375, "logps/rejected": -242.8515625, "loss": 0.6676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.036501698195934296, "rewards/margins": 0.05586882680654526, "rewards/rejected": -0.019367124885320663, "step": 150 }, { "epoch": 0.04, "grad_norm": 2.71875, "learning_rate": 2.0887728459530026e-06, "logits/chosen": -2.15449595451355, "logits/rejected": -2.0523486137390137, "logps/chosen": -256.1204833984375, "logps/rejected": -261.9712219238281, "loss": 0.6686, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.007034213747829199, "rewards/margins": 0.0563817024230957, "rewards/rejected": -0.04934748262166977, "step": 160 }, { "epoch": 0.04, "grad_norm": 2.890625, "learning_rate": 2.2193211488250653e-06, "logits/chosen": -2.1238508224487305, "logits/rejected": -1.9688222408294678, "logps/chosen": -220.9573211669922, "logps/rejected": -228.40869140625, "loss": 0.6703, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003666641190648079, "rewards/margins": 0.05213465169072151, "rewards/rejected": -0.05580129101872444, "step": 170 }, { "epoch": 0.05, "grad_norm": 3.28125, "learning_rate": 2.3498694516971284e-06, "logits/chosen": -2.1223385334014893, "logits/rejected": -1.9868714809417725, "logps/chosen": -258.9825134277344, "logps/rejected": -252.4698944091797, "loss": 0.6638, "rewards/accuracies": 0.625, "rewards/chosen": -0.03278004750609398, "rewards/margins": 0.06821247935295105, "rewards/rejected": -0.10099252313375473, "step": 180 }, { "epoch": 0.05, "grad_norm": 3.890625, "learning_rate": 2.4804177545691907e-06, "logits/chosen": -2.2460696697235107, "logits/rejected": -2.0304675102233887, "logps/chosen": -274.5130920410156, "logps/rejected": -256.2106628417969, "loss": 0.65, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04428885504603386, "rewards/margins": 0.10040076822042465, "rewards/rejected": -0.1446896344423294, "step": 190 }, { "epoch": 0.05, "grad_norm": 3.375, "learning_rate": 2.610966057441254e-06, "logits/chosen": -2.1960341930389404, "logits/rejected": -1.95565927028656, "logps/chosen": -259.01934814453125, "logps/rejected": -231.2660369873047, "loss": 0.6496, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.09310005605220795, "rewards/margins": 0.10362167656421661, "rewards/rejected": -0.19672173261642456, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -2.068035125732422, "eval_logits/rejected": -1.9312690496444702, "eval_logps/chosen": -270.0797119140625, "eval_logps/rejected": -260.6905517578125, "eval_loss": 0.6486819982528687, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": -0.05427735298871994, "eval_rewards/margins": 0.10654205083847046, "eval_rewards/rejected": -0.160819411277771, "eval_runtime": 385.2774, "eval_samples_per_second": 5.191, "eval_steps_per_second": 0.649, "step": 200 }, { "epoch": 0.05, "grad_norm": 3.484375, "learning_rate": 2.741514360313316e-06, "logits/chosen": -2.197986602783203, "logits/rejected": -1.9808934926986694, "logps/chosen": -267.27685546875, "logps/rejected": -249.9297637939453, "loss": 0.6319, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.05056775361299515, "rewards/margins": 0.14322780072689056, "rewards/rejected": -0.1937955617904663, "step": 210 }, { "epoch": 0.06, "grad_norm": 5.09375, "learning_rate": 2.872062663185379e-06, "logits/chosen": -2.0990307331085205, "logits/rejected": -1.983565330505371, "logps/chosen": -270.3437194824219, "logps/rejected": -256.6988525390625, "loss": 0.6401, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16327962279319763, "rewards/margins": 0.12751872837543488, "rewards/rejected": -0.2907983660697937, "step": 220 }, { "epoch": 0.06, "grad_norm": 5.0625, "learning_rate": 3.0026109660574416e-06, "logits/chosen": -2.2433676719665527, "logits/rejected": -2.056224822998047, "logps/chosen": -314.1068420410156, "logps/rejected": -288.00250244140625, "loss": 0.6589, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07937607169151306, "rewards/margins": 0.09387041628360748, "rewards/rejected": -0.17324648797512054, "step": 230 }, { "epoch": 0.06, "grad_norm": 5.6875, "learning_rate": 3.1331592689295043e-06, "logits/chosen": -2.1602792739868164, "logits/rejected": -1.9714686870574951, "logps/chosen": -310.117919921875, "logps/rejected": -308.3526916503906, "loss": 0.6431, "rewards/accuracies": 0.625, "rewards/chosen": -0.13817985355854034, "rewards/margins": 0.13379593193531036, "rewards/rejected": -0.2719758152961731, "step": 240 }, { "epoch": 0.07, "grad_norm": 5.4375, "learning_rate": 3.263707571801567e-06, "logits/chosen": -2.129748821258545, "logits/rejected": -2.028604030609131, "logps/chosen": -282.7078552246094, "logps/rejected": -272.08837890625, "loss": 0.6361, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23127944767475128, "rewards/margins": 0.14839713275432587, "rewards/rejected": -0.37967658042907715, "step": 250 }, { "epoch": 0.07, "grad_norm": 3.65625, "learning_rate": 3.3942558746736293e-06, "logits/chosen": -2.183048725128174, "logits/rejected": -1.9789161682128906, "logps/chosen": -281.8155212402344, "logps/rejected": -272.23956298828125, "loss": 0.6437, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09410645067691803, "rewards/margins": 0.12439638376235962, "rewards/rejected": -0.21850283443927765, "step": 260 }, { "epoch": 0.07, "grad_norm": 3.921875, "learning_rate": 3.524804177545692e-06, "logits/chosen": -2.083225965499878, "logits/rejected": -1.9568647146224976, "logps/chosen": -275.0286560058594, "logps/rejected": -263.38140869140625, "loss": 0.6139, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.14354154467582703, "rewards/margins": 0.19759733974933624, "rewards/rejected": -0.3411388695240021, "step": 270 }, { "epoch": 0.07, "grad_norm": 4.8125, "learning_rate": 3.6553524804177547e-06, "logits/chosen": -2.1453604698181152, "logits/rejected": -1.9743705987930298, "logps/chosen": -287.78057861328125, "logps/rejected": -284.1526794433594, "loss": 0.6277, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28831422328948975, "rewards/margins": 0.17838594317436218, "rewards/rejected": -0.4667002260684967, "step": 280 }, { "epoch": 0.08, "grad_norm": 4.15625, "learning_rate": 3.7859007832898174e-06, "logits/chosen": -2.068016529083252, "logits/rejected": -1.9705654382705688, "logps/chosen": -315.2586364746094, "logps/rejected": -313.2366027832031, "loss": 0.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2962488532066345, "rewards/margins": 0.2308805286884308, "rewards/rejected": -0.5271294116973877, "step": 290 }, { "epoch": 0.08, "grad_norm": 5.8125, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -2.1018004417419434, "logits/rejected": -1.8998439311981201, "logps/chosen": -275.9500732421875, "logps/rejected": -287.0372009277344, "loss": 0.6042, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.31821924448013306, "rewards/margins": 0.24809296429157257, "rewards/rejected": -0.5663121938705444, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": -2.0229153633117676, "eval_logits/rejected": -1.889541745185852, "eval_logps/chosen": -295.1513671875, "eval_logps/rejected": -296.011474609375, "eval_loss": 0.6216087937355042, "eval_rewards/accuracies": 0.6729999780654907, "eval_rewards/chosen": -0.30499377846717834, "eval_rewards/margins": 0.2090347856283188, "eval_rewards/rejected": -0.5140285491943359, "eval_runtime": 385.3276, "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 300 }, { "epoch": 0.08, "grad_norm": 6.78125, "learning_rate": 4.046997389033943e-06, "logits/chosen": -2.246411085128784, "logits/rejected": -2.0464656352996826, "logps/chosen": -320.37054443359375, "logps/rejected": -296.6560363769531, "loss": 0.5823, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.31464242935180664, "rewards/margins": 0.29925835132598877, "rewards/rejected": -0.6139007806777954, "step": 310 }, { "epoch": 0.08, "grad_norm": 4.90625, "learning_rate": 4.177545691906005e-06, "logits/chosen": -2.1202454566955566, "logits/rejected": -1.933571457862854, "logps/chosen": -300.3293151855469, "logps/rejected": -303.07177734375, "loss": 0.6333, "rewards/accuracies": 0.6875, "rewards/chosen": -0.42065340280532837, "rewards/margins": 0.19771243631839752, "rewards/rejected": -0.6183657646179199, "step": 320 }, { "epoch": 0.09, "grad_norm": 5.65625, "learning_rate": 4.308093994778068e-06, "logits/chosen": -2.0555598735809326, "logits/rejected": -1.9103734493255615, "logps/chosen": -286.83306884765625, "logps/rejected": -285.3974609375, "loss": 0.6152, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2723875939846039, "rewards/margins": 0.22743086516857147, "rewards/rejected": -0.49981847405433655, "step": 330 }, { "epoch": 0.09, "grad_norm": 3.6875, "learning_rate": 4.4386422976501306e-06, "logits/chosen": -2.1098897457122803, "logits/rejected": -1.996603012084961, "logps/chosen": -339.12225341796875, "logps/rejected": -342.5606994628906, "loss": 0.611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.49436426162719727, "rewards/margins": 0.24838733673095703, "rewards/rejected": -0.7427516579627991, "step": 340 }, { "epoch": 0.09, "grad_norm": 4.71875, "learning_rate": 4.569190600522193e-06, "logits/chosen": -2.0351061820983887, "logits/rejected": -1.8878052234649658, "logps/chosen": -342.15667724609375, "logps/rejected": -348.20281982421875, "loss": 0.6434, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5289834141731262, "rewards/margins": 0.1753660887479782, "rewards/rejected": -0.7043493986129761, "step": 350 }, { "epoch": 0.09, "grad_norm": 3.84375, "learning_rate": 4.699738903394257e-06, "logits/chosen": -2.014333963394165, "logits/rejected": -1.9689722061157227, "logps/chosen": -274.50213623046875, "logps/rejected": -278.16351318359375, "loss": 0.6081, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.33518165349960327, "rewards/margins": 0.24500660598278046, "rewards/rejected": -0.5801882743835449, "step": 360 }, { "epoch": 0.1, "grad_norm": 5.09375, "learning_rate": 4.8302872062663196e-06, "logits/chosen": -2.1054439544677734, "logits/rejected": -1.9295707941055298, "logps/chosen": -315.6613464355469, "logps/rejected": -300.81231689453125, "loss": 0.5976, "rewards/accuracies": 0.71875, "rewards/chosen": -0.38765162229537964, "rewards/margins": 0.2898003160953522, "rewards/rejected": -0.6774519681930542, "step": 370 }, { "epoch": 0.1, "grad_norm": 6.375, "learning_rate": 4.9608355091383814e-06, "logits/chosen": -2.0662331581115723, "logits/rejected": -1.8568542003631592, "logps/chosen": -335.3840637207031, "logps/rejected": -334.6043395996094, "loss": 0.5885, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.373274028301239, "rewards/margins": 0.3260083794593811, "rewards/rejected": -0.6992824077606201, "step": 380 }, { "epoch": 0.1, "grad_norm": 5.53125, "learning_rate": 4.9999488562447675e-06, "logits/chosen": -2.0750114917755127, "logits/rejected": -1.9580342769622803, "logps/chosen": -320.772705078125, "logps/rejected": -333.44476318359375, "loss": 0.5855, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39726486802101135, "rewards/margins": 0.3268759846687317, "rewards/rejected": -0.7241408228874207, "step": 390 }, { "epoch": 0.1, "grad_norm": 5.3125, "learning_rate": 4.999698361256577e-06, "logits/chosen": -2.0969738960266113, "logits/rejected": -1.8604263067245483, "logps/chosen": -311.9226989746094, "logps/rejected": -294.60662841796875, "loss": 0.6218, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4644620418548584, "rewards/margins": 0.23335090279579163, "rewards/rejected": -0.6978129148483276, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -1.9431427717208862, "eval_logits/rejected": -1.8155378103256226, "eval_logps/chosen": -326.5406799316406, "eval_logps/rejected": -340.4455261230469, "eval_loss": 0.5939911007881165, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": -0.6188870072364807, "eval_rewards/margins": 0.3394821286201477, "eval_rewards/rejected": -0.9583691358566284, "eval_runtime": 385.2303, "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 400 }, { "epoch": 0.11, "grad_norm": 5.59375, "learning_rate": 4.999239142174581e-06, "logits/chosen": -1.9562289714813232, "logits/rejected": -1.8964239358901978, "logps/chosen": -315.13616943359375, "logps/rejected": -334.50677490234375, "loss": 0.6431, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7281379699707031, "rewards/margins": 0.2114681750535965, "rewards/rejected": -0.9396060705184937, "step": 410 }, { "epoch": 0.11, "grad_norm": 6.28125, "learning_rate": 4.99857123734344e-06, "logits/chosen": -1.9491183757781982, "logits/rejected": -1.8290717601776123, "logps/chosen": -280.4700622558594, "logps/rejected": -309.1809997558594, "loss": 0.5735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5167919397354126, "rewards/margins": 0.3777889609336853, "rewards/rejected": -0.8945809602737427, "step": 420 }, { "epoch": 0.11, "grad_norm": 9.5, "learning_rate": 4.997694702533016e-06, "logits/chosen": -1.9259364604949951, "logits/rejected": -1.8644450902938843, "logps/chosen": -345.35797119140625, "logps/rejected": -365.54449462890625, "loss": 0.5722, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6780990362167358, "rewards/margins": 0.4380587637424469, "rewards/rejected": -1.1161577701568604, "step": 430 }, { "epoch": 0.12, "grad_norm": 7.59375, "learning_rate": 4.996609610933713e-06, "logits/chosen": -2.0121302604675293, "logits/rejected": -1.9294341802597046, "logps/chosen": -349.0380554199219, "logps/rejected": -362.43768310546875, "loss": 0.5912, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7688915729522705, "rewards/margins": 0.4175523817539215, "rewards/rejected": -1.1864439249038696, "step": 440 }, { "epoch": 0.12, "grad_norm": 5.46875, "learning_rate": 4.995316053150366e-06, "logits/chosen": -1.889850378036499, "logits/rejected": -1.7697973251342773, "logps/chosen": -332.23077392578125, "logps/rejected": -353.26593017578125, "loss": 0.5642, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.643971860408783, "rewards/margins": 0.4406636357307434, "rewards/rejected": -1.0846354961395264, "step": 450 }, { "epoch": 0.12, "grad_norm": 9.875, "learning_rate": 4.9938141371946815e-06, "logits/chosen": -1.8695566654205322, "logits/rejected": -1.7812881469726562, "logps/chosen": -366.63818359375, "logps/rejected": -409.49755859375, "loss": 0.5388, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0153841972351074, "rewards/margins": 0.5738715529441833, "rewards/rejected": -1.589255690574646, "step": 460 }, { "epoch": 0.12, "grad_norm": 7.46875, "learning_rate": 4.992103988476206e-06, "logits/chosen": -1.8687667846679688, "logits/rejected": -1.7270047664642334, "logps/chosen": -376.8227844238281, "logps/rejected": -413.8404846191406, "loss": 0.5719, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3300559520721436, "rewards/margins": 0.49565353989601135, "rewards/rejected": -1.825709581375122, "step": 470 }, { "epoch": 0.13, "grad_norm": 6.0625, "learning_rate": 4.990185749791866e-06, "logits/chosen": -1.8790470361709595, "logits/rejected": -1.7465674877166748, "logps/chosen": -361.17974853515625, "logps/rejected": -419.521484375, "loss": 0.5472, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0430080890655518, "rewards/margins": 0.5655397176742554, "rewards/rejected": -1.6085479259490967, "step": 480 }, { "epoch": 0.13, "grad_norm": 6.53125, "learning_rate": 4.9880595813140395e-06, "logits/chosen": -1.923179268836975, "logits/rejected": -1.7839629650115967, "logps/chosen": -394.8546142578125, "logps/rejected": -421.29730224609375, "loss": 0.5317, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0996313095092773, "rewards/margins": 0.5679025053977966, "rewards/rejected": -1.6675338745117188, "step": 490 }, { "epoch": 0.13, "grad_norm": 6.25, "learning_rate": 4.985725660577184e-06, "logits/chosen": -1.887112021446228, "logits/rejected": -1.7504981756210327, "logps/chosen": -411.74951171875, "logps/rejected": -424.2745666503906, "loss": 0.5674, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3798956871032715, "rewards/margins": 0.5375889539718628, "rewards/rejected": -1.9174845218658447, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": -1.7892649173736572, "eval_logits/rejected": -1.6636674404144287, "eval_logps/chosen": -421.9456787109375, "eval_logps/rejected": -449.8769836425781, "eval_loss": 0.5779695510864258, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": -1.572936773300171, "eval_rewards/margins": 0.4797472655773163, "eval_rewards/rejected": -2.0526838302612305, "eval_runtime": 385.3091, "eval_samples_per_second": 5.191, "eval_steps_per_second": 0.649, "step": 500 }, { "epoch": 0.13, "grad_norm": 5.40625, "learning_rate": 4.983184182463009e-06, "logits/chosen": -1.853735327720642, "logits/rejected": -1.7524950504302979, "logps/chosen": -404.90545654296875, "logps/rejected": -425.74676513671875, "loss": 0.5607, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.266443133354187, "rewards/margins": 0.5788331031799316, "rewards/rejected": -1.8452762365341187, "step": 510 }, { "epoch": 0.14, "grad_norm": 7.3125, "learning_rate": 4.980435359184203e-06, "logits/chosen": -1.9005975723266602, "logits/rejected": -1.8376613855361938, "logps/chosen": -341.048828125, "logps/rejected": -359.40496826171875, "loss": 0.6122, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6798708438873291, "rewards/margins": 0.3456707298755646, "rewards/rejected": -1.0255415439605713, "step": 520 }, { "epoch": 0.14, "grad_norm": 10.875, "learning_rate": 4.9774794202667236e-06, "logits/chosen": -1.8874883651733398, "logits/rejected": -1.8308721780776978, "logps/chosen": -315.84173583984375, "logps/rejected": -365.2502746582031, "loss": 0.5734, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5474014282226562, "rewards/margins": 0.40957459807395935, "rewards/rejected": -0.9569761157035828, "step": 530 }, { "epoch": 0.14, "grad_norm": 8.5, "learning_rate": 4.974316612530615e-06, "logits/chosen": -1.8144280910491943, "logits/rejected": -1.657810926437378, "logps/chosen": -369.9844665527344, "logps/rejected": -390.8047180175781, "loss": 0.5011, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9150163531303406, "rewards/margins": 0.676045298576355, "rewards/rejected": -1.5910617113113403, "step": 540 }, { "epoch": 0.14, "grad_norm": 12.625, "learning_rate": 4.970947200069416e-06, "logits/chosen": -1.7606821060180664, "logits/rejected": -1.7015259265899658, "logps/chosen": -427.96990966796875, "logps/rejected": -451.92205810546875, "loss": 0.6311, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.494568943977356, "rewards/margins": 0.4373590350151062, "rewards/rejected": -1.931928038597107, "step": 550 }, { "epoch": 0.15, "grad_norm": 7.78125, "learning_rate": 4.967371464228096e-06, "logits/chosen": -1.9176502227783203, "logits/rejected": -1.832397699356079, "logps/chosen": -372.6578369140625, "logps/rejected": -429.7704162597656, "loss": 0.5482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1515864133834839, "rewards/margins": 0.567107081413269, "rewards/rejected": -1.718693494796753, "step": 560 }, { "epoch": 0.15, "grad_norm": 5.53125, "learning_rate": 4.963589703579569e-06, "logits/chosen": -1.9988332986831665, "logits/rejected": -1.8672618865966797, "logps/chosen": -407.62664794921875, "logps/rejected": -419.98291015625, "loss": 0.5754, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.075867772102356, "rewards/margins": 0.47032594680786133, "rewards/rejected": -1.5461935997009277, "step": 570 }, { "epoch": 0.15, "grad_norm": 8.5, "learning_rate": 4.9596022338997615e-06, "logits/chosen": -1.9790706634521484, "logits/rejected": -1.7595863342285156, "logps/chosen": -397.14752197265625, "logps/rejected": -413.5733337402344, "loss": 0.5495, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9936078190803528, "rewards/margins": 0.5750035047531128, "rewards/rejected": -1.5686112642288208, "step": 580 }, { "epoch": 0.15, "grad_norm": 7.9375, "learning_rate": 4.955409388141243e-06, "logits/chosen": -1.8258365392684937, "logits/rejected": -1.7129390239715576, "logps/chosen": -363.6575622558594, "logps/rejected": -387.19378662109375, "loss": 0.6003, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0635744333267212, "rewards/margins": 0.4720209240913391, "rewards/rejected": -1.5355952978134155, "step": 590 }, { "epoch": 0.16, "grad_norm": 4.84375, "learning_rate": 4.951011516405429e-06, "logits/chosen": -1.8798444271087646, "logits/rejected": -1.8100011348724365, "logps/chosen": -338.61151123046875, "logps/rejected": -374.54974365234375, "loss": 0.5632, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8785476684570312, "rewards/margins": 0.5087668895721436, "rewards/rejected": -1.3873146772384644, "step": 600 }, { "epoch": 0.16, "eval_logits/chosen": -1.778578281402588, "eval_logits/rejected": -1.6489102840423584, "eval_logps/chosen": -342.7493896484375, "eval_logps/rejected": -372.69134521484375, "eval_loss": 0.5649436712265015, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": -0.7809735536575317, "eval_rewards/margins": 0.49985405802726746, "eval_rewards/rejected": -1.2808276414871216, "eval_runtime": 385.3125, "eval_samples_per_second": 5.191, "eval_steps_per_second": 0.649, "step": 600 }, { "epoch": 0.16, "grad_norm": 6.71875, "learning_rate": 4.946408985913344e-06, "logits/chosen": -1.8086153268814087, "logits/rejected": -1.7312501668930054, "logps/chosen": -321.55279541015625, "logps/rejected": -367.79229736328125, "loss": 0.5218, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7568905353546143, "rewards/margins": 0.6399748921394348, "rewards/rejected": -1.3968654870986938, "step": 610 }, { "epoch": 0.16, "grad_norm": 16.25, "learning_rate": 4.941602180974958e-06, "logits/chosen": -1.833062767982483, "logits/rejected": -1.5977442264556885, "logps/chosen": -380.17169189453125, "logps/rejected": -390.75848388671875, "loss": 0.524, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9234441518783569, "rewards/margins": 0.6925610303878784, "rewards/rejected": -1.616005301475525, "step": 620 }, { "epoch": 0.16, "grad_norm": 6.71875, "learning_rate": 4.936591502957101e-06, "logits/chosen": -1.813197374343872, "logits/rejected": -1.6430933475494385, "logps/chosen": -355.9547424316406, "logps/rejected": -418.7765197753906, "loss": 0.5344, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.078249216079712, "rewards/margins": 0.7311606407165527, "rewards/rejected": -1.809409737586975, "step": 630 }, { "epoch": 0.17, "grad_norm": 7.0, "learning_rate": 4.931377370249946e-06, "logits/chosen": -1.8197021484375, "logits/rejected": -1.5834531784057617, "logps/chosen": -435.12738037109375, "logps/rejected": -468.70501708984375, "loss": 0.5641, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6980397701263428, "rewards/margins": 0.593255341053009, "rewards/rejected": -2.291295289993286, "step": 640 }, { "epoch": 0.17, "grad_norm": 10.0, "learning_rate": 4.925960218232073e-06, "logits/chosen": -1.7958835363388062, "logits/rejected": -1.6748111248016357, "logps/chosen": -392.5576171875, "logps/rejected": -455.75811767578125, "loss": 0.5384, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.372554898262024, "rewards/margins": 0.7270306348800659, "rewards/rejected": -2.09958553314209, "step": 650 }, { "epoch": 0.17, "grad_norm": 8.1875, "learning_rate": 4.920340499234116e-06, "logits/chosen": -1.7571017742156982, "logits/rejected": -1.5184545516967773, "logps/chosen": -403.0295715332031, "logps/rejected": -419.2205505371094, "loss": 0.5787, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3702582120895386, "rewards/margins": 0.5140202641487122, "rewards/rejected": -1.884278655052185, "step": 660 }, { "epoch": 0.18, "grad_norm": 6.53125, "learning_rate": 4.914518682500995e-06, "logits/chosen": -1.9124584197998047, "logits/rejected": -1.694361925125122, "logps/chosen": -436.59747314453125, "logps/rejected": -460.3738708496094, "loss": 0.5391, "rewards/accuracies": 0.6875, "rewards/chosen": -1.530548334121704, "rewards/margins": 0.6449794769287109, "rewards/rejected": -2.175528049468994, "step": 670 }, { "epoch": 0.18, "grad_norm": 9.5625, "learning_rate": 4.9084952541527315e-06, "logits/chosen": -1.7815015316009521, "logits/rejected": -1.5756428241729736, "logps/chosen": -448.1412658691406, "logps/rejected": -469.43603515625, "loss": 0.5139, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7480707168579102, "rewards/margins": 0.6839998364448547, "rewards/rejected": -2.432070255279541, "step": 680 }, { "epoch": 0.18, "grad_norm": 9.3125, "learning_rate": 4.902270717143858e-06, "logits/chosen": -1.7120873928070068, "logits/rejected": -1.6082136631011963, "logps/chosen": -419.0126037597656, "logps/rejected": -534.6773681640625, "loss": 0.4522, "rewards/accuracies": 0.78125, "rewards/chosen": -1.827803611755371, "rewards/margins": 1.0398612022399902, "rewards/rejected": -2.8676648139953613, "step": 690 }, { "epoch": 0.18, "grad_norm": 6.8125, "learning_rate": 4.895845591221427e-06, "logits/chosen": -1.676849365234375, "logits/rejected": -1.601438283920288, "logps/chosen": -455.9642639160156, "logps/rejected": -528.1475219726562, "loss": 0.5331, "rewards/accuracies": 0.75, "rewards/chosen": -2.0265378952026367, "rewards/margins": 0.7665891647338867, "rewards/rejected": -2.7931270599365234, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": -1.5919249057769775, "eval_logits/rejected": -1.469058632850647, "eval_logps/chosen": -455.5274963378906, "eval_logps/rejected": -512.6751098632812, "eval_loss": 0.560720682144165, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -1.9087554216384888, "eval_rewards/margins": 0.7719098925590515, "eval_rewards/rejected": -2.6806650161743164, "eval_runtime": 385.1228, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 700 }, { "epoch": 0.19, "grad_norm": 9.5, "learning_rate": 4.8892204128816e-06, "logits/chosen": -1.7319362163543701, "logits/rejected": -1.619175672531128, "logps/chosen": -431.63232421875, "logps/rejected": -489.86297607421875, "loss": 0.5277, "rewards/accuracies": 0.75, "rewards/chosen": -1.6406848430633545, "rewards/margins": 0.7181671857833862, "rewards/rejected": -2.358851909637451, "step": 710 }, { "epoch": 0.19, "grad_norm": 7.28125, "learning_rate": 4.882395735324864e-06, "logits/chosen": -1.6986335515975952, "logits/rejected": -1.5594747066497803, "logps/chosen": -427.96978759765625, "logps/rejected": -497.16766357421875, "loss": 0.4996, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6188774108886719, "rewards/margins": 0.8638092875480652, "rewards/rejected": -2.4826865196228027, "step": 720 }, { "epoch": 0.19, "grad_norm": 7.25, "learning_rate": 4.87537212840983e-06, "logits/chosen": -1.6116526126861572, "logits/rejected": -1.474578619003296, "logps/chosen": -464.416259765625, "logps/rejected": -503.581787109375, "loss": 0.576, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.981610894203186, "rewards/margins": 0.6970613598823547, "rewards/rejected": -2.6786723136901855, "step": 730 }, { "epoch": 0.19, "grad_norm": 9.1875, "learning_rate": 4.8681501786056545e-06, "logits/chosen": -1.5888502597808838, "logits/rejected": -1.4401233196258545, "logps/chosen": -373.1294860839844, "logps/rejected": -415.46240234375, "loss": 0.5066, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.449134111404419, "rewards/margins": 0.7457250356674194, "rewards/rejected": -2.194859027862549, "step": 740 }, { "epoch": 0.2, "grad_norm": 14.75, "learning_rate": 4.860730488943068e-06, "logits/chosen": -1.6056511402130127, "logits/rejected": -1.5784225463867188, "logps/chosen": -356.6183166503906, "logps/rejected": -429.750732421875, "loss": 0.5024, "rewards/accuracies": 0.75, "rewards/chosen": -1.2209298610687256, "rewards/margins": 0.7504220008850098, "rewards/rejected": -1.971351981163025, "step": 750 }, { "epoch": 0.2, "grad_norm": 6.96875, "learning_rate": 4.853113678964022e-06, "logits/chosen": -1.6386051177978516, "logits/rejected": -1.5690464973449707, "logps/chosen": -394.1507568359375, "logps/rejected": -469.383056640625, "loss": 0.4908, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2175710201263428, "rewards/margins": 0.8312736749649048, "rewards/rejected": -2.048844575881958, "step": 760 }, { "epoch": 0.2, "grad_norm": 15.75, "learning_rate": 4.845300384669958e-06, "logits/chosen": -1.6991758346557617, "logits/rejected": -1.563987374305725, "logps/chosen": -405.8094482421875, "logps/rejected": -445.58209228515625, "loss": 0.5794, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5260562896728516, "rewards/margins": 0.6259430050849915, "rewards/rejected": -2.1519992351531982, "step": 770 }, { "epoch": 0.2, "grad_norm": 8.9375, "learning_rate": 4.837291258468701e-06, "logits/chosen": -1.7494251728057861, "logits/rejected": -1.6077022552490234, "logps/chosen": -431.01519775390625, "logps/rejected": -486.4640197753906, "loss": 0.5468, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4566891193389893, "rewards/margins": 0.8033839464187622, "rewards/rejected": -2.260073184967041, "step": 780 }, { "epoch": 0.21, "grad_norm": 6.78125, "learning_rate": 4.829086969119984e-06, "logits/chosen": -1.613250732421875, "logits/rejected": -1.5955699682235718, "logps/chosen": -397.90008544921875, "logps/rejected": -463.9117126464844, "loss": 0.6001, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4717532396316528, "rewards/margins": 0.64255690574646, "rewards/rejected": -2.1143100261688232, "step": 790 }, { "epoch": 0.21, "grad_norm": 7.5, "learning_rate": 4.820688201679605e-06, "logits/chosen": -1.8398478031158447, "logits/rejected": -1.5474001169204712, "logps/chosen": -399.21368408203125, "logps/rejected": -416.2703552246094, "loss": 0.4996, "rewards/accuracies": 0.75, "rewards/chosen": -1.3657824993133545, "rewards/margins": 0.7942038774490356, "rewards/rejected": -2.1599864959716797, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": -1.6709563732147217, "eval_logits/rejected": -1.546115756034851, "eval_logps/chosen": -409.65435791015625, "eval_logps/rejected": -460.5684814453125, "eval_loss": 0.543312132358551, "eval_rewards/accuracies": 0.7070000171661377, "eval_rewards/chosen": -1.4500234127044678, "eval_rewards/margins": 0.7095751166343689, "eval_rewards/rejected": -2.1595985889434814, "eval_runtime": 385.2124, "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 800 }, { "epoch": 0.21, "grad_norm": 7.34375, "learning_rate": 4.8120956574422315e-06, "logits/chosen": -1.7810264825820923, "logits/rejected": -1.7489475011825562, "logps/chosen": -431.69219970703125, "logps/rejected": -477.871337890625, "loss": 0.6275, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5776736736297607, "rewards/margins": 0.5325725674629211, "rewards/rejected": -2.110246181488037, "step": 810 }, { "epoch": 0.21, "grad_norm": 13.625, "learning_rate": 4.803310053882831e-06, "logits/chosen": -1.7703052759170532, "logits/rejected": -1.7803173065185547, "logps/chosen": -363.9437561035156, "logps/rejected": -435.0057678222656, "loss": 0.5542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3461793661117554, "rewards/margins": 0.5871396064758301, "rewards/rejected": -1.933318853378296, "step": 820 }, { "epoch": 0.22, "grad_norm": 6.875, "learning_rate": 4.794332124596775e-06, "logits/chosen": -1.8022472858428955, "logits/rejected": -1.6746841669082642, "logps/chosen": -397.36090087890625, "logps/rejected": -445.603759765625, "loss": 0.5885, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2891987562179565, "rewards/margins": 0.5191696882247925, "rewards/rejected": -1.808368444442749, "step": 830 }, { "epoch": 0.22, "grad_norm": 9.375, "learning_rate": 4.785162619238575e-06, "logits/chosen": -1.7888991832733154, "logits/rejected": -1.6187770366668701, "logps/chosen": -355.0903015136719, "logps/rejected": -387.1643981933594, "loss": 0.5416, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.012415885925293, "rewards/margins": 0.5939286947250366, "rewards/rejected": -1.6063445806503296, "step": 840 }, { "epoch": 0.22, "grad_norm": 6.78125, "learning_rate": 4.775802303459288e-06, "logits/chosen": -1.7059911489486694, "logits/rejected": -1.6270997524261475, "logps/chosen": -346.2181091308594, "logps/rejected": -401.40069580078125, "loss": 0.5465, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9845376014709473, "rewards/margins": 0.5990740656852722, "rewards/rejected": -1.5836117267608643, "step": 850 }, { "epoch": 0.23, "grad_norm": 11.125, "learning_rate": 4.766251958842589e-06, "logits/chosen": -1.676922082901001, "logits/rejected": -1.5429388284683228, "logps/chosen": -394.45416259765625, "logps/rejected": -433.03369140625, "loss": 0.5815, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.219531774520874, "rewards/margins": 0.49049144983291626, "rewards/rejected": -1.710023283958435, "step": 860 }, { "epoch": 0.23, "grad_norm": 6.78125, "learning_rate": 4.7565123828395066e-06, "logits/chosen": -1.5784261226654053, "logits/rejected": -1.5068719387054443, "logps/chosen": -391.16192626953125, "logps/rejected": -455.4800720214844, "loss": 0.531, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3523309230804443, "rewards/margins": 0.6492956280708313, "rewards/rejected": -2.001626491546631, "step": 870 }, { "epoch": 0.23, "grad_norm": 6.65625, "learning_rate": 4.746584388701831e-06, "logits/chosen": -1.6509666442871094, "logits/rejected": -1.5814907550811768, "logps/chosen": -408.57598876953125, "logps/rejected": -468.8497619628906, "loss": 0.5239, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4533666372299194, "rewards/margins": 0.7448235750198364, "rewards/rejected": -2.198190212249756, "step": 880 }, { "epoch": 0.23, "grad_norm": 9.5625, "learning_rate": 4.736468805414218e-06, "logits/chosen": -1.6324241161346436, "logits/rejected": -1.6051101684570312, "logps/chosen": -362.0763244628906, "logps/rejected": -444.11077880859375, "loss": 0.5667, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1464101076126099, "rewards/margins": 0.6818080544471741, "rewards/rejected": -1.8282181024551392, "step": 890 }, { "epoch": 0.24, "grad_norm": 12.125, "learning_rate": 4.7261664776249595e-06, "logits/chosen": -1.5433322191238403, "logits/rejected": -1.4583094120025635, "logps/chosen": -336.41778564453125, "logps/rejected": -401.85772705078125, "loss": 0.514, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0750483274459839, "rewards/margins": 0.7443105578422546, "rewards/rejected": -1.8193588256835938, "step": 900 }, { "epoch": 0.24, "eval_logits/chosen": -1.621368169784546, "eval_logits/rejected": -1.5014086961746216, "eval_logps/chosen": -391.2229919433594, "eval_logps/rejected": -436.3040771484375, "eval_loss": 0.5440120697021484, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": -1.2657097578048706, "eval_rewards/margins": 0.6512450575828552, "eval_rewards/rejected": -1.916954755783081, "eval_runtime": 385.3527, "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 900 }, { "epoch": 0.24, "grad_norm": 7.8125, "learning_rate": 4.715678265575463e-06, "logits/chosen": -1.7400833368301392, "logits/rejected": -1.5401082038879395, "logps/chosen": -410.2032775878906, "logps/rejected": -411.843994140625, "loss": 0.556, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2593110799789429, "rewards/margins": 0.5718873739242554, "rewards/rejected": -1.8311984539031982, "step": 910 }, { "epoch": 0.24, "grad_norm": 9.3125, "learning_rate": 4.705005045028415e-06, "logits/chosen": -1.6306053400039673, "logits/rejected": -1.5210235118865967, "logps/chosen": -400.1542053222656, "logps/rejected": -448.408447265625, "loss": 0.5563, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.370181679725647, "rewards/margins": 0.6565110087394714, "rewards/rejected": -2.0266928672790527, "step": 920 }, { "epoch": 0.24, "grad_norm": 10.4375, "learning_rate": 4.694147707194659e-06, "logits/chosen": -1.6995433568954468, "logits/rejected": -1.6389293670654297, "logps/chosen": -427.10137939453125, "logps/rejected": -471.07952880859375, "loss": 0.5469, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5522325038909912, "rewards/margins": 0.6380002498626709, "rewards/rejected": -2.190232753753662, "step": 930 }, { "epoch": 0.25, "grad_norm": 6.65625, "learning_rate": 4.683107158658782e-06, "logits/chosen": -1.6130173206329346, "logits/rejected": -1.5491468906402588, "logps/chosen": -439.54248046875, "logps/rejected": -480.834228515625, "loss": 0.512, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4930182695388794, "rewards/margins": 0.7003245949745178, "rewards/rejected": -2.193342924118042, "step": 940 }, { "epoch": 0.25, "grad_norm": 11.3125, "learning_rate": 4.671884321303407e-06, "logits/chosen": -1.6797221899032593, "logits/rejected": -1.5230547189712524, "logps/chosen": -394.5656433105469, "logps/rejected": -453.25946044921875, "loss": 0.5134, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4823963642120361, "rewards/margins": 0.7305151224136353, "rewards/rejected": -2.212911605834961, "step": 950 }, { "epoch": 0.25, "grad_norm": 7.9375, "learning_rate": 4.660480132232224e-06, "logits/chosen": -1.7173080444335938, "logits/rejected": -1.60665762424469, "logps/chosen": -406.39117431640625, "logps/rejected": -445.9922790527344, "loss": 0.5666, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3457807302474976, "rewards/margins": 0.637101411819458, "rewards/rejected": -1.9828822612762451, "step": 960 }, { "epoch": 0.25, "grad_norm": 8.5625, "learning_rate": 4.6488955436917414e-06, "logits/chosen": -1.7457382678985596, "logits/rejected": -1.5430558919906616, "logps/chosen": -429.39300537109375, "logps/rejected": -465.61224365234375, "loss": 0.5461, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.49411940574646, "rewards/margins": 0.8279851078987122, "rewards/rejected": -2.3221046924591064, "step": 970 }, { "epoch": 0.26, "grad_norm": 7.03125, "learning_rate": 4.6371315229917644e-06, "logits/chosen": -1.7286710739135742, "logits/rejected": -1.5955041646957397, "logps/chosen": -443.83270263671875, "logps/rejected": -498.59967041015625, "loss": 0.5188, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6041603088378906, "rewards/margins": 0.7516692876815796, "rewards/rejected": -2.3558297157287598, "step": 980 }, { "epoch": 0.26, "grad_norm": 10.625, "learning_rate": 4.625189052424638e-06, "logits/chosen": -1.6606595516204834, "logits/rejected": -1.5426713228225708, "logps/chosen": -412.0262145996094, "logps/rejected": -478.1866149902344, "loss": 0.4696, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7292293310165405, "rewards/margins": 0.888912558555603, "rewards/rejected": -2.6181421279907227, "step": 990 }, { "epoch": 0.26, "grad_norm": 7.46875, "learning_rate": 4.613069129183218e-06, "logits/chosen": -1.7503217458724976, "logits/rejected": -1.6148483753204346, "logps/chosen": -452.8263244628906, "logps/rejected": -481.222900390625, "loss": 0.5468, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4942306280136108, "rewards/margins": 0.6457923054695129, "rewards/rejected": -2.1400229930877686, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": -1.5656111240386963, "eval_logits/rejected": -1.4448813199996948, "eval_logps/chosen": -401.67669677734375, "eval_logps/rejected": -451.64080810546875, "eval_loss": 0.5418093204498291, "eval_rewards/accuracies": 0.7174999713897705, "eval_rewards/chosen": -1.3702467679977417, "eval_rewards/margins": 0.7000752091407776, "eval_rewards/rejected": -2.070322036743164, "eval_runtime": 385.2164, "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 1000 }, { "epoch": 0.26, "grad_norm": 7.8125, "learning_rate": 4.600772765277607e-06, "logits/chosen": -1.531764268875122, "logits/rejected": -1.4728986024856567, "logps/chosen": -375.1974792480469, "logps/rejected": -444.0108337402344, "loss": 0.5138, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4164024591445923, "rewards/margins": 0.7396863698959351, "rewards/rejected": -2.1560888290405273, "step": 1010 }, { "epoch": 0.27, "grad_norm": 16.25, "learning_rate": 4.588300987450652e-06, "logits/chosen": -1.6515556573867798, "logits/rejected": -1.547123670578003, "logps/chosen": -394.8990173339844, "logps/rejected": -431.17645263671875, "loss": 0.5418, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4115506410598755, "rewards/margins": 0.6983591318130493, "rewards/rejected": -2.1099095344543457, "step": 1020 }, { "epoch": 0.27, "grad_norm": 6.875, "learning_rate": 4.5756548370922136e-06, "logits/chosen": -1.6495920419692993, "logits/rejected": -1.5659213066101074, "logps/chosen": -351.00146484375, "logps/rejected": -412.519287109375, "loss": 0.5127, "rewards/accuracies": 0.75, "rewards/chosen": -1.1029760837554932, "rewards/margins": 0.7049504518508911, "rewards/rejected": -1.8079265356063843, "step": 1030 }, { "epoch": 0.27, "grad_norm": 9.0625, "learning_rate": 4.562835370152206e-06, "logits/chosen": -1.7497320175170898, "logits/rejected": -1.5089380741119385, "logps/chosen": -426.8157653808594, "logps/rejected": -491.87860107421875, "loss": 0.473, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2558623552322388, "rewards/margins": 0.9615765810012817, "rewards/rejected": -2.2174386978149414, "step": 1040 }, { "epoch": 0.27, "grad_norm": 7.0625, "learning_rate": 4.54984365705243e-06, "logits/chosen": -1.6929643154144287, "logits/rejected": -1.5880625247955322, "logps/chosen": -421.701416015625, "logps/rejected": -518.8970947265625, "loss": 0.4784, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5323131084442139, "rewards/margins": 1.0377166271209717, "rewards/rejected": -2.5700297355651855, "step": 1050 }, { "epoch": 0.28, "grad_norm": 12.25, "learning_rate": 4.536680782597191e-06, "logits/chosen": -1.5793530941009521, "logits/rejected": -1.503025770187378, "logps/chosen": -413.30792236328125, "logps/rejected": -483.18048095703125, "loss": 0.5921, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7728277444839478, "rewards/margins": 0.7549096345901489, "rewards/rejected": -2.5277373790740967, "step": 1060 }, { "epoch": 0.28, "grad_norm": 15.3125, "learning_rate": 4.523347845882718e-06, "logits/chosen": -1.6937191486358643, "logits/rejected": -1.5083749294281006, "logps/chosen": -422.14447021484375, "logps/rejected": -479.6094665527344, "loss": 0.4495, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3397438526153564, "rewards/margins": 1.0674594640731812, "rewards/rejected": -2.407203197479248, "step": 1070 }, { "epoch": 0.28, "grad_norm": 5.375, "learning_rate": 4.50984596020539e-06, "logits/chosen": -1.544276475906372, "logits/rejected": -1.4562034606933594, "logps/chosen": -403.8301696777344, "logps/rejected": -444.5962829589844, "loss": 0.573, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.236783504486084, "rewards/margins": 0.7301002740859985, "rewards/rejected": -1.9668840169906616, "step": 1080 }, { "epoch": 0.29, "grad_norm": 7.40625, "learning_rate": 4.4961762529687745e-06, "logits/chosen": -1.6948843002319336, "logits/rejected": -1.5669870376586914, "logps/chosen": -365.44342041015625, "logps/rejected": -436.5625915527344, "loss": 0.5044, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0504177808761597, "rewards/margins": 0.8762027621269226, "rewards/rejected": -1.9266207218170166, "step": 1090 }, { "epoch": 0.29, "grad_norm": 9.6875, "learning_rate": 4.482339865589492e-06, "logits/chosen": -1.6588748693466187, "logits/rejected": -1.5048010349273682, "logps/chosen": -401.0564270019531, "logps/rejected": -414.84466552734375, "loss": 0.569, "rewards/accuracies": 0.75, "rewards/chosen": -1.3476970195770264, "rewards/margins": 0.5775381922721863, "rewards/rejected": -1.925235390663147, "step": 1100 }, { "epoch": 0.29, "eval_logits/chosen": -1.5524324178695679, "eval_logits/rejected": -1.427809476852417, "eval_logps/chosen": -378.61767578125, "eval_logps/rejected": -430.84136962890625, "eval_loss": 0.5299040675163269, "eval_rewards/accuracies": 0.7210000157356262, "eval_rewards/chosen": -1.1396570205688477, "eval_rewards/margins": 0.7226706147193909, "eval_rewards/rejected": -1.8623274564743042, "eval_runtime": 385.4496, "eval_samples_per_second": 5.189, "eval_steps_per_second": 0.649, "step": 1100 }, { "epoch": 0.29, "grad_norm": 5.6875, "learning_rate": 4.468337953401909e-06, "logits/chosen": -1.661257028579712, "logits/rejected": -1.5975781679153442, "logps/chosen": -380.5933837890625, "logps/rejected": -433.12139892578125, "loss": 0.5657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1024227142333984, "rewards/margins": 0.5738898515701294, "rewards/rejected": -1.6763126850128174, "step": 1110 }, { "epoch": 0.29, "grad_norm": 7.34375, "learning_rate": 4.45417168556166e-06, "logits/chosen": -1.5824635028839111, "logits/rejected": -1.4781149625778198, "logps/chosen": -340.6497497558594, "logps/rejected": -407.69293212890625, "loss": 0.5255, "rewards/accuracies": 0.75, "rewards/chosen": -0.9949854016304016, "rewards/margins": 0.6768967509269714, "rewards/rejected": -1.6718822717666626, "step": 1120 }, { "epoch": 0.3, "grad_norm": 10.5625, "learning_rate": 4.439842244948036e-06, "logits/chosen": -1.5540910959243774, "logits/rejected": -1.4291226863861084, "logps/chosen": -390.7538757324219, "logps/rejected": -446.49310302734375, "loss": 0.5752, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.296918511390686, "rewards/margins": 0.6129493117332458, "rewards/rejected": -1.9098678827285767, "step": 1130 }, { "epoch": 0.3, "grad_norm": 14.125, "learning_rate": 4.425350828065204e-06, "logits/chosen": -1.6088273525238037, "logits/rejected": -1.3946729898452759, "logps/chosen": -412.3367614746094, "logps/rejected": -442.0401916503906, "loss": 0.5089, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2825069427490234, "rewards/margins": 0.7757080793380737, "rewards/rejected": -2.0582151412963867, "step": 1140 }, { "epoch": 0.3, "grad_norm": 7.875, "learning_rate": 4.410698644942303e-06, "logits/chosen": -1.6174886226654053, "logits/rejected": -1.4844688177108765, "logps/chosen": -402.29486083984375, "logps/rejected": -463.25689697265625, "loss": 0.4913, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.299822211265564, "rewards/margins": 0.8488262891769409, "rewards/rejected": -2.148648738861084, "step": 1150 }, { "epoch": 0.3, "grad_norm": 11.125, "learning_rate": 4.395886919032406e-06, "logits/chosen": -1.4636362791061401, "logits/rejected": -1.3575894832611084, "logps/chosen": -405.80010986328125, "logps/rejected": -456.88641357421875, "loss": 0.5316, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4196369647979736, "rewards/margins": 0.7757617235183716, "rewards/rejected": -2.1953988075256348, "step": 1160 }, { "epoch": 0.31, "grad_norm": 6.6875, "learning_rate": 4.380916887110366e-06, "logits/chosen": -1.6339868307113647, "logits/rejected": -1.4374290704727173, "logps/chosen": -406.9070739746094, "logps/rejected": -451.3981018066406, "loss": 0.5169, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.477386474609375, "rewards/margins": 0.8484745025634766, "rewards/rejected": -2.3258609771728516, "step": 1170 }, { "epoch": 0.31, "grad_norm": 6.84375, "learning_rate": 4.365789799169539e-06, "logits/chosen": -1.4347012042999268, "logits/rejected": -1.4834723472595215, "logps/chosen": -395.71014404296875, "logps/rejected": -475.1640625, "loss": 0.5232, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4909913539886475, "rewards/margins": 0.7410578727722168, "rewards/rejected": -2.2320492267608643, "step": 1180 }, { "epoch": 0.31, "grad_norm": 6.78125, "learning_rate": 4.350506918317416e-06, "logits/chosen": -1.6247329711914062, "logits/rejected": -1.4631903171539307, "logps/chosen": -389.4300842285156, "logps/rejected": -455.31573486328125, "loss": 0.5133, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4183425903320312, "rewards/margins": 0.7307096719741821, "rewards/rejected": -2.149052143096924, "step": 1190 }, { "epoch": 0.31, "grad_norm": 6.15625, "learning_rate": 4.335069520670149e-06, "logits/chosen": -1.4696300029754639, "logits/rejected": -1.3941162824630737, "logps/chosen": -352.44671630859375, "logps/rejected": -424.1249084472656, "loss": 0.5732, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2273385524749756, "rewards/margins": 0.6349586248397827, "rewards/rejected": -1.8622970581054688, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": -1.4804484844207764, "eval_logits/rejected": -1.3595802783966064, "eval_logps/chosen": -375.21826171875, "eval_logps/rejected": -427.4810485839844, "eval_loss": 0.5184832811355591, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": -1.1056623458862305, "eval_rewards/margins": 0.7230623364448547, "eval_rewards/rejected": -1.82872474193573, "eval_runtime": 385.0476, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 1200 }, { "epoch": 0.32, "grad_norm": 9.8125, "learning_rate": 4.319478895246e-06, "logits/chosen": -1.5287452936172485, "logits/rejected": -1.3607852458953857, "logps/chosen": -350.2371520996094, "logps/rejected": -398.1286315917969, "loss": 0.5104, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0312172174453735, "rewards/margins": 0.7258288264274597, "rewards/rejected": -1.757046103477478, "step": 1210 }, { "epoch": 0.32, "grad_norm": 13.0, "learning_rate": 4.303736343857704e-06, "logits/chosen": -1.5342817306518555, "logits/rejected": -1.4489666223526, "logps/chosen": -372.7054138183594, "logps/rejected": -475.97601318359375, "loss": 0.5008, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1559852361679077, "rewards/margins": 0.9135689735412598, "rewards/rejected": -2.069554328918457, "step": 1220 }, { "epoch": 0.32, "grad_norm": 8.125, "learning_rate": 4.287843181003772e-06, "logits/chosen": -1.5427916049957275, "logits/rejected": -1.3855717182159424, "logps/chosen": -458.01641845703125, "logps/rejected": -475.8519592285156, "loss": 0.5884, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6416466236114502, "rewards/margins": 0.6417607665061951, "rewards/rejected": -2.283407211303711, "step": 1230 }, { "epoch": 0.32, "grad_norm": 6.46875, "learning_rate": 4.27180073375873e-06, "logits/chosen": -1.5489182472229004, "logits/rejected": -1.402178168296814, "logps/chosen": -442.7936096191406, "logps/rejected": -477.34515380859375, "loss": 0.5287, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5542631149291992, "rewards/margins": 0.7610459327697754, "rewards/rejected": -2.3153088092803955, "step": 1240 }, { "epoch": 0.33, "grad_norm": 4.4375, "learning_rate": 4.255610341662304e-06, "logits/chosen": -1.6110093593597412, "logits/rejected": -1.398992896080017, "logps/chosen": -380.208740234375, "logps/rejected": -425.40838623046875, "loss": 0.5553, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2492074966430664, "rewards/margins": 0.6511562466621399, "rewards/rejected": -1.900363564491272, "step": 1250 }, { "epoch": 0.33, "grad_norm": 6.84375, "learning_rate": 4.2392733566075764e-06, "logits/chosen": -1.59576416015625, "logits/rejected": -1.4599517583847046, "logps/chosen": -401.14984130859375, "logps/rejected": -438.921630859375, "loss": 0.591, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4514347314834595, "rewards/margins": 0.5331937670707703, "rewards/rejected": -1.984628438949585, "step": 1260 }, { "epoch": 0.33, "grad_norm": 9.875, "learning_rate": 4.2227911427280975e-06, "logits/chosen": -1.5509364604949951, "logits/rejected": -1.3630738258361816, "logps/chosen": -384.2834777832031, "logps/rejected": -420.7542419433594, "loss": 0.5353, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.339779257774353, "rewards/margins": 0.6931589841842651, "rewards/rejected": -2.0329384803771973, "step": 1270 }, { "epoch": 0.33, "grad_norm": 9.1875, "learning_rate": 4.206165076283983e-06, "logits/chosen": -1.5844643115997314, "logits/rejected": -1.4324209690093994, "logps/chosen": -375.78973388671875, "logps/rejected": -440.9784240722656, "loss": 0.4792, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.29856538772583, "rewards/margins": 0.8626803159713745, "rewards/rejected": -2.161245584487915, "step": 1280 }, { "epoch": 0.34, "grad_norm": 10.5625, "learning_rate": 4.189396545546995e-06, "logits/chosen": -1.5281752347946167, "logits/rejected": -1.4283504486083984, "logps/chosen": -397.5606384277344, "logps/rejected": -468.21002197265625, "loss": 0.5202, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4838616847991943, "rewards/margins": 0.9059172868728638, "rewards/rejected": -2.3897788524627686, "step": 1290 }, { "epoch": 0.34, "grad_norm": 10.9375, "learning_rate": 4.172486950684627e-06, "logits/chosen": -1.480257511138916, "logits/rejected": -1.4012019634246826, "logps/chosen": -429.61181640625, "logps/rejected": -510.66522216796875, "loss": 0.5332, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7596956491470337, "rewards/margins": 0.8419567942619324, "rewards/rejected": -2.6016526222229004, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": -1.30724036693573, "eval_logits/rejected": -1.1976608037948608, "eval_logps/chosen": -478.32550048828125, "eval_logps/rejected": -549.7024536132812, "eval_loss": 0.5315085053443909, "eval_rewards/accuracies": 0.7239999771118164, "eval_rewards/chosen": -2.1367344856262207, "eval_rewards/margins": 0.9142037630081177, "eval_rewards/rejected": -3.050938367843628, "eval_runtime": 385.0593, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 1300 }, { "epoch": 0.34, "grad_norm": 12.6875, "learning_rate": 4.155437703643182e-06, "logits/chosen": -1.4552199840545654, "logits/rejected": -1.306873083114624, "logps/chosen": -439.85382080078125, "logps/rejected": -500.3904724121094, "loss": 0.5037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9311021566390991, "rewards/margins": 0.8962618708610535, "rewards/rejected": -2.827363967895508, "step": 1310 }, { "epoch": 0.35, "grad_norm": 12.8125, "learning_rate": 4.138250228029882e-06, "logits/chosen": -1.482912302017212, "logits/rejected": -1.403141736984253, "logps/chosen": -424.140380859375, "logps/rejected": -514.3765869140625, "loss": 0.5066, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6720972061157227, "rewards/margins": 0.8676088452339172, "rewards/rejected": -2.539705753326416, "step": 1320 }, { "epoch": 0.35, "grad_norm": 6.9375, "learning_rate": 4.120925958993994e-06, "logits/chosen": -1.4682929515838623, "logits/rejected": -1.3645504713058472, "logps/chosen": -376.16033935546875, "logps/rejected": -447.6339416503906, "loss": 0.5583, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4225904941558838, "rewards/margins": 0.7580591440200806, "rewards/rejected": -2.180649757385254, "step": 1330 }, { "epoch": 0.35, "grad_norm": 10.8125, "learning_rate": 4.103466343106999e-06, "logits/chosen": -1.5599358081817627, "logits/rejected": -1.4370046854019165, "logps/chosen": -424.14849853515625, "logps/rejected": -472.6615295410156, "loss": 0.5315, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5476281642913818, "rewards/margins": 0.743899941444397, "rewards/rejected": -2.2915279865264893, "step": 1340 }, { "epoch": 0.35, "grad_norm": 8.625, "learning_rate": 4.085872838241797e-06, "logits/chosen": -1.464450716972351, "logits/rejected": -1.3373545408248901, "logps/chosen": -405.13262939453125, "logps/rejected": -447.93994140625, "loss": 0.5899, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.427685022354126, "rewards/margins": 0.6289039850234985, "rewards/rejected": -2.056588649749756, "step": 1350 }, { "epoch": 0.36, "grad_norm": 9.75, "learning_rate": 4.06814691345098e-06, "logits/chosen": -1.452643871307373, "logits/rejected": -1.2927871942520142, "logps/chosen": -378.2747497558594, "logps/rejected": -437.4178161621094, "loss": 0.4989, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2272692918777466, "rewards/margins": 0.8121210336685181, "rewards/rejected": -2.0393900871276855, "step": 1360 }, { "epoch": 0.36, "grad_norm": 10.4375, "learning_rate": 4.050290048844171e-06, "logits/chosen": -1.572665810585022, "logits/rejected": -1.4710958003997803, "logps/chosen": -398.8462829589844, "logps/rejected": -468.70794677734375, "loss": 0.5368, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.30232834815979, "rewards/margins": 0.750015139579773, "rewards/rejected": -2.0523436069488525, "step": 1370 }, { "epoch": 0.36, "grad_norm": 7.46875, "learning_rate": 4.032303735464422e-06, "logits/chosen": -1.6318330764770508, "logits/rejected": -1.4836442470550537, "logps/chosen": -405.6830749511719, "logps/rejected": -475.7168884277344, "loss": 0.4568, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.337021827697754, "rewards/margins": 0.9311949610710144, "rewards/rejected": -2.268216609954834, "step": 1380 }, { "epoch": 0.36, "grad_norm": 9.6875, "learning_rate": 4.014189475163727e-06, "logits/chosen": -1.4534022808074951, "logits/rejected": -1.3461982011795044, "logps/chosen": -380.7342224121094, "logps/rejected": -464.63916015625, "loss": 0.4968, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2607730627059937, "rewards/margins": 0.9202286005020142, "rewards/rejected": -2.181001663208008, "step": 1390 }, { "epoch": 0.37, "grad_norm": 15.0625, "learning_rate": 3.995948780477605e-06, "logits/chosen": -1.5742177963256836, "logits/rejected": -1.410463809967041, "logps/chosen": -382.19830322265625, "logps/rejected": -427.6187438964844, "loss": 0.5431, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1344490051269531, "rewards/margins": 0.7007244825363159, "rewards/rejected": -1.8351733684539795, "step": 1400 }, { "epoch": 0.37, "eval_logits/chosen": -1.431371808052063, "eval_logits/rejected": -1.3129903078079224, "eval_logps/chosen": -390.28460693359375, "eval_logps/rejected": -454.35223388671875, "eval_loss": 0.521051287651062, "eval_rewards/accuracies": 0.7260000109672546, "eval_rewards/chosen": -1.2563258409500122, "eval_rewards/margins": 0.841110348701477, "eval_rewards/rejected": -2.09743595123291, "eval_runtime": 385.3298, "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 1400 }, { "epoch": 0.37, "grad_norm": 10.875, "learning_rate": 3.977583174498816e-06, "logits/chosen": -1.4515248537063599, "logits/rejected": -1.3351339101791382, "logps/chosen": -412.0836486816406, "logps/rejected": -511.7002868652344, "loss": 0.3984, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4840004444122314, "rewards/margins": 1.215951681137085, "rewards/rejected": -2.6999518871307373, "step": 1410 }, { "epoch": 0.37, "grad_norm": 10.375, "learning_rate": 3.959094190750172e-06, "logits/chosen": -1.4154666662216187, "logits/rejected": -1.2808506488800049, "logps/chosen": -463.95367431640625, "logps/rejected": -530.5446166992188, "loss": 0.5238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7879329919815063, "rewards/margins": 0.9349812269210815, "rewards/rejected": -2.722913980484009, "step": 1420 }, { "epoch": 0.37, "grad_norm": 14.625, "learning_rate": 3.9404833730564975e-06, "logits/chosen": -1.3400425910949707, "logits/rejected": -1.2239243984222412, "logps/chosen": -414.04168701171875, "logps/rejected": -493.5077209472656, "loss": 0.5162, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6094110012054443, "rewards/margins": 0.910406768321991, "rewards/rejected": -2.51981782913208, "step": 1430 }, { "epoch": 0.38, "grad_norm": 13.0, "learning_rate": 3.921752275415712e-06, "logits/chosen": -1.4123733043670654, "logits/rejected": -1.379097580909729, "logps/chosen": -400.0645751953125, "logps/rejected": -482.3004455566406, "loss": 0.455, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.495286464691162, "rewards/margins": 1.028262734413147, "rewards/rejected": -2.5235490798950195, "step": 1440 }, { "epoch": 0.38, "grad_norm": 6.40625, "learning_rate": 3.902902461869079e-06, "logits/chosen": -1.3998125791549683, "logits/rejected": -1.2797114849090576, "logps/chosen": -421.95794677734375, "logps/rejected": -507.14361572265625, "loss": 0.5415, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8244426250457764, "rewards/margins": 1.0165250301361084, "rewards/rejected": -2.840967893600464, "step": 1450 }, { "epoch": 0.38, "grad_norm": 15.8125, "learning_rate": 3.883935506370605e-06, "logits/chosen": -1.4051461219787598, "logits/rejected": -1.2663236856460571, "logps/chosen": -432.9169921875, "logps/rejected": -484.6170349121094, "loss": 0.5752, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7967593669891357, "rewards/margins": 0.7953070402145386, "rewards/rejected": -2.592066526412964, "step": 1460 }, { "epoch": 0.38, "grad_norm": 5.71875, "learning_rate": 3.864852992655617e-06, "logits/chosen": -1.5188504457473755, "logits/rejected": -1.4224086999893188, "logps/chosen": -385.0553283691406, "logps/rejected": -460.64166259765625, "loss": 0.4617, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.364206075668335, "rewards/margins": 0.8787292242050171, "rewards/rejected": -2.2429351806640625, "step": 1470 }, { "epoch": 0.39, "grad_norm": 6.0, "learning_rate": 3.845656514108516e-06, "logits/chosen": -1.4730474948883057, "logits/rejected": -1.3063628673553467, "logps/chosen": -420.05364990234375, "logps/rejected": -448.61663818359375, "loss": 0.4919, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.609230637550354, "rewards/margins": 0.8354890942573547, "rewards/rejected": -2.4447197914123535, "step": 1480 }, { "epoch": 0.39, "grad_norm": 15.6875, "learning_rate": 3.826347673629738e-06, "logits/chosen": -1.447205901145935, "logits/rejected": -1.2630943059921265, "logps/chosen": -382.7901916503906, "logps/rejected": -455.2850036621094, "loss": 0.4846, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2948672771453857, "rewards/margins": 0.9875162243843079, "rewards/rejected": -2.282383441925049, "step": 1490 }, { "epoch": 0.39, "grad_norm": 13.4375, "learning_rate": 3.8069280835019062e-06, "logits/chosen": -1.4306355714797974, "logits/rejected": -1.2892208099365234, "logps/chosen": -402.4939880371094, "logps/rejected": -487.1109313964844, "loss": 0.4862, "rewards/accuracies": 0.75, "rewards/chosen": -1.38181734085083, "rewards/margins": 1.0380725860595703, "rewards/rejected": -2.4198899269104004, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": -1.4015111923217773, "eval_logits/rejected": -1.2794849872589111, "eval_logps/chosen": -401.4261779785156, "eval_logps/rejected": -472.0146179199219, "eval_loss": 0.5161935091018677, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -1.3677420616149902, "eval_rewards/margins": 0.9063177704811096, "eval_rewards/rejected": -2.274059534072876, "eval_runtime": 384.9141, "eval_samples_per_second": 5.196, "eval_steps_per_second": 0.649, "step": 1500 }, { "epoch": 0.4, "grad_norm": 10.0625, "learning_rate": 3.7873993652552077e-06, "logits/chosen": -1.4077152013778687, "logits/rejected": -1.3199503421783447, "logps/chosen": -359.19488525390625, "logps/rejected": -424.65576171875, "loss": 0.6047, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2548080682754517, "rewards/margins": 0.7129807472229004, "rewards/rejected": -1.9677889347076416, "step": 1510 }, { "epoch": 0.4, "grad_norm": 8.1875, "learning_rate": 3.7677631495319953e-06, "logits/chosen": -1.5366017818450928, "logits/rejected": -1.420841932296753, "logps/chosen": -355.3591613769531, "logps/rejected": -406.9905700683594, "loss": 0.5263, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9167426228523254, "rewards/margins": 0.6963993310928345, "rewards/rejected": -1.6131420135498047, "step": 1520 }, { "epoch": 0.4, "grad_norm": 6.15625, "learning_rate": 3.748021075950633e-06, "logits/chosen": -1.5663089752197266, "logits/rejected": -1.4497790336608887, "logps/chosen": -371.51312255859375, "logps/rejected": -410.3604431152344, "loss": 0.5946, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9686979055404663, "rewards/margins": 0.49481868743896484, "rewards/rejected": -1.4635167121887207, "step": 1530 }, { "epoch": 0.4, "grad_norm": 9.9375, "learning_rate": 3.7281747929685824e-06, "logits/chosen": -1.4247326850891113, "logits/rejected": -1.265855073928833, "logps/chosen": -353.05194091796875, "logps/rejected": -399.3148498535156, "loss": 0.548, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1539690494537354, "rewards/margins": 0.6195784211158752, "rewards/rejected": -1.7735474109649658, "step": 1540 }, { "epoch": 0.41, "grad_norm": 7.59375, "learning_rate": 3.7082259577447604e-06, "logits/chosen": -1.5184131860733032, "logits/rejected": -1.4079376459121704, "logps/chosen": -389.82550048828125, "logps/rejected": -445.1400451660156, "loss": 0.4885, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1863467693328857, "rewards/margins": 0.7531275749206543, "rewards/rejected": -1.9394744634628296, "step": 1550 }, { "epoch": 0.41, "grad_norm": 8.5625, "learning_rate": 3.6881762360011688e-06, "logits/chosen": -1.5098861455917358, "logits/rejected": -1.317479133605957, "logps/chosen": -411.7156677246094, "logps/rejected": -458.67218017578125, "loss": 0.5111, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3021931648254395, "rewards/margins": 0.8395140767097473, "rewards/rejected": -2.141706943511963, "step": 1560 }, { "epoch": 0.41, "grad_norm": 10.8125, "learning_rate": 3.668027301883802e-06, "logits/chosen": -1.4269211292266846, "logits/rejected": -1.2615479230880737, "logps/chosen": -402.62603759765625, "logps/rejected": -482.184326171875, "loss": 0.511, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5126349925994873, "rewards/margins": 0.9446732401847839, "rewards/rejected": -2.457308053970337, "step": 1570 }, { "epoch": 0.41, "grad_norm": 6.09375, "learning_rate": 3.64778083782286e-06, "logits/chosen": -1.2994117736816406, "logits/rejected": -1.2819687128067017, "logps/chosen": -454.22711181640625, "logps/rejected": -568.9495239257812, "loss": 0.5489, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0162034034729004, "rewards/margins": 0.8572282791137695, "rewards/rejected": -2.87343168258667, "step": 1580 }, { "epoch": 0.42, "grad_norm": 8.125, "learning_rate": 3.627438534392268e-06, "logits/chosen": -1.4073131084442139, "logits/rejected": -1.3753129243850708, "logps/chosen": -438.55255126953125, "logps/rejected": -532.357421875, "loss": 0.4994, "rewards/accuracies": 0.75, "rewards/chosen": -2.0050759315490723, "rewards/margins": 0.8708721399307251, "rewards/rejected": -2.875947952270508, "step": 1590 }, { "epoch": 0.42, "grad_norm": 10.5625, "learning_rate": 3.607002090168506e-06, "logits/chosen": -1.2787964344024658, "logits/rejected": -1.2062056064605713, "logps/chosen": -478.2181091308594, "logps/rejected": -532.1177978515625, "loss": 0.5858, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.139094829559326, "rewards/margins": 0.7530891299247742, "rewards/rejected": -2.892183780670166, "step": 1600 }, { "epoch": 0.42, "eval_logits/chosen": -1.2717995643615723, "eval_logits/rejected": -1.1533604860305786, "eval_logps/chosen": -445.6515197753906, "eval_logps/rejected": -514.567138671875, "eval_loss": 0.5072752833366394, "eval_rewards/accuracies": 0.7365000247955322, "eval_rewards/chosen": -1.809995174407959, "eval_rewards/margins": 0.889590322971344, "eval_rewards/rejected": -2.6995856761932373, "eval_runtime": 385.2379, "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 1600 }, { "epoch": 0.42, "grad_norm": 4.8125, "learning_rate": 3.586473211588787e-06, "logits/chosen": -1.3733545541763306, "logits/rejected": -1.2681185007095337, "logps/chosen": -407.07623291015625, "logps/rejected": -509.69683837890625, "loss": 0.4615, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6238105297088623, "rewards/margins": 0.9594193696975708, "rewards/rejected": -2.5832300186157227, "step": 1610 }, { "epoch": 0.42, "grad_norm": 11.125, "learning_rate": 3.5658536128085623e-06, "logits/chosen": -1.3982038497924805, "logits/rejected": -1.2271344661712646, "logps/chosen": -460.24951171875, "logps/rejected": -503.8080139160156, "loss": 0.595, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.9733803272247314, "rewards/margins": 0.7311316728591919, "rewards/rejected": -2.704512119293213, "step": 1620 }, { "epoch": 0.43, "grad_norm": 8.625, "learning_rate": 3.545145015558399e-06, "logits/chosen": -1.1945741176605225, "logits/rejected": -1.1713488101959229, "logps/chosen": -412.6747131347656, "logps/rejected": -492.372802734375, "loss": 0.5028, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8103736639022827, "rewards/margins": 0.9391372799873352, "rewards/rejected": -2.7495107650756836, "step": 1630 }, { "epoch": 0.43, "grad_norm": 8.9375, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -1.3308615684509277, "logits/rejected": -1.2446686029434204, "logps/chosen": -433.484375, "logps/rejected": -507.3377990722656, "loss": 0.5688, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7979129552841187, "rewards/margins": 0.7903792262077332, "rewards/rejected": -2.588292360305786, "step": 1640 }, { "epoch": 0.43, "grad_norm": 7.8125, "learning_rate": 3.503467749582857e-06, "logits/chosen": -1.378259301185608, "logits/rejected": -1.1882727146148682, "logps/chosen": -412.93560791015625, "logps/rejected": -446.2088317871094, "loss": 0.5722, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6171245574951172, "rewards/margins": 0.6728037595748901, "rewards/rejected": -2.2899281978607178, "step": 1650 }, { "epoch": 0.43, "grad_norm": 14.125, "learning_rate": 3.4825025608971947e-06, "logits/chosen": -1.2760392427444458, "logits/rejected": -1.2017720937728882, "logps/chosen": -379.2555847167969, "logps/rejected": -455.60791015625, "loss": 0.5323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.573118805885315, "rewards/margins": 0.7255537509918213, "rewards/rejected": -2.2986724376678467, "step": 1660 }, { "epoch": 0.44, "grad_norm": 7.34375, "learning_rate": 3.4614553335304407e-06, "logits/chosen": -1.3151836395263672, "logits/rejected": -1.113488793373108, "logps/chosen": -440.9947814941406, "logps/rejected": -505.5106506347656, "loss": 0.4714, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7208404541015625, "rewards/margins": 0.9629707336425781, "rewards/rejected": -2.6838109493255615, "step": 1670 }, { "epoch": 0.44, "grad_norm": 7.28125, "learning_rate": 3.4403278249200222e-06, "logits/chosen": -1.289880633354187, "logits/rejected": -1.0922951698303223, "logps/chosen": -470.2997131347656, "logps/rejected": -540.0161743164062, "loss": 0.4409, "rewards/accuracies": 0.75, "rewards/chosen": -1.8769254684448242, "rewards/margins": 1.1163800954818726, "rewards/rejected": -2.9933059215545654, "step": 1680 }, { "epoch": 0.44, "grad_norm": 15.75, "learning_rate": 3.4191217992068293e-06, "logits/chosen": -1.3650540113449097, "logits/rejected": -1.1904373168945312, "logps/chosen": -491.87060546875, "logps/rejected": -539.9581298828125, "loss": 0.5323, "rewards/accuracies": 0.75, "rewards/chosen": -2.137281656265259, "rewards/margins": 0.9366267323493958, "rewards/rejected": -3.0739083290100098, "step": 1690 }, { "epoch": 0.44, "grad_norm": 11.8125, "learning_rate": 3.3978390270879056e-06, "logits/chosen": -1.273272156715393, "logits/rejected": -1.1826374530792236, "logps/chosen": -441.7779235839844, "logps/rejected": -540.5211791992188, "loss": 0.5147, "rewards/accuracies": 0.75, "rewards/chosen": -2.3176229000091553, "rewards/margins": 0.9195470809936523, "rewards/rejected": -3.2371699810028076, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": -1.269109845161438, "eval_logits/rejected": -1.146828293800354, "eval_logps/chosen": -491.4620666503906, "eval_logps/rejected": -566.2828979492188, "eval_loss": 0.5000255107879639, "eval_rewards/accuracies": 0.734000027179718, "eval_rewards/chosen": -2.2681005001068115, "eval_rewards/margins": 0.9486428499221802, "eval_rewards/rejected": -3.2167434692382812, "eval_runtime": 385.0866, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 1700 }, { "epoch": 0.45, "grad_norm": 11.6875, "learning_rate": 3.3764812856685995e-06, "logits/chosen": -1.3418161869049072, "logits/rejected": -1.3261712789535522, "logps/chosen": -440.2900390625, "logps/rejected": -541.0260009765625, "loss": 0.5252, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1573386192321777, "rewards/margins": 0.8722022175788879, "rewards/rejected": -3.029540777206421, "step": 1710 }, { "epoch": 0.45, "grad_norm": 10.875, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -1.4454267024993896, "logits/rejected": -1.311650276184082, "logps/chosen": -485.03411865234375, "logps/rejected": -569.6810913085938, "loss": 0.4849, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1618409156799316, "rewards/margins": 0.9978164434432983, "rewards/rejected": -3.1596572399139404, "step": 1720 }, { "epoch": 0.45, "grad_norm": 8.375, "learning_rate": 3.3335480345008907e-06, "logits/chosen": -1.2839902639389038, "logits/rejected": -1.1861859560012817, "logps/chosen": -466.77850341796875, "logps/rejected": -553.5386352539062, "loss": 0.4622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0662953853607178, "rewards/margins": 1.1144917011260986, "rewards/rejected": -3.1807870864868164, "step": 1730 }, { "epoch": 0.46, "grad_norm": 8.0, "learning_rate": 3.3119761096666055e-06, "logits/chosen": -1.3106259107589722, "logits/rejected": -1.1651959419250488, "logps/chosen": -498.0738830566406, "logps/rejected": -552.7239379882812, "loss": 0.5547, "rewards/accuracies": 0.71875, "rewards/chosen": -2.230546236038208, "rewards/margins": 0.8497620820999146, "rewards/rejected": -3.080308437347412, "step": 1740 }, { "epoch": 0.46, "grad_norm": 7.6875, "learning_rate": 3.290336385060832e-06, "logits/chosen": -1.493554949760437, "logits/rejected": -1.2929532527923584, "logps/chosen": -479.22259521484375, "logps/rejected": -548.3055419921875, "loss": 0.55, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3036773204803467, "rewards/margins": 0.8925528526306152, "rewards/rejected": -3.196229934692383, "step": 1750 }, { "epoch": 0.46, "grad_norm": 9.3125, "learning_rate": 3.268630667594348e-06, "logits/chosen": -1.355196237564087, "logits/rejected": -1.3183298110961914, "logps/chosen": -460.26336669921875, "logps/rejected": -523.685546875, "loss": 0.5176, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.046140670776367, "rewards/margins": 0.8765950202941895, "rewards/rejected": -2.9227356910705566, "step": 1760 }, { "epoch": 0.46, "grad_norm": 11.125, "learning_rate": 3.2468607696883147e-06, "logits/chosen": -1.3625749349594116, "logits/rejected": -1.311535358428955, "logps/chosen": -489.18017578125, "logps/rejected": -587.8863525390625, "loss": 0.4934, "rewards/accuracies": 0.71875, "rewards/chosen": -2.360048294067383, "rewards/margins": 0.9435898065567017, "rewards/rejected": -3.303637742996216, "step": 1770 }, { "epoch": 0.47, "grad_norm": 7.78125, "learning_rate": 3.225028509122944e-06, "logits/chosen": -1.397005319595337, "logits/rejected": -1.2728253602981567, "logps/chosen": -486.8643493652344, "logps/rejected": -561.2532958984375, "loss": 0.5211, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.503471851348877, "rewards/margins": 0.8570526838302612, "rewards/rejected": -3.3605244159698486, "step": 1780 }, { "epoch": 0.47, "grad_norm": 11.6875, "learning_rate": 3.2031357088857083e-06, "logits/chosen": -1.3312914371490479, "logits/rejected": -1.2595702409744263, "logps/chosen": -561.8858032226562, "logps/rejected": -660.8182373046875, "loss": 0.5043, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8668177127838135, "rewards/margins": 1.0241832733154297, "rewards/rejected": -3.8910012245178223, "step": 1790 }, { "epoch": 0.47, "grad_norm": 12.8125, "learning_rate": 3.181184197019127e-06, "logits/chosen": -1.1215088367462158, "logits/rejected": -1.0118662118911743, "logps/chosen": -525.9521484375, "logps/rejected": -697.3963623046875, "loss": 0.4809, "rewards/accuracies": 0.78125, "rewards/chosen": -2.957373857498169, "rewards/margins": 1.4523636102676392, "rewards/rejected": -4.409738063812256, "step": 1800 }, { "epoch": 0.47, "eval_logits/chosen": -1.1786177158355713, "eval_logits/rejected": -1.0616753101348877, "eval_logps/chosen": -557.43115234375, "eval_logps/rejected": -643.640869140625, "eval_loss": 0.5022104382514954, "eval_rewards/accuracies": 0.7404999732971191, "eval_rewards/chosen": -2.9277913570404053, "eval_rewards/margins": 1.062530517578125, "eval_rewards/rejected": -3.9903218746185303, "eval_runtime": 384.8251, "eval_samples_per_second": 5.197, "eval_steps_per_second": 0.65, "step": 1800 }, { "epoch": 0.47, "grad_norm": 18.125, "learning_rate": 3.159175806468126e-06, "logits/chosen": -1.1367595195770264, "logits/rejected": -0.9490365982055664, "logps/chosen": -545.4899291992188, "logps/rejected": -620.2122192382812, "loss": 0.5001, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9681437015533447, "rewards/margins": 1.0490918159484863, "rewards/rejected": -4.01723575592041, "step": 1810 }, { "epoch": 0.48, "grad_norm": 13.375, "learning_rate": 3.1371123749269804e-06, "logits/chosen": -1.2076561450958252, "logits/rejected": -1.135667085647583, "logps/chosen": -596.1036376953125, "logps/rejected": -664.8118896484375, "loss": 0.5596, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.1270740032196045, "rewards/margins": 0.847625732421875, "rewards/rejected": -3.9746997356414795, "step": 1820 }, { "epoch": 0.48, "grad_norm": 11.6875, "learning_rate": 3.114995744685877e-06, "logits/chosen": -1.1738382577896118, "logits/rejected": -1.146437644958496, "logps/chosen": -529.6216430664062, "logps/rejected": -603.9373168945312, "loss": 0.5267, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.8229541778564453, "rewards/margins": 0.8735902905464172, "rewards/rejected": -3.696544647216797, "step": 1830 }, { "epoch": 0.48, "grad_norm": 6.34375, "learning_rate": 3.0928277624770743e-06, "logits/chosen": -1.3653886318206787, "logits/rejected": -1.2098249197006226, "logps/chosen": -532.6870727539062, "logps/rejected": -613.7505493164062, "loss": 0.5049, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4794299602508545, "rewards/margins": 1.0948512554168701, "rewards/rejected": -3.5742812156677246, "step": 1840 }, { "epoch": 0.48, "grad_norm": 6.625, "learning_rate": 3.070610279320708e-06, "logits/chosen": -1.3816752433776855, "logits/rejected": -1.2150719165802002, "logps/chosen": -521.9651489257812, "logps/rejected": -601.0781860351562, "loss": 0.4669, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3488364219665527, "rewards/margins": 1.0314748287200928, "rewards/rejected": -3.3803107738494873, "step": 1850 }, { "epoch": 0.49, "grad_norm": 5.71875, "learning_rate": 3.0483451503702264e-06, "logits/chosen": -1.3038969039916992, "logits/rejected": -1.2319445610046387, "logps/chosen": -547.4259033203125, "logps/rejected": -617.3253784179688, "loss": 0.5618, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.6133077144622803, "rewards/margins": 0.8994197845458984, "rewards/rejected": -3.5127272605895996, "step": 1860 }, { "epoch": 0.49, "grad_norm": 8.1875, "learning_rate": 3.0260342347574916e-06, "logits/chosen": -1.2965396642684937, "logits/rejected": -1.1523797512054443, "logps/chosen": -519.9957275390625, "logps/rejected": -625.9295654296875, "loss": 0.4402, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4765942096710205, "rewards/margins": 1.2496535778045654, "rewards/rejected": -3.726247787475586, "step": 1870 }, { "epoch": 0.49, "grad_norm": 9.8125, "learning_rate": 3.0036793954375358e-06, "logits/chosen": -1.2782443761825562, "logits/rejected": -1.1259523630142212, "logps/chosen": -547.7828979492188, "logps/rejected": -630.2535400390625, "loss": 0.4395, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.7768635749816895, "rewards/margins": 1.2470468282699585, "rewards/rejected": -4.0239105224609375, "step": 1880 }, { "epoch": 0.49, "grad_norm": 11.5, "learning_rate": 2.981282499033009e-06, "logits/chosen": -1.278181791305542, "logits/rejected": -1.1554654836654663, "logps/chosen": -553.5909423828125, "logps/rejected": -634.082275390625, "loss": 0.5183, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.772510051727295, "rewards/margins": 1.059287667274475, "rewards/rejected": -3.8317978382110596, "step": 1890 }, { "epoch": 0.5, "grad_norm": 8.0625, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -1.327986717224121, "logits/rejected": -1.165433645248413, "logps/chosen": -511.99090576171875, "logps/rejected": -616.3585815429688, "loss": 0.46, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3063502311706543, "rewards/margins": 1.2612559795379639, "rewards/rejected": -3.5676064491271973, "step": 1900 }, { "epoch": 0.5, "eval_logits/chosen": -1.2252681255340576, "eval_logits/rejected": -1.1040537357330322, "eval_logps/chosen": -507.9823303222656, "eval_logps/rejected": -594.7523193359375, "eval_loss": 0.5002806782722473, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -2.433302879333496, "eval_rewards/margins": 1.0681343078613281, "eval_rewards/rejected": -3.501437187194824, "eval_runtime": 384.8766, "eval_samples_per_second": 5.196, "eval_steps_per_second": 0.65, "step": 1900 }, { "epoch": 0.5, "grad_norm": 10.0625, "learning_rate": 2.9363700188634597e-06, "logits/chosen": -1.2988775968551636, "logits/rejected": -1.167811632156372, "logps/chosen": -534.3869018554688, "logps/rejected": -595.1586303710938, "loss": 0.518, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6931681632995605, "rewards/margins": 0.9779523611068726, "rewards/rejected": -3.6711204051971436, "step": 1910 }, { "epoch": 0.5, "grad_norm": 11.3125, "learning_rate": 2.9138581852776053e-06, "logits/chosen": -1.2570379972457886, "logits/rejected": -1.1531012058258057, "logps/chosen": -555.2855834960938, "logps/rejected": -654.2891845703125, "loss": 0.508, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.955744981765747, "rewards/margins": 1.1195967197418213, "rewards/rejected": -4.075342178344727, "step": 1920 }, { "epoch": 0.51, "grad_norm": 7.8125, "learning_rate": 2.8913117946523805e-06, "logits/chosen": -1.280539631843567, "logits/rejected": -1.100694179534912, "logps/chosen": -573.8317260742188, "logps/rejected": -636.535400390625, "loss": 0.4979, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.0741798877716064, "rewards/margins": 1.0079572200775146, "rewards/rejected": -4.082137107849121, "step": 1930 }, { "epoch": 0.51, "grad_norm": 9.9375, "learning_rate": 2.8687327296049126e-06, "logits/chosen": -1.2726280689239502, "logits/rejected": -1.171382188796997, "logps/chosen": -556.9118041992188, "logps/rejected": -646.0772705078125, "loss": 0.5218, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.977890968322754, "rewards/margins": 0.9889172315597534, "rewards/rejected": -3.966808319091797, "step": 1940 }, { "epoch": 0.51, "grad_norm": 13.25, "learning_rate": 2.8461228754806376e-06, "logits/chosen": -1.3368163108825684, "logits/rejected": -1.172978401184082, "logps/chosen": -542.0377807617188, "logps/rejected": -597.8560180664062, "loss": 0.5274, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6624741554260254, "rewards/margins": 0.8161913752555847, "rewards/rejected": -3.478665590286255, "step": 1950 }, { "epoch": 0.51, "grad_norm": 8.5625, "learning_rate": 2.823484120195865e-06, "logits/chosen": -1.4352657794952393, "logits/rejected": -1.227199912071228, "logps/chosen": -520.835693359375, "logps/rejected": -587.2822265625, "loss": 0.4585, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4302444458007812, "rewards/margins": 1.0082801580429077, "rewards/rejected": -3.4385247230529785, "step": 1960 }, { "epoch": 0.52, "grad_norm": 10.3125, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -1.293084979057312, "logits/rejected": -1.148153305053711, "logps/chosen": -520.2894897460938, "logps/rejected": -565.23681640625, "loss": 0.4997, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4665493965148926, "rewards/margins": 0.9034391641616821, "rewards/rejected": -3.369988203048706, "step": 1970 }, { "epoch": 0.52, "grad_norm": 13.1875, "learning_rate": 2.7781274697184353e-06, "logits/chosen": -1.1424802541732788, "logits/rejected": -1.187720775604248, "logps/chosen": -492.9554138183594, "logps/rejected": -617.7970581054688, "loss": 0.5349, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.6359786987304688, "rewards/margins": 1.0452814102172852, "rewards/rejected": -3.681259870529175, "step": 1980 }, { "epoch": 0.52, "grad_norm": 9.375, "learning_rate": 2.7554133617930397e-06, "logits/chosen": -1.2500625848770142, "logits/rejected": -1.1256784200668335, "logps/chosen": -501.5577087402344, "logps/rejected": -588.8922119140625, "loss": 0.5168, "rewards/accuracies": 0.71875, "rewards/chosen": -2.537215232849121, "rewards/margins": 1.0240715742111206, "rewards/rejected": -3.5612869262695312, "step": 1990 }, { "epoch": 0.52, "grad_norm": 11.625, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -1.436962366104126, "logits/rejected": -1.266498327255249, "logps/chosen": -528.1736450195312, "logps/rejected": -578.77734375, "loss": 0.477, "rewards/accuracies": 0.75, "rewards/chosen": -2.388388156890869, "rewards/margins": 1.0654609203338623, "rewards/rejected": -3.4538490772247314, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": -1.2391676902770996, "eval_logits/rejected": -1.1185089349746704, "eval_logps/chosen": -503.76922607421875, "eval_logps/rejected": -583.5771484375, "eval_loss": 0.4988709092140198, "eval_rewards/accuracies": 0.734499990940094, "eval_rewards/chosen": -2.39117169380188, "eval_rewards/margins": 0.9985132813453674, "eval_rewards/rejected": -3.3896851539611816, "eval_runtime": 385.1549, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 2000 }, { "epoch": 0.53, "grad_norm": 10.375, "learning_rate": 2.7099230635178954e-06, "logits/chosen": -1.280256748199463, "logits/rejected": -1.239262342453003, "logps/chosen": -499.21240234375, "logps/rejected": -584.2531127929688, "loss": 0.5227, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.3436267375946045, "rewards/margins": 0.9151015281677246, "rewards/rejected": -3.25872802734375, "step": 2010 }, { "epoch": 0.53, "grad_norm": 10.625, "learning_rate": 2.6871506715949608e-06, "logits/chosen": -1.4013721942901611, "logits/rejected": -1.2793995141983032, "logps/chosen": -463.5269470214844, "logps/rejected": -541.9952392578125, "loss": 0.4813, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1055784225463867, "rewards/margins": 0.965211033821106, "rewards/rejected": -3.070789337158203, "step": 2020 }, { "epoch": 0.53, "grad_norm": 10.25, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -1.4540785551071167, "logits/rejected": -1.2950793504714966, "logps/chosen": -521.9766235351562, "logps/rejected": -591.3455810546875, "loss": 0.4591, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.2707479000091553, "rewards/margins": 1.1174595355987549, "rewards/rejected": -3.388207197189331, "step": 2030 }, { "epoch": 0.53, "grad_norm": 12.375, "learning_rate": 2.6415609094604562e-06, "logits/chosen": -1.2611262798309326, "logits/rejected": -1.2067164182662964, "logps/chosen": -539.65869140625, "logps/rejected": -629.5203857421875, "loss": 0.4464, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6909213066101074, "rewards/margins": 1.0888901948928833, "rewards/rejected": -3.7798118591308594, "step": 2040 }, { "epoch": 0.54, "grad_norm": 14.375, "learning_rate": 2.618747345980904e-06, "logits/chosen": -1.2389599084854126, "logits/rejected": -1.0517133474349976, "logps/chosen": -593.0328369140625, "logps/rejected": -635.1866455078125, "loss": 0.5624, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.446354627609253, "rewards/margins": 0.9264065027236938, "rewards/rejected": -4.3727617263793945, "step": 2050 }, { "epoch": 0.54, "grad_norm": 6.5, "learning_rate": 2.595923867132136e-06, "logits/chosen": -1.2825162410736084, "logits/rejected": -1.1602712869644165, "logps/chosen": -608.6810302734375, "logps/rejected": -699.3939819335938, "loss": 0.5003, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.277606964111328, "rewards/margins": 1.153955101966858, "rewards/rejected": -4.4315619468688965, "step": 2060 }, { "epoch": 0.54, "grad_norm": 8.75, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -1.2274243831634521, "logits/rejected": -1.191007375717163, "logps/chosen": -544.41259765625, "logps/rejected": -638.9494018554688, "loss": 0.5467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.92881441116333, "rewards/margins": 0.869024932384491, "rewards/rejected": -3.7978389263153076, "step": 2070 }, { "epoch": 0.54, "grad_norm": 7.875, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -1.3184901475906372, "logits/rejected": -1.196045994758606, "logps/chosen": -512.1152954101562, "logps/rejected": -571.6860961914062, "loss": 0.5238, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.5673513412475586, "rewards/margins": 0.8982425928115845, "rewards/rejected": -3.4655938148498535, "step": 2080 }, { "epoch": 0.55, "grad_norm": 10.5625, "learning_rate": 2.527412999094507e-06, "logits/chosen": -1.3197405338287354, "logits/rejected": -1.1518932580947876, "logps/chosen": -544.2307739257812, "logps/rejected": -638.2955932617188, "loss": 0.4778, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.50474214553833, "rewards/margins": 1.0661590099334717, "rewards/rejected": -3.5709011554718018, "step": 2090 }, { "epoch": 0.55, "grad_norm": 10.0625, "learning_rate": 2.504568922200064e-06, "logits/chosen": -1.283879041671753, "logits/rejected": -1.1339181661605835, "logps/chosen": -479.8946838378906, "logps/rejected": -564.1932373046875, "loss": 0.5068, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.385560989379883, "rewards/margins": 1.0045907497406006, "rewards/rejected": -3.3901519775390625, "step": 2100 }, { "epoch": 0.55, "eval_logits/chosen": -1.2462238073349, "eval_logits/rejected": -1.125494360923767, "eval_logps/chosen": -512.4297485351562, "eval_logps/rejected": -591.323974609375, "eval_loss": 0.4939311146736145, "eval_rewards/accuracies": 0.7429999709129333, "eval_rewards/chosen": -2.4777767658233643, "eval_rewards/margins": 0.9893770217895508, "eval_rewards/rejected": -3.467153787612915, "eval_runtime": 385.17, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 2100 }, { "epoch": 0.55, "grad_norm": 9.6875, "learning_rate": 2.4817244638019333e-06, "logits/chosen": -1.3495204448699951, "logits/rejected": -1.1980758905410767, "logps/chosen": -514.2600708007812, "logps/rejected": -565.2801513671875, "loss": 0.5135, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.384481906890869, "rewards/margins": 0.922328770160675, "rewards/rejected": -3.3068108558654785, "step": 2110 }, { "epoch": 0.55, "grad_norm": 14.3125, "learning_rate": 2.4588815314058155e-06, "logits/chosen": -1.3099550008773804, "logits/rejected": -1.2511496543884277, "logps/chosen": -468.06011962890625, "logps/rejected": -523.5824584960938, "loss": 0.4817, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.224961757659912, "rewards/margins": 0.8980560302734375, "rewards/rejected": -3.1230177879333496, "step": 2120 }, { "epoch": 0.56, "grad_norm": 9.75, "learning_rate": 2.4360420323899922e-06, "logits/chosen": -1.353991985321045, "logits/rejected": -1.2306454181671143, "logps/chosen": -505.89434814453125, "logps/rejected": -550.9930419921875, "loss": 0.5674, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.3008170127868652, "rewards/margins": 0.7767833471298218, "rewards/rejected": -3.0776004791259766, "step": 2130 }, { "epoch": 0.56, "grad_norm": 8.0625, "learning_rate": 2.4132078738460585e-06, "logits/chosen": -1.3921695947647095, "logits/rejected": -1.2415539026260376, "logps/chosen": -491.42401123046875, "logps/rejected": -556.8810424804688, "loss": 0.4726, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2695276737213135, "rewards/margins": 1.0401204824447632, "rewards/rejected": -3.309648036956787, "step": 2140 }, { "epoch": 0.56, "grad_norm": 13.4375, "learning_rate": 2.3903809624196826e-06, "logits/chosen": -1.3411505222320557, "logits/rejected": -1.2057361602783203, "logps/chosen": -456.32452392578125, "logps/rejected": -508.445068359375, "loss": 0.5549, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2262067794799805, "rewards/margins": 0.8460358381271362, "rewards/rejected": -3.072242498397827, "step": 2150 }, { "epoch": 0.57, "grad_norm": 12.8125, "learning_rate": 2.3675632041513978e-06, "logits/chosen": -1.4614931344985962, "logits/rejected": -1.2260310649871826, "logps/chosen": -524.8610229492188, "logps/rejected": -565.4326171875, "loss": 0.4894, "rewards/accuracies": 0.75, "rewards/chosen": -2.378574848175049, "rewards/margins": 1.0525071620941162, "rewards/rejected": -3.431082248687744, "step": 2160 }, { "epoch": 0.57, "grad_norm": 12.1875, "learning_rate": 2.3447565043174533e-06, "logits/chosen": -1.3028042316436768, "logits/rejected": -1.1499183177947998, "logps/chosen": -515.6001586914062, "logps/rejected": -565.5277099609375, "loss": 0.5241, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5728909969329834, "rewards/margins": 0.8863977193832397, "rewards/rejected": -3.4592888355255127, "step": 2170 }, { "epoch": 0.57, "grad_norm": 10.3125, "learning_rate": 2.321962767270724e-06, "logits/chosen": -1.3512235879898071, "logits/rejected": -1.2086089849472046, "logps/chosen": -495.2906188964844, "logps/rejected": -538.8243408203125, "loss": 0.5573, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4533779621124268, "rewards/margins": 0.8070129156112671, "rewards/rejected": -3.2603907585144043, "step": 2180 }, { "epoch": 0.57, "grad_norm": 10.0, "learning_rate": 2.299183896281692e-06, "logits/chosen": -1.301710844039917, "logits/rejected": -1.1697108745574951, "logps/chosen": -466.3893127441406, "logps/rejected": -546.2555541992188, "loss": 0.524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1145732402801514, "rewards/margins": 0.8218411207199097, "rewards/rejected": -2.9364142417907715, "step": 2190 }, { "epoch": 0.58, "grad_norm": 7.34375, "learning_rate": 2.2764217933795297e-06, "logits/chosen": -1.4019851684570312, "logits/rejected": -1.2783384323120117, "logps/chosen": -460.39227294921875, "logps/rejected": -538.6397705078125, "loss": 0.4832, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9896026849746704, "rewards/margins": 0.9877565503120422, "rewards/rejected": -2.9773590564727783, "step": 2200 }, { "epoch": 0.58, "eval_logits/chosen": -1.289854884147644, "eval_logits/rejected": -1.1670362949371338, "eval_logps/chosen": -477.1521911621094, "eval_logps/rejected": -549.7868041992188, "eval_loss": 0.49245789647102356, "eval_rewards/accuracies": 0.7425000071525574, "eval_rewards/chosen": -2.125001907348633, "eval_rewards/margins": 0.9267801642417908, "eval_rewards/rejected": -3.05178165435791, "eval_runtime": 385.1303, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 2200 }, { "epoch": 0.58, "grad_norm": 5.1875, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -1.4467527866363525, "logits/rejected": -1.2898051738739014, "logps/chosen": -501.9493103027344, "logps/rejected": -568.07080078125, "loss": 0.5262, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.26446270942688, "rewards/margins": 0.842617392539978, "rewards/rejected": -3.1070799827575684, "step": 2210 }, { "epoch": 0.58, "grad_norm": 7.84375, "learning_rate": 2.230955492793149e-06, "logits/chosen": -1.2303822040557861, "logits/rejected": -1.1834524869918823, "logps/chosen": -536.91796875, "logps/rejected": -603.58203125, "loss": 0.5935, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6312527656555176, "rewards/margins": 0.7955335378646851, "rewards/rejected": -3.4267868995666504, "step": 2220 }, { "epoch": 0.58, "grad_norm": 5.6875, "learning_rate": 2.208255091531947e-06, "logits/chosen": -1.2445331811904907, "logits/rejected": -1.1615046262741089, "logps/chosen": -523.9738159179688, "logps/rejected": -601.7839965820312, "loss": 0.4818, "rewards/accuracies": 0.8125, "rewards/chosen": -2.469764232635498, "rewards/margins": 1.127774953842163, "rewards/rejected": -3.597539186477661, "step": 2230 }, { "epoch": 0.59, "grad_norm": 11.75, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -1.3009926080703735, "logits/rejected": -1.1936320066452026, "logps/chosen": -551.2839965820312, "logps/rejected": -635.8419799804688, "loss": 0.5122, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.5426268577575684, "rewards/margins": 1.0221750736236572, "rewards/rejected": -3.5648021697998047, "step": 2240 }, { "epoch": 0.59, "grad_norm": 7.0, "learning_rate": 2.162929264300107e-06, "logits/chosen": -1.313072919845581, "logits/rejected": -1.2196762561798096, "logps/chosen": -495.29840087890625, "logps/rejected": -598.8929443359375, "loss": 0.4195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2633450031280518, "rewards/margins": 1.2595245838165283, "rewards/rejected": -3.52286958694458, "step": 2250 }, { "epoch": 0.59, "grad_norm": 12.0, "learning_rate": 2.1403076230230006e-06, "logits/chosen": -1.2646925449371338, "logits/rejected": -1.1446959972381592, "logps/chosen": -531.4093017578125, "logps/rejected": -583.4620971679688, "loss": 0.587, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6022684574127197, "rewards/margins": 0.7986178994178772, "rewards/rejected": -3.4008865356445312, "step": 2260 }, { "epoch": 0.59, "grad_norm": 11.625, "learning_rate": 2.11771601595586e-06, "logits/chosen": -1.3460241556167603, "logits/rejected": -1.232742428779602, "logps/chosen": -530.1009521484375, "logps/rejected": -569.1173095703125, "loss": 0.5295, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.4805283546447754, "rewards/margins": 0.9129024744033813, "rewards/rejected": -3.3934311866760254, "step": 2270 }, { "epoch": 0.6, "grad_norm": 16.625, "learning_rate": 2.0951563294913737e-06, "logits/chosen": -1.344582438468933, "logits/rejected": -1.1410505771636963, "logps/chosen": -493.6297912597656, "logps/rejected": -556.1669921875, "loss": 0.4651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3209660053253174, "rewards/margins": 0.930306613445282, "rewards/rejected": -3.251272678375244, "step": 2280 }, { "epoch": 0.6, "grad_norm": 7.59375, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -1.3250610828399658, "logits/rejected": -1.207024097442627, "logps/chosen": -501.9657287597656, "logps/rejected": -565.3271484375, "loss": 0.4841, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4302685260772705, "rewards/margins": 0.9568912386894226, "rewards/rejected": -3.387159824371338, "step": 2290 }, { "epoch": 0.6, "grad_norm": 10.25, "learning_rate": 2.050140250457023e-06, "logits/chosen": -1.4138681888580322, "logits/rejected": -1.1992824077606201, "logps/chosen": -557.7728881835938, "logps/rejected": -629.088623046875, "loss": 0.4731, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.802109479904175, "rewards/margins": 1.0537548065185547, "rewards/rejected": -3.8558642864227295, "step": 2300 }, { "epoch": 0.6, "eval_logits/chosen": -1.2155396938323975, "eval_logits/rejected": -1.095304250717163, "eval_logps/chosen": -552.5741577148438, "eval_logps/rejected": -645.44482421875, "eval_loss": 0.49232217669487, "eval_rewards/accuracies": 0.7434999942779541, "eval_rewards/chosen": -2.8792214393615723, "eval_rewards/margins": 1.129140853881836, "eval_rewards/rejected": -4.008362770080566, "eval_runtime": 385.2143, "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 2300 }, { "epoch": 0.6, "grad_norm": 14.0625, "learning_rate": 2.0276876167168042e-06, "logits/chosen": -1.1646645069122314, "logits/rejected": -1.0743215084075928, "logps/chosen": -514.7222900390625, "logps/rejected": -580.4427490234375, "loss": 0.5834, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9022274017333984, "rewards/margins": 0.9829545021057129, "rewards/rejected": -3.8851819038391113, "step": 2310 }, { "epoch": 0.61, "grad_norm": 8.8125, "learning_rate": 2.0052744209246682e-06, "logits/chosen": -1.3135536909103394, "logits/rejected": -1.1998984813690186, "logps/chosen": -542.7693481445312, "logps/rejected": -606.01123046875, "loss": 0.5182, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.874311685562134, "rewards/margins": 0.9611810445785522, "rewards/rejected": -3.8354930877685547, "step": 2320 }, { "epoch": 0.61, "grad_norm": 9.625, "learning_rate": 1.9829025345760127e-06, "logits/chosen": -1.3124678134918213, "logits/rejected": -1.2832801342010498, "logps/chosen": -549.1907958984375, "logps/rejected": -632.5858764648438, "loss": 0.5333, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7018771171569824, "rewards/margins": 0.8947007060050964, "rewards/rejected": -3.5965774059295654, "step": 2330 }, { "epoch": 0.61, "grad_norm": 7.65625, "learning_rate": 1.9605738257169115e-06, "logits/chosen": -1.2838572263717651, "logits/rejected": -1.117290735244751, "logps/chosen": -497.5326232910156, "logps/rejected": -604.8740234375, "loss": 0.4837, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6455423831939697, "rewards/margins": 1.1532337665557861, "rewards/rejected": -3.798776149749756, "step": 2340 }, { "epoch": 0.62, "grad_norm": 9.9375, "learning_rate": 1.9382901587881275e-06, "logits/chosen": -1.3377434015274048, "logits/rejected": -1.2184029817581177, "logps/chosen": -514.0582275390625, "logps/rejected": -602.654052734375, "loss": 0.4292, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5530195236206055, "rewards/margins": 1.208957552909851, "rewards/rejected": -3.761976957321167, "step": 2350 }, { "epoch": 0.62, "grad_norm": 10.5625, "learning_rate": 1.916053394469437e-06, "logits/chosen": -1.3620846271514893, "logits/rejected": -1.1589324474334717, "logps/chosen": -535.8505859375, "logps/rejected": -625.4491577148438, "loss": 0.5293, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.725583791732788, "rewards/margins": 1.0414365530014038, "rewards/rejected": -3.7670199871063232, "step": 2360 }, { "epoch": 0.62, "grad_norm": 11.3125, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -1.3228267431259155, "logits/rejected": -1.1428587436676025, "logps/chosen": -536.3853759765625, "logps/rejected": -627.5452880859375, "loss": 0.441, "rewards/accuracies": 0.8125, "rewards/chosen": -2.720797300338745, "rewards/margins": 1.1999356746673584, "rewards/rejected": -3.9207332134246826, "step": 2370 }, { "epoch": 0.62, "grad_norm": 10.125, "learning_rate": 1.8717279966446267e-06, "logits/chosen": -1.1800651550292969, "logits/rejected": -1.102126955986023, "logps/chosen": -539.4421997070312, "logps/rejected": -641.0511474609375, "loss": 0.4566, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.90867018699646, "rewards/margins": 1.1115381717681885, "rewards/rejected": -4.020208358764648, "step": 2380 }, { "epoch": 0.63, "grad_norm": 10.125, "learning_rate": 1.8496430642964698e-06, "logits/chosen": -1.258175015449524, "logits/rejected": -1.1534559726715088, "logps/chosen": -557.5374755859375, "logps/rejected": -637.3475341796875, "loss": 0.51, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8512537479400635, "rewards/margins": 1.0346183776855469, "rewards/rejected": -3.8858723640441895, "step": 2390 }, { "epoch": 0.63, "grad_norm": 8.75, "learning_rate": 1.827612436565286e-06, "logits/chosen": -1.2754342555999756, "logits/rejected": -1.123130440711975, "logps/chosen": -543.8443603515625, "logps/rejected": -633.3651123046875, "loss": 0.4782, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.808797836303711, "rewards/margins": 1.1008532047271729, "rewards/rejected": -3.909651279449463, "step": 2400 }, { "epoch": 0.63, "eval_logits/chosen": -1.1977647542953491, "eval_logits/rejected": -1.0794349908828735, "eval_logps/chosen": -549.680419921875, "eval_logps/rejected": -637.0914306640625, "eval_loss": 0.4923146665096283, "eval_rewards/accuracies": 0.7419999837875366, "eval_rewards/chosen": -2.8502840995788574, "eval_rewards/margins": 1.0745435953140259, "eval_rewards/rejected": -3.9248275756835938, "eval_runtime": 385.0636, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 2400 }, { "epoch": 0.63, "grad_norm": 13.875, "learning_rate": 1.8056379530021492e-06, "logits/chosen": -1.3143008947372437, "logits/rejected": -1.2356500625610352, "logps/chosen": -539.9703979492188, "logps/rejected": -599.3643188476562, "loss": 0.5312, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9577202796936035, "rewards/margins": 0.8420518040657043, "rewards/rejected": -3.799771785736084, "step": 2410 }, { "epoch": 0.63, "grad_norm": 11.0625, "learning_rate": 1.7837214484701154e-06, "logits/chosen": -1.3325443267822266, "logits/rejected": -1.2115572690963745, "logps/chosen": -515.3961181640625, "logps/rejected": -601.1583862304688, "loss": 0.4782, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.630460262298584, "rewards/margins": 1.091335415840149, "rewards/rejected": -3.7217955589294434, "step": 2420 }, { "epoch": 0.64, "grad_norm": 13.9375, "learning_rate": 1.7618647529910043e-06, "logits/chosen": -1.3422627449035645, "logits/rejected": -1.2155346870422363, "logps/chosen": -517.1422119140625, "logps/rejected": -613.8555908203125, "loss": 0.5001, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6036880016326904, "rewards/margins": 1.0863001346588135, "rewards/rejected": -3.689988613128662, "step": 2430 }, { "epoch": 0.64, "grad_norm": 9.25, "learning_rate": 1.7400696915925996e-06, "logits/chosen": -1.3564714193344116, "logits/rejected": -1.1683833599090576, "logps/chosen": -539.3397216796875, "logps/rejected": -584.2203979492188, "loss": 0.5162, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.687293529510498, "rewards/margins": 1.0315442085266113, "rewards/rejected": -3.7188377380371094, "step": 2440 }, { "epoch": 0.64, "grad_norm": 11.125, "learning_rate": 1.718338084156254e-06, "logits/chosen": -1.3139379024505615, "logits/rejected": -1.1639807224273682, "logps/chosen": -541.3829956054688, "logps/rejected": -613.3155517578125, "loss": 0.4505, "rewards/accuracies": 0.78125, "rewards/chosen": -2.545919179916382, "rewards/margins": 1.1031758785247803, "rewards/rejected": -3.649095058441162, "step": 2450 }, { "epoch": 0.64, "grad_norm": 10.8125, "learning_rate": 1.6966717452649372e-06, "logits/chosen": -1.4163377285003662, "logits/rejected": -1.2610633373260498, "logps/chosen": -529.8837890625, "logps/rejected": -588.6536254882812, "loss": 0.4533, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.5294137001037598, "rewards/margins": 1.1063209772109985, "rewards/rejected": -3.6357345581054688, "step": 2460 }, { "epoch": 0.65, "grad_norm": 7.78125, "learning_rate": 1.6750724840517103e-06, "logits/chosen": -1.3619472980499268, "logits/rejected": -1.2863503694534302, "logps/chosen": -506.430908203125, "logps/rejected": -603.09228515625, "loss": 0.5196, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.5362462997436523, "rewards/margins": 0.925518810749054, "rewards/rejected": -3.4617652893066406, "step": 2470 }, { "epoch": 0.65, "grad_norm": 14.375, "learning_rate": 1.6535421040486686e-06, "logits/chosen": -1.175429105758667, "logits/rejected": -1.0819575786590576, "logps/chosen": -522.57373046875, "logps/rejected": -610.5762939453125, "loss": 0.4362, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.69221830368042, "rewards/margins": 1.2225408554077148, "rewards/rejected": -3.914759874343872, "step": 2480 }, { "epoch": 0.65, "grad_norm": 12.25, "learning_rate": 1.6320824030363458e-06, "logits/chosen": -1.2581863403320312, "logits/rejected": -1.1994664669036865, "logps/chosen": -505.96783447265625, "logps/rejected": -609.1953735351562, "loss": 0.4515, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7113680839538574, "rewards/margins": 1.186835527420044, "rewards/rejected": -3.8982033729553223, "step": 2490 }, { "epoch": 0.65, "grad_norm": 13.75, "learning_rate": 1.6106951728936028e-06, "logits/chosen": -1.3734843730926514, "logits/rejected": -1.2433956861495972, "logps/chosen": -518.4763793945312, "logps/rejected": -614.0827026367188, "loss": 0.4983, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6127266883850098, "rewards/margins": 1.034812569618225, "rewards/rejected": -3.6475391387939453, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": -1.2522040605545044, "eval_logits/rejected": -1.1292414665222168, "eval_logps/chosen": -521.7777709960938, "eval_logps/rejected": -610.1890258789062, "eval_loss": 0.49058130383491516, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -2.5712568759918213, "eval_rewards/margins": 1.0845470428466797, "eval_rewards/rejected": -3.655803918838501, "eval_runtime": 384.7732, "eval_samples_per_second": 5.198, "eval_steps_per_second": 0.65, "step": 2500 }, { "epoch": 0.66, "grad_norm": 8.4375, "learning_rate": 1.5893821994479996e-06, "logits/chosen": -1.372878909111023, "logits/rejected": -1.2597870826721191, "logps/chosen": -519.8887939453125, "logps/rejected": -593.8539428710938, "loss": 0.476, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.424806594848633, "rewards/margins": 1.1252799034118652, "rewards/rejected": -3.550086259841919, "step": 2510 }, { "epoch": 0.66, "grad_norm": 9.0625, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -1.347572684288025, "logits/rejected": -1.115192174911499, "logps/chosen": -546.6536254882812, "logps/rejected": -608.205078125, "loss": 0.478, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5760815143585205, "rewards/margins": 1.198232889175415, "rewards/rejected": -3.7743141651153564, "step": 2520 }, { "epoch": 0.66, "grad_norm": 7.9375, "learning_rate": 1.5469861348078014e-06, "logits/chosen": -1.3562158346176147, "logits/rejected": -1.2117723226547241, "logps/chosen": -505.29254150390625, "logps/rejected": -614.58251953125, "loss": 0.4407, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.5852229595184326, "rewards/margins": 1.1992766857147217, "rewards/rejected": -3.784499406814575, "step": 2530 }, { "epoch": 0.66, "grad_norm": 10.3125, "learning_rate": 1.5259065836724035e-06, "logits/chosen": -1.2109121084213257, "logits/rejected": -1.152276635169983, "logps/chosen": -509.5875549316406, "logps/rejected": -634.964111328125, "loss": 0.4268, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6477839946746826, "rewards/margins": 1.3151264190673828, "rewards/rejected": -3.9629104137420654, "step": 2540 }, { "epoch": 0.67, "grad_norm": 19.375, "learning_rate": 1.5049083690569456e-06, "logits/chosen": -1.2700594663619995, "logits/rejected": -1.166520118713379, "logps/chosen": -509.182861328125, "logps/rejected": -621.1192626953125, "loss": 0.5163, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.71712589263916, "rewards/margins": 1.1634531021118164, "rewards/rejected": -3.8805785179138184, "step": 2550 }, { "epoch": 0.67, "grad_norm": 16.5, "learning_rate": 1.4839932443063057e-06, "logits/chosen": -1.275468349456787, "logits/rejected": -1.1098088026046753, "logps/chosen": -555.3331909179688, "logps/rejected": -615.7780151367188, "loss": 0.4743, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6335442066192627, "rewards/margins": 1.167301058769226, "rewards/rejected": -3.8008453845977783, "step": 2560 }, { "epoch": 0.67, "grad_norm": 18.0, "learning_rate": 1.4631629558273803e-06, "logits/chosen": -1.2889525890350342, "logits/rejected": -1.1872795820236206, "logps/chosen": -510.55615234375, "logps/rejected": -586.6162109375, "loss": 0.6102, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.688310146331787, "rewards/margins": 0.8770621418952942, "rewards/rejected": -3.5653719902038574, "step": 2570 }, { "epoch": 0.68, "grad_norm": 6.71875, "learning_rate": 1.4424192429432657e-06, "logits/chosen": -1.359438419342041, "logits/rejected": -1.2795076370239258, "logps/chosen": -480.8011779785156, "logps/rejected": -599.15966796875, "loss": 0.4647, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.249246120452881, "rewards/margins": 1.150412917137146, "rewards/rejected": -3.3996593952178955, "step": 2580 }, { "epoch": 0.68, "grad_norm": 13.1875, "learning_rate": 1.421763837748016e-06, "logits/chosen": -1.326791763305664, "logits/rejected": -1.2331459522247314, "logps/chosen": -485.2764587402344, "logps/rejected": -594.4434814453125, "loss": 0.4524, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.346505641937256, "rewards/margins": 1.1767139434814453, "rewards/rejected": -3.523219585418701, "step": 2590 }, { "epoch": 0.68, "grad_norm": 10.8125, "learning_rate": 1.401198464962021e-06, "logits/chosen": -1.3617570400238037, "logits/rejected": -1.1875524520874023, "logps/chosen": -524.5842895507812, "logps/rejected": -588.7896728515625, "loss": 0.4746, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5600085258483887, "rewards/margins": 1.0283123254776, "rewards/rejected": -3.5883209705352783, "step": 2600 }, { "epoch": 0.68, "eval_logits/chosen": -1.2491270303726196, "eval_logits/rejected": -1.1266547441482544, "eval_logps/chosen": -523.2234497070312, "eval_logps/rejected": -616.9339599609375, "eval_loss": 0.4946673512458801, "eval_rewards/accuracies": 0.7365000247955322, "eval_rewards/chosen": -2.585714340209961, "eval_rewards/margins": 1.1375384330749512, "eval_rewards/rejected": -3.723253011703491, "eval_runtime": 385.1919, "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 2600 }, { "epoch": 0.68, "grad_norm": 10.1875, "learning_rate": 1.3807248417879896e-06, "logits/chosen": -1.3990509510040283, "logits/rejected": -1.2910901308059692, "logps/chosen": -524.749267578125, "logps/rejected": -631.2271728515625, "loss": 0.445, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5168325901031494, "rewards/margins": 1.2660022974014282, "rewards/rejected": -3.782834529876709, "step": 2610 }, { "epoch": 0.69, "grad_norm": 25.875, "learning_rate": 1.3603446777675665e-06, "logits/chosen": -1.2434417009353638, "logits/rejected": -1.1283738613128662, "logps/chosen": -539.6519165039062, "logps/rejected": -630.5535888671875, "loss": 0.5282, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7645440101623535, "rewards/margins": 1.129831314086914, "rewards/rejected": -3.8943753242492676, "step": 2620 }, { "epoch": 0.69, "grad_norm": 7.5625, "learning_rate": 1.3400596746385817e-06, "logits/chosen": -1.3770835399627686, "logits/rejected": -1.216672658920288, "logps/chosen": -541.1361694335938, "logps/rejected": -622.9951171875, "loss": 0.5016, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7195262908935547, "rewards/margins": 1.0894376039505005, "rewards/rejected": -3.8089637756347656, "step": 2630 }, { "epoch": 0.69, "grad_norm": 9.3125, "learning_rate": 1.3198715261929587e-06, "logits/chosen": -1.344639539718628, "logits/rejected": -1.1973941326141357, "logps/chosen": -521.10888671875, "logps/rejected": -628.2103881835938, "loss": 0.4222, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8465256690979004, "rewards/margins": 1.2073593139648438, "rewards/rejected": -4.053884983062744, "step": 2640 }, { "epoch": 0.69, "grad_norm": 7.96875, "learning_rate": 1.2997819181352823e-06, "logits/chosen": -1.3569964170455933, "logits/rejected": -1.2025775909423828, "logps/chosen": -566.9078369140625, "logps/rejected": -691.9054565429688, "loss": 0.4043, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7007460594177246, "rewards/margins": 1.4950422048568726, "rewards/rejected": -4.195788383483887, "step": 2650 }, { "epoch": 0.7, "grad_norm": 16.375, "learning_rate": 1.2797925279420454e-06, "logits/chosen": -1.3312625885009766, "logits/rejected": -1.1907614469528198, "logps/chosen": -577.4212646484375, "logps/rejected": -690.229248046875, "loss": 0.4911, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.055377244949341, "rewards/margins": 1.2482696771621704, "rewards/rejected": -4.303646564483643, "step": 2660 }, { "epoch": 0.7, "grad_norm": 16.875, "learning_rate": 1.2599050247215764e-06, "logits/chosen": -1.2753608226776123, "logits/rejected": -1.1736326217651367, "logps/chosen": -555.798828125, "logps/rejected": -654.16357421875, "loss": 0.4766, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.978670835494995, "rewards/margins": 1.214051365852356, "rewards/rejected": -4.192722320556641, "step": 2670 }, { "epoch": 0.7, "grad_norm": 12.0, "learning_rate": 1.2401210690746705e-06, "logits/chosen": -1.3060388565063477, "logits/rejected": -1.1588232517242432, "logps/chosen": -556.8359985351562, "logps/rejected": -636.2288818359375, "loss": 0.5018, "rewards/accuracies": 0.75, "rewards/chosen": -2.8967931270599365, "rewards/margins": 1.125984787940979, "rewards/rejected": -4.022777557373047, "step": 2680 }, { "epoch": 0.7, "grad_norm": 10.625, "learning_rate": 1.2204423129559306e-06, "logits/chosen": -1.3615459203720093, "logits/rejected": -1.3014076948165894, "logps/chosen": -538.6215209960938, "logps/rejected": -644.1961669921875, "loss": 0.5168, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.731085777282715, "rewards/margins": 1.0968948602676392, "rewards/rejected": -3.8279807567596436, "step": 2690 }, { "epoch": 0.71, "grad_norm": 15.1875, "learning_rate": 1.20087039953583e-06, "logits/chosen": -1.375808596611023, "logits/rejected": -1.252746820449829, "logps/chosen": -531.059326171875, "logps/rejected": -624.744140625, "loss": 0.514, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.668408155441284, "rewards/margins": 1.2005492448806763, "rewards/rejected": -3.86895751953125, "step": 2700 }, { "epoch": 0.71, "eval_logits/chosen": -1.2462804317474365, "eval_logits/rejected": -1.1248236894607544, "eval_logps/chosen": -534.3994140625, "eval_logps/rejected": -625.0958251953125, "eval_loss": 0.4923916161060333, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -2.6974740028381348, "eval_rewards/margins": 1.1073981523513794, "eval_rewards/rejected": -3.8048720359802246, "eval_runtime": 385.0439, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 2700 }, { "epoch": 0.71, "grad_norm": 13.125, "learning_rate": 1.181406963063507e-06, "logits/chosen": -1.2778210639953613, "logits/rejected": -1.228360652923584, "logps/chosen": -523.0855102539062, "logps/rejected": -629.9219970703125, "loss": 0.5097, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.572385787963867, "rewards/margins": 1.0744374990463257, "rewards/rejected": -3.6468231678009033, "step": 2710 }, { "epoch": 0.71, "grad_norm": 6.6875, "learning_rate": 1.1620536287303052e-06, "logits/chosen": -1.3865063190460205, "logits/rejected": -1.2557927370071411, "logps/chosen": -545.7744750976562, "logps/rejected": -609.2724609375, "loss": 0.5395, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.5462071895599365, "rewards/margins": 0.9365339279174805, "rewards/rejected": -3.482741117477417, "step": 2720 }, { "epoch": 0.71, "grad_norm": 9.3125, "learning_rate": 1.1428120125340717e-06, "logits/chosen": -1.3251538276672363, "logits/rejected": -1.1808980703353882, "logps/chosen": -494.53924560546875, "logps/rejected": -603.8756103515625, "loss": 0.3923, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.404003143310547, "rewards/margins": 1.5079169273376465, "rewards/rejected": -3.9119198322296143, "step": 2730 }, { "epoch": 0.72, "grad_norm": 10.125, "learning_rate": 1.123683721144223e-06, "logits/chosen": -1.319456696510315, "logits/rejected": -1.213781714439392, "logps/chosen": -539.8772583007812, "logps/rejected": -638.1966552734375, "loss": 0.44, "rewards/accuracies": 0.75, "rewards/chosen": -2.5620384216308594, "rewards/margins": 1.3065942525863647, "rewards/rejected": -3.8686325550079346, "step": 2740 }, { "epoch": 0.72, "grad_norm": 6.25, "learning_rate": 1.1046703517675848e-06, "logits/chosen": -1.3422720432281494, "logits/rejected": -1.2605860233306885, "logps/chosen": -512.2991943359375, "logps/rejected": -620.3077392578125, "loss": 0.522, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.5718300342559814, "rewards/margins": 1.018854022026062, "rewards/rejected": -3.590684175491333, "step": 2750 }, { "epoch": 0.72, "grad_norm": 10.75, "learning_rate": 1.085773492015028e-06, "logits/chosen": -1.3229783773422241, "logits/rejected": -1.1519359350204468, "logps/chosen": -497.25701904296875, "logps/rejected": -590.8815307617188, "loss": 0.4271, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.508775472640991, "rewards/margins": 1.2793452739715576, "rewards/rejected": -3.788120985031128, "step": 2760 }, { "epoch": 0.72, "grad_norm": 32.0, "learning_rate": 1.0669947197689034e-06, "logits/chosen": -1.2877874374389648, "logits/rejected": -1.1616923809051514, "logps/chosen": -543.9298095703125, "logps/rejected": -625.6560668945312, "loss": 0.487, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.712940216064453, "rewards/margins": 1.1042835712432861, "rewards/rejected": -3.8172237873077393, "step": 2770 }, { "epoch": 0.73, "grad_norm": 9.4375, "learning_rate": 1.048335603051291e-06, "logits/chosen": -1.282389521598816, "logits/rejected": -1.1512477397918701, "logps/chosen": -572.5489501953125, "logps/rejected": -676.9873046875, "loss": 0.4351, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8474509716033936, "rewards/margins": 1.3263962268829346, "rewards/rejected": -4.173847198486328, "step": 2780 }, { "epoch": 0.73, "grad_norm": 9.0, "learning_rate": 1.0297976998930665e-06, "logits/chosen": -1.2781507968902588, "logits/rejected": -1.1678388118743896, "logps/chosen": -534.2879638671875, "logps/rejected": -643.2774047851562, "loss": 0.4393, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7367353439331055, "rewards/margins": 1.3677116632461548, "rewards/rejected": -4.104446887969971, "step": 2790 }, { "epoch": 0.73, "grad_norm": 8.4375, "learning_rate": 1.0113825582038078e-06, "logits/chosen": -1.3029879331588745, "logits/rejected": -1.196803092956543, "logps/chosen": -556.0444946289062, "logps/rejected": -652.0103149414062, "loss": 0.4662, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9077467918395996, "rewards/margins": 1.127124547958374, "rewards/rejected": -4.0348711013793945, "step": 2800 }, { "epoch": 0.73, "eval_logits/chosen": -1.2345499992370605, "eval_logits/rejected": -1.1134350299835205, "eval_logps/chosen": -547.6557006835938, "eval_logps/rejected": -641.2913208007812, "eval_loss": 0.4899207055568695, "eval_rewards/accuracies": 0.7379999756813049, "eval_rewards/chosen": -2.830036163330078, "eval_rewards/margins": 1.1367909908294678, "eval_rewards/rejected": -3.966827154159546, "eval_runtime": 384.9651, "eval_samples_per_second": 5.195, "eval_steps_per_second": 0.649, "step": 2800 }, { "epoch": 0.74, "grad_norm": 9.5625, "learning_rate": 9.930917156425477e-07, "logits/chosen": -1.2949634790420532, "logits/rejected": -1.183593988418579, "logps/chosen": -563.5440673828125, "logps/rejected": -668.3243408203125, "loss": 0.5295, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0242040157318115, "rewards/margins": 1.0766557455062866, "rewards/rejected": -4.100859642028809, "step": 2810 }, { "epoch": 0.74, "grad_norm": 19.0, "learning_rate": 9.749266994893756e-07, "logits/chosen": -1.2192089557647705, "logits/rejected": -1.0985405445098877, "logps/chosen": -531.9083251953125, "logps/rejected": -606.6322021484375, "loss": 0.5603, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9121012687683105, "rewards/margins": 0.8815471529960632, "rewards/rejected": -3.7936484813690186, "step": 2820 }, { "epoch": 0.74, "grad_norm": 12.125, "learning_rate": 9.56889026517913e-07, "logits/chosen": -1.2642897367477417, "logits/rejected": -1.1569067239761353, "logps/chosen": -561.394287109375, "logps/rejected": -641.754638671875, "loss": 0.5072, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0005598068237305, "rewards/margins": 1.063594102859497, "rewards/rejected": -4.064153671264648, "step": 2830 }, { "epoch": 0.74, "grad_norm": 7.40625, "learning_rate": 9.389802028686617e-07, "logits/chosen": -1.3579823970794678, "logits/rejected": -1.2555077075958252, "logps/chosen": -551.67626953125, "logps/rejected": -596.185546875, "loss": 0.5982, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.90908145904541, "rewards/margins": 0.7748203277587891, "rewards/rejected": -3.6839020252227783, "step": 2840 }, { "epoch": 0.75, "grad_norm": 9.75, "learning_rate": 9.212017239232427e-07, "logits/chosen": -1.2956401109695435, "logits/rejected": -1.1352595090866089, "logps/chosen": -550.6188354492188, "logps/rejected": -647.8556518554688, "loss": 0.4704, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7766430377960205, "rewards/margins": 1.2017085552215576, "rewards/rejected": -3.97835111618042, "step": 2850 }, { "epoch": 0.75, "grad_norm": 9.875, "learning_rate": 9.03555074179533e-07, "logits/chosen": -1.2600593566894531, "logits/rejected": -1.2393784523010254, "logps/chosen": -524.3843994140625, "logps/rejected": -654.7698364257812, "loss": 0.4337, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.6760458946228027, "rewards/margins": 1.2455599308013916, "rewards/rejected": -3.921605348587036, "step": 2860 }, { "epoch": 0.75, "grad_norm": 20.5, "learning_rate": 8.860417271277067e-07, "logits/chosen": -1.3854873180389404, "logits/rejected": -1.3558924198150635, "logps/chosen": -545.82568359375, "logps/rejected": -628.0182495117188, "loss": 0.4992, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.765906810760498, "rewards/margins": 0.901807963848114, "rewards/rejected": -3.6677145957946777, "step": 2870 }, { "epoch": 0.75, "grad_norm": 10.125, "learning_rate": 8.686631451272029e-07, "logits/chosen": -1.3561471700668335, "logits/rejected": -1.2010211944580078, "logps/chosen": -551.3495483398438, "logps/rejected": -639.118896484375, "loss": 0.5022, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.979217052459717, "rewards/margins": 1.1320674419403076, "rewards/rejected": -4.1112847328186035, "step": 2880 }, { "epoch": 0.76, "grad_norm": 9.3125, "learning_rate": 8.514207792846168e-07, "logits/chosen": -1.3641732931137085, "logits/rejected": -1.2438944578170776, "logps/chosen": -541.0029296875, "logps/rejected": -626.8678588867188, "loss": 0.487, "rewards/accuracies": 0.75, "rewards/chosen": -2.901430130004883, "rewards/margins": 1.142988681793213, "rewards/rejected": -4.044418811798096, "step": 2890 }, { "epoch": 0.76, "grad_norm": 8.5, "learning_rate": 8.343160693325356e-07, "logits/chosen": -1.2573918104171753, "logits/rejected": -1.1431939601898193, "logps/chosen": -554.5100708007812, "logps/rejected": -662.68212890625, "loss": 0.5111, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.98957896232605, "rewards/margins": 1.1248613595962524, "rewards/rejected": -4.11444091796875, "step": 2900 }, { "epoch": 0.76, "eval_logits/chosen": -1.2396172285079956, "eval_logits/rejected": -1.1188315153121948, "eval_logps/chosen": -558.570556640625, "eval_logps/rejected": -650.9627075195312, "eval_loss": 0.48732802271842957, "eval_rewards/accuracies": 0.7404999732971191, "eval_rewards/chosen": -2.9391860961914062, "eval_rewards/margins": 1.1243551969528198, "eval_rewards/rejected": -4.063540935516357, "eval_runtime": 385.3295, "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 2900 }, { "epoch": 0.76, "grad_norm": 7.8125, "learning_rate": 8.173504435093174e-07, "logits/chosen": -1.252179741859436, "logits/rejected": -1.0778075456619263, "logps/chosen": -531.073974609375, "logps/rejected": -619.1007690429688, "loss": 0.4851, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.895054817199707, "rewards/margins": 1.2014925479888916, "rewards/rejected": -4.096547603607178, "step": 2910 }, { "epoch": 0.76, "grad_norm": 6.84375, "learning_rate": 8.00525318439836e-07, "logits/chosen": -1.2942620515823364, "logits/rejected": -1.1525405645370483, "logps/chosen": -569.043701171875, "logps/rejected": -657.7420043945312, "loss": 0.5304, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9221444129943848, "rewards/margins": 0.9633318185806274, "rewards/rejected": -3.8854763507843018, "step": 2920 }, { "epoch": 0.77, "grad_norm": 7.53125, "learning_rate": 7.838420990171927e-07, "logits/chosen": -1.3769783973693848, "logits/rejected": -1.217556357383728, "logps/chosen": -552.2919921875, "logps/rejected": -631.7188720703125, "loss": 0.5073, "rewards/accuracies": 0.75, "rewards/chosen": -2.8292160034179688, "rewards/margins": 1.050167202949524, "rewards/rejected": -3.879383087158203, "step": 2930 }, { "epoch": 0.77, "grad_norm": 9.5, "learning_rate": 7.673021782854084e-07, "logits/chosen": -1.2488492727279663, "logits/rejected": -1.1089154481887817, "logps/chosen": -549.6131591796875, "logps/rejected": -629.2005615234375, "loss": 0.4792, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8892455101013184, "rewards/margins": 1.214680790901184, "rewards/rejected": -4.103926658630371, "step": 2940 }, { "epoch": 0.77, "grad_norm": 11.75, "learning_rate": 7.509069373231039e-07, "logits/chosen": -1.259916067123413, "logits/rejected": -1.1467456817626953, "logps/chosen": -547.0595092773438, "logps/rejected": -607.7587280273438, "loss": 0.5723, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9437592029571533, "rewards/margins": 0.854836106300354, "rewards/rejected": -3.798595428466797, "step": 2950 }, { "epoch": 0.77, "grad_norm": 7.34375, "learning_rate": 7.346577451281822e-07, "logits/chosen": -1.275743007659912, "logits/rejected": -1.1921640634536743, "logps/chosen": -545.425537109375, "logps/rejected": -653.1339111328125, "loss": 0.4519, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.832059383392334, "rewards/margins": 1.3181660175323486, "rewards/rejected": -4.150225639343262, "step": 2960 }, { "epoch": 0.78, "grad_norm": 12.25, "learning_rate": 7.185559585035138e-07, "logits/chosen": -1.3098807334899902, "logits/rejected": -1.1533119678497314, "logps/chosen": -584.9642333984375, "logps/rejected": -682.4730224609375, "loss": 0.4797, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.021878957748413, "rewards/margins": 1.133847951889038, "rewards/rejected": -4.155727386474609, "step": 2970 }, { "epoch": 0.78, "grad_norm": 8.625, "learning_rate": 7.026029219436504e-07, "logits/chosen": -1.3365461826324463, "logits/rejected": -1.1761207580566406, "logps/chosen": -542.1203002929688, "logps/rejected": -646.118896484375, "loss": 0.4723, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8982977867126465, "rewards/margins": 1.1679728031158447, "rewards/rejected": -4.0662713050842285, "step": 2980 }, { "epoch": 0.78, "grad_norm": 7.0, "learning_rate": 6.867999675225523e-07, "logits/chosen": -1.3771815299987793, "logits/rejected": -1.2472676038742065, "logps/chosen": -512.2825317382812, "logps/rejected": -608.7750854492188, "loss": 0.487, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8136465549468994, "rewards/margins": 1.1070338487625122, "rewards/rejected": -3.920680284500122, "step": 2990 }, { "epoch": 0.79, "grad_norm": 10.6875, "learning_rate": 6.711484147823663e-07, "logits/chosen": -1.2860959768295288, "logits/rejected": -1.2111051082611084, "logps/chosen": -506.64581298828125, "logps/rejected": -628.4481811523438, "loss": 0.4758, "rewards/accuracies": 0.75, "rewards/chosen": -2.7242074012756348, "rewards/margins": 1.1695196628570557, "rewards/rejected": -3.8937268257141113, "step": 3000 }, { "epoch": 0.79, "eval_logits/chosen": -1.2526096105575562, "eval_logits/rejected": -1.1318107843399048, "eval_logps/chosen": -550.865478515625, "eval_logps/rejected": -638.7723999023438, "eval_loss": 0.4866448938846588, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -2.8621349334716797, "eval_rewards/margins": 1.079501986503601, "eval_rewards/rejected": -3.9416370391845703, "eval_runtime": 385.0884, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 3000 }, { "epoch": 0.79, "grad_norm": 10.625, "learning_rate": 6.556495706232413e-07, "logits/chosen": -1.2896664142608643, "logits/rejected": -1.1979024410247803, "logps/chosen": -560.0714721679688, "logps/rejected": -646.5289916992188, "loss": 0.5296, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9137609004974365, "rewards/margins": 1.0487867593765259, "rewards/rejected": -3.9625473022460938, "step": 3010 }, { "epoch": 0.79, "grad_norm": 9.6875, "learning_rate": 6.403047291942057e-07, "logits/chosen": -1.2192307710647583, "logits/rejected": -1.0712454319000244, "logps/chosen": -515.818115234375, "logps/rejected": -601.6507568359375, "loss": 0.4944, "rewards/accuracies": 0.75, "rewards/chosen": -2.895503520965576, "rewards/margins": 1.1022310256958008, "rewards/rejected": -3.997734785079956, "step": 3020 }, { "epoch": 0.79, "grad_norm": 12.6875, "learning_rate": 6.251151717851023e-07, "logits/chosen": -1.2880637645721436, "logits/rejected": -1.2091928720474243, "logps/chosen": -509.5738220214844, "logps/rejected": -608.5218505859375, "loss": 0.4853, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7617316246032715, "rewards/margins": 1.1223886013031006, "rewards/rejected": -3.884120464324951, "step": 3030 }, { "epoch": 0.8, "grad_norm": 6.40625, "learning_rate": 6.100821667196041e-07, "logits/chosen": -1.4694463014602661, "logits/rejected": -1.2010104656219482, "logps/chosen": -551.3878173828125, "logps/rejected": -589.3790283203125, "loss": 0.4979, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7574946880340576, "rewards/margins": 1.0347812175750732, "rewards/rejected": -3.792275905609131, "step": 3040 }, { "epoch": 0.8, "grad_norm": 29.5, "learning_rate": 5.952069692493062e-07, "logits/chosen": -1.2609448432922363, "logits/rejected": -1.1505969762802124, "logps/chosen": -498.6568908691406, "logps/rejected": -627.9306640625, "loss": 0.4171, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.703416347503662, "rewards/margins": 1.2583777904510498, "rewards/rejected": -3.961793899536133, "step": 3050 }, { "epoch": 0.8, "grad_norm": 10.625, "learning_rate": 5.80490821448918e-07, "logits/chosen": -1.216658353805542, "logits/rejected": -1.2167049646377563, "logps/chosen": -540.7564086914062, "logps/rejected": -711.563232421875, "loss": 0.4298, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7823424339294434, "rewards/margins": 1.2834153175354004, "rewards/rejected": -4.065757751464844, "step": 3060 }, { "epoch": 0.8, "grad_norm": 9.5625, "learning_rate": 5.659349521125459e-07, "logits/chosen": -1.4194704294204712, "logits/rejected": -1.3601640462875366, "logps/chosen": -555.782958984375, "logps/rejected": -634.6406860351562, "loss": 0.5047, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.743645191192627, "rewards/margins": 0.9945963025093079, "rewards/rejected": -3.7382407188415527, "step": 3070 }, { "epoch": 0.81, "grad_norm": 6.4375, "learning_rate": 5.5154057665109e-07, "logits/chosen": -1.3637388944625854, "logits/rejected": -1.216048240661621, "logps/chosen": -546.4483642578125, "logps/rejected": -646.1047973632812, "loss": 0.4807, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.855586528778076, "rewards/margins": 1.2608329057693481, "rewards/rejected": -4.116419792175293, "step": 3080 }, { "epoch": 0.81, "grad_norm": 11.8125, "learning_rate": 5.373088969907586e-07, "logits/chosen": -1.3931351900100708, "logits/rejected": -1.2272682189941406, "logps/chosen": -558.13232421875, "logps/rejected": -618.197265625, "loss": 0.4482, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8131866455078125, "rewards/margins": 1.1027860641479492, "rewards/rejected": -3.915972948074341, "step": 3090 }, { "epoch": 0.81, "grad_norm": 7.53125, "learning_rate": 5.23241101472709e-07, "logits/chosen": -1.3162554502487183, "logits/rejected": -1.1940876245498657, "logps/chosen": -549.4010009765625, "logps/rejected": -625.9002075195312, "loss": 0.4908, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7309937477111816, "rewards/margins": 0.9850690960884094, "rewards/rejected": -3.7160630226135254, "step": 3100 }, { "epoch": 0.81, "eval_logits/chosen": -1.2554689645767212, "eval_logits/rejected": -1.1347445249557495, "eval_logps/chosen": -549.6837158203125, "eval_logps/rejected": -638.7192993164062, "eval_loss": 0.4868563115596771, "eval_rewards/accuracies": 0.7419999837875366, "eval_rewards/chosen": -2.8503170013427734, "eval_rewards/margins": 1.0907903909683228, "eval_rewards/rejected": -3.9411072731018066, "eval_runtime": 385.4515, "eval_samples_per_second": 5.189, "eval_steps_per_second": 0.649, "step": 3100 }, { "epoch": 0.81, "grad_norm": 8.375, "learning_rate": 5.09338364753818e-07, "logits/chosen": -1.3838107585906982, "logits/rejected": -1.2234851121902466, "logps/chosen": -565.4810791015625, "logps/rejected": -655.7274169921875, "loss": 0.5191, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.800589084625244, "rewards/margins": 1.0603386163711548, "rewards/rejected": -3.8609280586242676, "step": 3110 }, { "epoch": 0.82, "grad_norm": 11.0625, "learning_rate": 4.956018477086005e-07, "logits/chosen": -1.3474712371826172, "logits/rejected": -1.1852939128875732, "logps/chosen": -559.21142578125, "logps/rejected": -640.77685546875, "loss": 0.5116, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9007859230041504, "rewards/margins": 1.0891984701156616, "rewards/rejected": -3.9899849891662598, "step": 3120 }, { "epoch": 0.82, "grad_norm": 12.125, "learning_rate": 4.820326973322764e-07, "logits/chosen": -1.2560558319091797, "logits/rejected": -1.1815481185913086, "logps/chosen": -549.0807495117188, "logps/rejected": -643.4081420898438, "loss": 0.5513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9811716079711914, "rewards/margins": 1.0034395456314087, "rewards/rejected": -3.9846110343933105, "step": 3130 }, { "epoch": 0.82, "grad_norm": 10.5, "learning_rate": 4.686320466449981e-07, "logits/chosen": -1.2670228481292725, "logits/rejected": -1.0823358297348022, "logps/chosen": -515.7471923828125, "logps/rejected": -646.492919921875, "loss": 0.454, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7649807929992676, "rewards/margins": 1.3290727138519287, "rewards/rejected": -4.094053745269775, "step": 3140 }, { "epoch": 0.82, "grad_norm": 6.8125, "learning_rate": 4.554010145972418e-07, "logits/chosen": -1.4123005867004395, "logits/rejected": -1.2410565614700317, "logps/chosen": -551.8477783203125, "logps/rejected": -645.891357421875, "loss": 0.5464, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.880317449569702, "rewards/margins": 1.0536738634109497, "rewards/rejected": -3.9339919090270996, "step": 3150 }, { "epoch": 0.83, "grad_norm": 8.75, "learning_rate": 4.4234070597637455e-07, "logits/chosen": -1.2695270776748657, "logits/rejected": -1.1814700365066528, "logps/chosen": -558.7033081054688, "logps/rejected": -645.794189453125, "loss": 0.5261, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.831700086593628, "rewards/margins": 0.990101158618927, "rewards/rejected": -3.8218014240264893, "step": 3160 }, { "epoch": 0.83, "grad_norm": 6.75, "learning_rate": 4.2945221131440783e-07, "logits/chosen": -1.244091272354126, "logits/rejected": -1.0454550981521606, "logps/chosen": -539.8818359375, "logps/rejected": -634.0319213867188, "loss": 0.43, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.7417213916778564, "rewards/margins": 1.257968544960022, "rewards/rejected": -3.9996895790100098, "step": 3170 }, { "epoch": 0.83, "grad_norm": 9.6875, "learning_rate": 4.167366067969381e-07, "logits/chosen": -1.3269858360290527, "logits/rejected": -1.2656229734420776, "logps/chosen": -505.6949157714844, "logps/rejected": -628.41015625, "loss": 0.4885, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.827846050262451, "rewards/margins": 0.9887911677360535, "rewards/rejected": -3.816636562347412, "step": 3180 }, { "epoch": 0.83, "grad_norm": 7.0, "learning_rate": 4.041949541732826e-07, "logits/chosen": -1.327467441558838, "logits/rejected": -1.272200584411621, "logps/chosen": -555.987060546875, "logps/rejected": -642.5946655273438, "loss": 0.5129, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9210267066955566, "rewards/margins": 1.0088088512420654, "rewards/rejected": -3.929835557937622, "step": 3190 }, { "epoch": 0.84, "grad_norm": 12.1875, "learning_rate": 3.9182830066782614e-07, "logits/chosen": -1.2530772686004639, "logits/rejected": -1.2375959157943726, "logps/chosen": -542.0306396484375, "logps/rejected": -671.2916259765625, "loss": 0.4641, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8428683280944824, "rewards/margins": 1.1683650016784668, "rewards/rejected": -4.011233329772949, "step": 3200 }, { "epoch": 0.84, "eval_logits/chosen": -1.2554447650909424, "eval_logits/rejected": -1.1346678733825684, "eval_logps/chosen": -545.7666015625, "eval_logps/rejected": -634.5078735351562, "eval_loss": 0.48661333322525024, "eval_rewards/accuracies": 0.7404999732971191, "eval_rewards/chosen": -2.8111462593078613, "eval_rewards/margins": 1.0878463983535767, "eval_rewards/rejected": -3.8989927768707275, "eval_runtime": 385.3303, "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 3200 }, { "epoch": 0.84, "grad_norm": 8.375, "learning_rate": 3.796376788925771e-07, "logits/chosen": -1.264981985092163, "logits/rejected": -1.1978137493133545, "logps/chosen": -532.4588623046875, "logps/rejected": -602.8772583007812, "loss": 0.5036, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7103111743927, "rewards/margins": 0.944588840007782, "rewards/rejected": -3.654900074005127, "step": 3210 }, { "epoch": 0.84, "grad_norm": 6.78125, "learning_rate": 3.676241067609465e-07, "logits/chosen": -1.3384299278259277, "logits/rejected": -1.2301527261734009, "logps/chosen": -568.9376220703125, "logps/rejected": -628.9427490234375, "loss": 0.5105, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7523765563964844, "rewards/margins": 1.0309317111968994, "rewards/rejected": -3.7833080291748047, "step": 3220 }, { "epoch": 0.85, "grad_norm": 11.625, "learning_rate": 3.5578858740274976e-07, "logits/chosen": -1.2620373964309692, "logits/rejected": -1.1610171794891357, "logps/chosen": -548.6265258789062, "logps/rejected": -628.5254516601562, "loss": 0.5183, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8689258098602295, "rewards/margins": 0.9248504638671875, "rewards/rejected": -3.793776273727417, "step": 3230 }, { "epoch": 0.85, "grad_norm": 12.0625, "learning_rate": 3.44132109080447e-07, "logits/chosen": -1.4505221843719482, "logits/rejected": -1.2806892395019531, "logps/chosen": -536.9176025390625, "logps/rejected": -614.2163696289062, "loss": 0.4513, "rewards/accuracies": 0.78125, "rewards/chosen": -2.71694016456604, "rewards/margins": 1.147782802581787, "rewards/rejected": -3.864722490310669, "step": 3240 }, { "epoch": 0.85, "grad_norm": 10.375, "learning_rate": 3.3265564510662344e-07, "logits/chosen": -1.377443790435791, "logits/rejected": -1.2464927434921265, "logps/chosen": -556.8729858398438, "logps/rejected": -654.7142333984375, "loss": 0.4331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6353092193603516, "rewards/margins": 1.2298697233200073, "rewards/rejected": -3.8651790618896484, "step": 3250 }, { "epoch": 0.85, "grad_norm": 16.0, "learning_rate": 3.213601537627195e-07, "logits/chosen": -1.2895920276641846, "logits/rejected": -1.1866865158081055, "logps/chosen": -556.0447998046875, "logps/rejected": -639.7942504882812, "loss": 0.5502, "rewards/accuracies": 0.71875, "rewards/chosen": -2.989259719848633, "rewards/margins": 1.0085315704345703, "rewards/rejected": -3.997791290283203, "step": 3260 }, { "epoch": 0.86, "grad_norm": 12.3125, "learning_rate": 3.1024657821901063e-07, "logits/chosen": -1.3556302785873413, "logits/rejected": -1.2743966579437256, "logps/chosen": -520.0645141601562, "logps/rejected": -612.8271484375, "loss": 0.5058, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.72322678565979, "rewards/margins": 1.0860865116119385, "rewards/rejected": -3.8093135356903076, "step": 3270 }, { "epoch": 0.86, "grad_norm": 13.625, "learning_rate": 2.9931584645585654e-07, "logits/chosen": -1.3034099340438843, "logits/rejected": -1.271439790725708, "logps/chosen": -548.0056762695312, "logps/rejected": -648.6056518554688, "loss": 0.514, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.734912395477295, "rewards/margins": 0.9768469929695129, "rewards/rejected": -3.711759090423584, "step": 3280 }, { "epoch": 0.86, "grad_norm": 5.9375, "learning_rate": 2.885688711862136e-07, "logits/chosen": -1.3113230466842651, "logits/rejected": -1.3101108074188232, "logps/chosen": -549.2462768554688, "logps/rejected": -667.9041748046875, "loss": 0.5153, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8819470405578613, "rewards/margins": 1.2062867879867554, "rewards/rejected": -4.0882344245910645, "step": 3290 }, { "epoch": 0.86, "grad_norm": 12.1875, "learning_rate": 2.7800654977942486e-07, "logits/chosen": -1.3042861223220825, "logits/rejected": -1.1825424432754517, "logps/chosen": -531.413818359375, "logps/rejected": -631.4432373046875, "loss": 0.5096, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.740572452545166, "rewards/margins": 1.0384232997894287, "rewards/rejected": -3.778996229171753, "step": 3300 }, { "epoch": 0.86, "eval_logits/chosen": -1.2585511207580566, "eval_logits/rejected": -1.1378772258758545, "eval_logps/chosen": -544.573974609375, "eval_logps/rejected": -633.404052734375, "eval_loss": 0.4864084720611572, "eval_rewards/accuracies": 0.7394999861717224, "eval_rewards/chosen": -2.7992191314697266, "eval_rewards/margins": 1.0887356996536255, "eval_rewards/rejected": -3.8879551887512207, "eval_runtime": 385.2344, "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 3300 }, { "epoch": 0.87, "grad_norm": 15.8125, "learning_rate": 2.6762976418628797e-07, "logits/chosen": -1.345733880996704, "logits/rejected": -1.2021456956863403, "logps/chosen": -504.2335510253906, "logps/rejected": -557.3888549804688, "loss": 0.5433, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.744576930999756, "rewards/margins": 0.9676315188407898, "rewards/rejected": -3.7122085094451904, "step": 3310 }, { "epoch": 0.87, "grad_norm": 10.25, "learning_rate": 2.5743938086541354e-07, "logits/chosen": -1.3191635608673096, "logits/rejected": -1.1912063360214233, "logps/chosen": -537.1017456054688, "logps/rejected": -623.8556518554688, "loss": 0.4854, "rewards/accuracies": 0.75, "rewards/chosen": -2.728193998336792, "rewards/margins": 1.1158511638641357, "rewards/rejected": -3.8440451622009277, "step": 3320 }, { "epoch": 0.87, "grad_norm": 12.5625, "learning_rate": 2.4743625071087574e-07, "logits/chosen": -1.4545891284942627, "logits/rejected": -1.2835543155670166, "logps/chosen": -547.1002807617188, "logps/rejected": -641.8110961914062, "loss": 0.481, "rewards/accuracies": 0.75, "rewards/chosen": -2.6952996253967285, "rewards/margins": 1.241824746131897, "rewards/rejected": -3.937124252319336, "step": 3330 }, { "epoch": 0.87, "grad_norm": 11.9375, "learning_rate": 2.3762120898116498e-07, "logits/chosen": -1.3351377248764038, "logits/rejected": -1.2292808294296265, "logps/chosen": -560.9166259765625, "logps/rejected": -651.31103515625, "loss": 0.4951, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.9212374687194824, "rewards/margins": 0.9840442538261414, "rewards/rejected": -3.9052817821502686, "step": 3340 }, { "epoch": 0.88, "grad_norm": 9.25, "learning_rate": 2.2799507522944048e-07, "logits/chosen": -1.2632300853729248, "logits/rejected": -1.1798118352890015, "logps/chosen": -535.8150634765625, "logps/rejected": -648.6831665039062, "loss": 0.4501, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7001757621765137, "rewards/margins": 1.211038589477539, "rewards/rejected": -3.9112143516540527, "step": 3350 }, { "epoch": 0.88, "grad_norm": 9.5625, "learning_rate": 2.1855865323510056e-07, "logits/chosen": -1.3267244100570679, "logits/rejected": -1.1418159008026123, "logps/chosen": -551.0586547851562, "logps/rejected": -682.3759765625, "loss": 0.4332, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.783801317214966, "rewards/margins": 1.3722645044326782, "rewards/rejected": -4.156065940856934, "step": 3360 }, { "epoch": 0.88, "grad_norm": 6.84375, "learning_rate": 2.0931273093666575e-07, "logits/chosen": -1.2914237976074219, "logits/rejected": -1.1433568000793457, "logps/chosen": -527.1336669921875, "logps/rejected": -625.6705322265625, "loss": 0.4545, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.8694703578948975, "rewards/margins": 1.1879098415374756, "rewards/rejected": -4.057379722595215, "step": 3370 }, { "epoch": 0.88, "grad_norm": 12.1875, "learning_rate": 2.002580803659873e-07, "logits/chosen": -1.2892788648605347, "logits/rejected": -1.1720714569091797, "logps/chosen": -541.9439697265625, "logps/rejected": -635.7294311523438, "loss": 0.4668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.873889446258545, "rewards/margins": 1.1193937063217163, "rewards/rejected": -3.9932830333709717, "step": 3380 }, { "epoch": 0.89, "grad_norm": 5.5625, "learning_rate": 1.913954575837826e-07, "logits/chosen": -1.3597743511199951, "logits/rejected": -1.1029024124145508, "logps/chosen": -555.9939575195312, "logps/rejected": -613.6519775390625, "loss": 0.4753, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.8387951850891113, "rewards/margins": 1.0735225677490234, "rewards/rejected": -3.9123177528381348, "step": 3390 }, { "epoch": 0.89, "grad_norm": 8.875, "learning_rate": 1.827256026165028e-07, "logits/chosen": -1.37373685836792, "logits/rejected": -1.178899884223938, "logps/chosen": -578.7473754882812, "logps/rejected": -641.29345703125, "loss": 0.455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.636767864227295, "rewards/margins": 1.1961476802825928, "rewards/rejected": -3.8329155445098877, "step": 3400 }, { "epoch": 0.89, "eval_logits/chosen": -1.2543540000915527, "eval_logits/rejected": -1.1335822343826294, "eval_logps/chosen": -545.915283203125, "eval_logps/rejected": -635.4321899414062, "eval_loss": 0.48658648133277893, "eval_rewards/accuracies": 0.7394999861717224, "eval_rewards/chosen": -2.8126325607299805, "eval_rewards/margins": 1.0956026315689087, "eval_rewards/rejected": -3.9082350730895996, "eval_runtime": 385.1178, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 3400 }, { "epoch": 0.89, "grad_norm": 14.25, "learning_rate": 1.7424923939454274e-07, "logits/chosen": -1.3175909519195557, "logits/rejected": -1.1420743465423584, "logps/chosen": -559.6712036132812, "logps/rejected": -638.4949951171875, "loss": 0.4204, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7832179069519043, "rewards/margins": 1.2219712734222412, "rewards/rejected": -4.005189418792725, "step": 3410 }, { "epoch": 0.9, "grad_norm": 18.25, "learning_rate": 1.6596707569179304e-07, "logits/chosen": -1.4023295640945435, "logits/rejected": -1.2579714059829712, "logps/chosen": -565.6453857421875, "logps/rejected": -633.6785278320312, "loss": 0.5107, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.844580888748169, "rewards/margins": 1.0315876007080078, "rewards/rejected": -3.8761680126190186, "step": 3420 }, { "epoch": 0.9, "grad_norm": 9.3125, "learning_rate": 1.578798030665385e-07, "logits/chosen": -1.3531277179718018, "logits/rejected": -1.1701006889343262, "logps/chosen": -551.0891723632812, "logps/rejected": -663.329345703125, "loss": 0.4451, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7639107704162598, "rewards/margins": 1.2928920984268188, "rewards/rejected": -4.056802749633789, "step": 3430 }, { "epoch": 0.9, "grad_norm": 8.625, "learning_rate": 1.499880968037165e-07, "logits/chosen": -1.3360685110092163, "logits/rejected": -1.204347014427185, "logps/chosen": -529.5865478515625, "logps/rejected": -599.6653442382812, "loss": 0.5141, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7397305965423584, "rewards/margins": 1.0634615421295166, "rewards/rejected": -3.803192138671875, "step": 3440 }, { "epoch": 0.9, "grad_norm": 14.5, "learning_rate": 1.4229261585852805e-07, "logits/chosen": -1.3513076305389404, "logits/rejected": -1.2741743326187134, "logps/chosen": -541.47216796875, "logps/rejected": -626.5045776367188, "loss": 0.457, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.730405807495117, "rewards/margins": 1.1085281372070312, "rewards/rejected": -3.8389339447021484, "step": 3450 }, { "epoch": 0.91, "grad_norm": 10.0625, "learning_rate": 1.3479400280141886e-07, "logits/chosen": -1.2767086029052734, "logits/rejected": -1.2311673164367676, "logps/chosen": -530.0853881835938, "logps/rejected": -643.3887939453125, "loss": 0.4882, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8382620811462402, "rewards/margins": 1.153211236000061, "rewards/rejected": -3.9914729595184326, "step": 3460 }, { "epoch": 0.91, "grad_norm": 9.375, "learning_rate": 1.2749288376442044e-07, "logits/chosen": -1.3543965816497803, "logits/rejected": -1.169668436050415, "logps/chosen": -568.0045166015625, "logps/rejected": -621.7847900390625, "loss": 0.4688, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.714702606201172, "rewards/margins": 1.1129640340805054, "rewards/rejected": -3.827666759490967, "step": 3470 }, { "epoch": 0.91, "grad_norm": 9.8125, "learning_rate": 1.203898683888713e-07, "logits/chosen": -1.3755584955215454, "logits/rejected": -1.2311782836914062, "logps/chosen": -532.9498901367188, "logps/rejected": -625.6002197265625, "loss": 0.5499, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.8895249366760254, "rewards/margins": 0.9575474858283997, "rewards/rejected": -3.8470726013183594, "step": 3480 }, { "epoch": 0.91, "grad_norm": 9.75, "learning_rate": 1.1348554977451132e-07, "logits/chosen": -1.4002835750579834, "logits/rejected": -1.255327820777893, "logps/chosen": -557.9288330078125, "logps/rejected": -629.8707275390625, "loss": 0.5076, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.759397029876709, "rewards/margins": 1.0484449863433838, "rewards/rejected": -3.8078417778015137, "step": 3490 }, { "epoch": 0.92, "grad_norm": 6.75, "learning_rate": 1.0678050442995802e-07, "logits/chosen": -1.345915675163269, "logits/rejected": -1.1581257581710815, "logps/chosen": -561.4415283203125, "logps/rejected": -623.0227661132812, "loss": 0.5262, "rewards/accuracies": 0.75, "rewards/chosen": -2.8064818382263184, "rewards/margins": 1.0665854215621948, "rewards/rejected": -3.8730673789978027, "step": 3500 }, { "epoch": 0.92, "eval_logits/chosen": -1.255007028579712, "eval_logits/rejected": -1.1342185735702515, "eval_logps/chosen": -545.7534790039062, "eval_logps/rejected": -635.4207153320312, "eval_loss": 0.4864389896392822, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -2.8110146522521973, "eval_rewards/margins": 1.0971060991287231, "eval_rewards/rejected": -3.908120632171631, "eval_runtime": 385.1023, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 3500 }, { "epoch": 0.92, "grad_norm": 9.0, "learning_rate": 1.0027529222456755e-07, "logits/chosen": -1.3322703838348389, "logits/rejected": -1.164650321006775, "logps/chosen": -529.0792236328125, "logps/rejected": -624.087646484375, "loss": 0.4498, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7491297721862793, "rewards/margins": 1.125643014907837, "rewards/rejected": -3.8747730255126953, "step": 3510 }, { "epoch": 0.92, "grad_norm": 8.375, "learning_rate": 9.397045634168766e-08, "logits/chosen": -1.36007821559906, "logits/rejected": -1.2929136753082275, "logps/chosen": -542.2086181640625, "logps/rejected": -667.2059326171875, "loss": 0.4547, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.736257553100586, "rewards/margins": 1.2475742101669312, "rewards/rejected": -3.9838318824768066, "step": 3520 }, { "epoch": 0.92, "grad_norm": 11.625, "learning_rate": 8.78665232332998e-08, "logits/chosen": -1.2781856060028076, "logits/rejected": -1.210409164428711, "logps/chosen": -516.8308715820312, "logps/rejected": -617.1106567382812, "loss": 0.4795, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8612074851989746, "rewards/margins": 1.0104413032531738, "rewards/rejected": -3.8716487884521484, "step": 3530 }, { "epoch": 0.93, "grad_norm": 9.375, "learning_rate": 8.196400257606208e-08, "logits/chosen": -1.385122537612915, "logits/rejected": -1.24273681640625, "logps/chosen": -560.0970458984375, "logps/rejected": -682.8623046875, "loss": 0.4425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7701196670532227, "rewards/margins": 1.2804229259490967, "rewards/rejected": -4.05054235458374, "step": 3540 }, { "epoch": 0.93, "grad_norm": 9.9375, "learning_rate": 7.626338722875076e-08, "logits/chosen": -1.3233528137207031, "logits/rejected": -1.269012212753296, "logps/chosen": -527.5838623046875, "logps/rejected": -637.5836791992188, "loss": 0.4828, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7349019050598145, "rewards/margins": 1.0903024673461914, "rewards/rejected": -3.8252041339874268, "step": 3550 }, { "epoch": 0.93, "grad_norm": 7.03125, "learning_rate": 7.076515319110688e-08, "logits/chosen": -1.3301162719726562, "logits/rejected": -1.2381629943847656, "logps/chosen": -530.2079467773438, "logps/rejected": -606.7450561523438, "loss": 0.5044, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.739201545715332, "rewards/margins": 1.1811503171920776, "rewards/rejected": -3.9203522205352783, "step": 3560 }, { "epoch": 0.93, "grad_norm": 7.90625, "learning_rate": 6.54697595640899e-08, "logits/chosen": -1.3412398099899292, "logits/rejected": -1.2113425731658936, "logps/chosen": -574.6760864257812, "logps/rejected": -662.1055908203125, "loss": 0.4814, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8245387077331543, "rewards/margins": 1.1259464025497437, "rewards/rejected": -3.9504852294921875, "step": 3570 }, { "epoch": 0.94, "grad_norm": 10.125, "learning_rate": 6.037764851154426e-08, "logits/chosen": -1.3283928632736206, "logits/rejected": -1.286163568496704, "logps/chosen": -535.401611328125, "logps/rejected": -654.7335205078125, "loss": 0.4822, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7134242057800293, "rewards/margins": 1.1457810401916504, "rewards/rejected": -3.8592045307159424, "step": 3580 }, { "epoch": 0.94, "grad_norm": 6.9375, "learning_rate": 5.548924522327748e-08, "logits/chosen": -1.3209599256515503, "logits/rejected": -1.182340383529663, "logps/chosen": -537.6961669921875, "logps/rejected": -631.2122192382812, "loss": 0.4825, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7593882083892822, "rewards/margins": 1.081923484802246, "rewards/rejected": -3.8413116931915283, "step": 3590 }, { "epoch": 0.94, "grad_norm": 10.9375, "learning_rate": 5.0804957879556915e-08, "logits/chosen": -1.249495506286621, "logits/rejected": -1.1639585494995117, "logps/chosen": -500.98992919921875, "logps/rejected": -613.1359252929688, "loss": 0.466, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7542366981506348, "rewards/margins": 1.0757157802581787, "rewards/rejected": -3.8299522399902344, "step": 3600 }, { "epoch": 0.94, "eval_logits/chosen": -1.2554669380187988, "eval_logits/rejected": -1.1347417831420898, "eval_logps/chosen": -545.9835815429688, "eval_logps/rejected": -635.6727294921875, "eval_loss": 0.48658978939056396, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -2.813314914703369, "eval_rewards/margins": 1.0973262786865234, "eval_rewards/rejected": -3.9106414318084717, "eval_runtime": 385.0907, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 3600 }, { "epoch": 0.94, "grad_norm": 10.1875, "learning_rate": 4.632517761702815e-08, "logits/chosen": -1.2666916847229004, "logits/rejected": -1.127403974533081, "logps/chosen": -518.2024536132812, "logps/rejected": -636.0374145507812, "loss": 0.4363, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.827592134475708, "rewards/margins": 1.3016375303268433, "rewards/rejected": -4.129229545593262, "step": 3610 }, { "epoch": 0.95, "grad_norm": 12.5, "learning_rate": 4.205027849605359e-08, "logits/chosen": -1.2991037368774414, "logits/rejected": -1.2022724151611328, "logps/chosen": -558.1617431640625, "logps/rejected": -612.0789794921875, "loss": 0.68, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.1023831367492676, "rewards/margins": 0.8397830128669739, "rewards/rejected": -3.9421660900115967, "step": 3620 }, { "epoch": 0.95, "grad_norm": 9.375, "learning_rate": 3.798061746947995e-08, "logits/chosen": -1.4298536777496338, "logits/rejected": -1.2711670398712158, "logps/chosen": -541.1964721679688, "logps/rejected": -615.2937622070312, "loss": 0.479, "rewards/accuracies": 0.71875, "rewards/chosen": -2.770465135574341, "rewards/margins": 1.1328895092010498, "rewards/rejected": -3.9033546447753906, "step": 3630 }, { "epoch": 0.95, "grad_norm": 10.9375, "learning_rate": 3.411653435283158e-08, "logits/chosen": -1.3373726606369019, "logits/rejected": -1.1359134912490845, "logps/chosen": -546.8034057617188, "logps/rejected": -593.7394409179688, "loss": 0.4962, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7354676723480225, "rewards/margins": 1.0266190767288208, "rewards/rejected": -3.762086868286133, "step": 3640 }, { "epoch": 0.96, "grad_norm": 7.03125, "learning_rate": 3.04583517959367e-08, "logits/chosen": -1.3844215869903564, "logits/rejected": -1.2332738637924194, "logps/chosen": -517.3903198242188, "logps/rejected": -597.7869262695312, "loss": 0.4532, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6527369022369385, "rewards/margins": 1.128391146659851, "rewards/rejected": -3.7811279296875, "step": 3650 }, { "epoch": 0.96, "grad_norm": 9.0625, "learning_rate": 2.7006375255985984e-08, "logits/chosen": -1.3098169565200806, "logits/rejected": -1.2761331796646118, "logps/chosen": -556.2361450195312, "logps/rejected": -640.65283203125, "loss": 0.5747, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.914341449737549, "rewards/margins": 0.838718056678772, "rewards/rejected": -3.753058910369873, "step": 3660 }, { "epoch": 0.96, "grad_norm": 10.1875, "learning_rate": 2.3760892972027328e-08, "logits/chosen": -1.4296592473983765, "logits/rejected": -1.253159761428833, "logps/chosen": -565.5921020507812, "logps/rejected": -640.4884643554688, "loss": 0.5445, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.93088960647583, "rewards/margins": 1.0891053676605225, "rewards/rejected": -4.01999568939209, "step": 3670 }, { "epoch": 0.96, "grad_norm": 11.875, "learning_rate": 2.072217594089765e-08, "logits/chosen": -1.2928217649459839, "logits/rejected": -1.2739886045455933, "logps/chosen": -544.60205078125, "logps/rejected": -657.6286010742188, "loss": 0.4198, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8306102752685547, "rewards/margins": 1.2527996301651, "rewards/rejected": -4.083409786224365, "step": 3680 }, { "epoch": 0.97, "grad_norm": 8.8125, "learning_rate": 1.789047789459375e-08, "logits/chosen": -1.3845082521438599, "logits/rejected": -1.1975808143615723, "logps/chosen": -600.7689819335938, "logps/rejected": -661.3809204101562, "loss": 0.5254, "rewards/accuracies": 0.78125, "rewards/chosen": -2.837923049926758, "rewards/margins": 1.0966850519180298, "rewards/rejected": -3.934607744216919, "step": 3690 }, { "epoch": 0.97, "grad_norm": 6.875, "learning_rate": 1.5266035279088708e-08, "logits/chosen": -1.2211766242980957, "logits/rejected": -1.0900758504867554, "logps/chosen": -589.4644775390625, "logps/rejected": -672.6224975585938, "loss": 0.4945, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.928469181060791, "rewards/margins": 1.0935529470443726, "rewards/rejected": -4.022022247314453, "step": 3700 }, { "epoch": 0.97, "eval_logits/chosen": -1.252778172492981, "eval_logits/rejected": -1.1321126222610474, "eval_logps/chosen": -545.6665649414062, "eval_logps/rejected": -635.412353515625, "eval_loss": 0.486397385597229, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -2.8101449012756348, "eval_rewards/margins": 1.0978920459747314, "eval_rewards/rejected": -3.9080374240875244, "eval_runtime": 385.1334, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 3700 }, { "epoch": 0.97, "grad_norm": 12.1875, "learning_rate": 1.2849067234584623e-08, "logits/chosen": -1.2232288122177124, "logits/rejected": -1.1615407466888428, "logps/chosen": -513.3945922851562, "logps/rejected": -625.0435791015625, "loss": 0.4715, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7703394889831543, "rewards/margins": 1.192608118057251, "rewards/rejected": -3.9629478454589844, "step": 3710 }, { "epoch": 0.97, "grad_norm": 11.1875, "learning_rate": 1.0639775577218625e-08, "logits/chosen": -1.2154030799865723, "logits/rejected": -1.0470209121704102, "logps/chosen": -534.564453125, "logps/rejected": -611.8311767578125, "loss": 0.5179, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8537747859954834, "rewards/margins": 1.1320233345031738, "rewards/rejected": -3.985797882080078, "step": 3720 }, { "epoch": 0.98, "grad_norm": 10.3125, "learning_rate": 8.638344782207486e-09, "logits/chosen": -1.2473368644714355, "logits/rejected": -1.1350939273834229, "logps/chosen": -516.2252197265625, "logps/rejected": -601.020263671875, "loss": 0.4856, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.709373950958252, "rewards/margins": 1.07589852809906, "rewards/rejected": -3.7852725982666016, "step": 3730 }, { "epoch": 0.98, "grad_norm": 9.3125, "learning_rate": 6.84494196844715e-09, "logits/chosen": -1.2988349199295044, "logits/rejected": -1.1963183879852295, "logps/chosen": -549.7848510742188, "logps/rejected": -670.5635986328125, "loss": 0.4567, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7767837047576904, "rewards/margins": 1.3236409425735474, "rewards/rejected": -4.100424766540527, "step": 3740 }, { "epoch": 0.98, "grad_norm": 7.96875, "learning_rate": 5.259716884556121e-09, "logits/chosen": -1.3606340885162354, "logits/rejected": -1.224469780921936, "logps/chosen": -543.1236572265625, "logps/rejected": -640.4927978515625, "loss": 0.4694, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7865681648254395, "rewards/margins": 1.1219072341918945, "rewards/rejected": -3.908475399017334, "step": 3750 }, { "epoch": 0.98, "grad_norm": 9.0, "learning_rate": 3.882801896372967e-09, "logits/chosen": -1.3460079431533813, "logits/rejected": -1.2832306623458862, "logps/chosen": -539.73583984375, "logps/rejected": -619.1431884765625, "loss": 0.4913, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7437405586242676, "rewards/margins": 1.1124091148376465, "rewards/rejected": -3.8561501502990723, "step": 3760 }, { "epoch": 0.99, "grad_norm": 12.8125, "learning_rate": 2.7143119759026614e-09, "logits/chosen": -1.3690940141677856, "logits/rejected": -1.1916528940200806, "logps/chosen": -560.9449462890625, "logps/rejected": -645.1325073242188, "loss": 0.4329, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7667250633239746, "rewards/margins": 1.096879482269287, "rewards/rejected": -3.8636043071746826, "step": 3770 }, { "epoch": 0.99, "grad_norm": 7.65625, "learning_rate": 1.754344691717591e-09, "logits/chosen": -1.2690956592559814, "logits/rejected": -1.2165257930755615, "logps/chosen": -535.2860107421875, "logps/rejected": -644.5172119140625, "loss": 0.5302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8549444675445557, "rewards/margins": 0.8715957403182983, "rewards/rejected": -3.7265400886535645, "step": 3780 }, { "epoch": 0.99, "grad_norm": 10.9375, "learning_rate": 1.0029802008096335e-09, "logits/chosen": -1.2903029918670654, "logits/rejected": -1.1491575241088867, "logps/chosen": -556.7200927734375, "logps/rejected": -649.4456176757812, "loss": 0.4709, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.824282169342041, "rewards/margins": 1.1551063060760498, "rewards/rejected": -3.979388475418091, "step": 3790 }, { "epoch": 0.99, "grad_norm": 9.625, "learning_rate": 4.602812418974534e-10, "logits/chosen": -1.3866922855377197, "logits/rejected": -1.2608470916748047, "logps/chosen": -567.4412231445312, "logps/rejected": -653.213623046875, "loss": 0.5013, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.856003999710083, "rewards/margins": 1.1031793355941772, "rewards/rejected": -3.9591832160949707, "step": 3800 }, { "epoch": 0.99, "eval_logits/chosen": -1.2524324655532837, "eval_logits/rejected": -1.1317205429077148, "eval_logps/chosen": -545.9131469726562, "eval_logps/rejected": -635.618408203125, "eval_loss": 0.48637571930885315, "eval_rewards/accuracies": 0.7394999861717224, "eval_rewards/chosen": -2.8126115798950195, "eval_rewards/margins": 1.0974864959716797, "eval_rewards/rejected": -3.910098075866699, "eval_runtime": 385.0016, "eval_samples_per_second": 5.195, "eval_steps_per_second": 0.649, "step": 3800 }, { "epoch": 1.0, "grad_norm": 10.4375, "learning_rate": 1.2629313018819312e-10, "logits/chosen": -1.312417984008789, "logits/rejected": -1.191304326057434, "logps/chosen": -529.3787231445312, "logps/rejected": -611.1173095703125, "loss": 0.5046, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7690625190734863, "rewards/margins": 0.9739207029342651, "rewards/rejected": -3.742983341217041, "step": 3810 }, { "epoch": 1.0, "grad_norm": 21.25, "learning_rate": 1.0437535929996855e-12, "logits/chosen": -1.306217908859253, "logits/rejected": -1.1442514657974243, "logps/chosen": -570.4493408203125, "logps/rejected": -660.94970703125, "loss": 0.4569, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.836827516555786, "rewards/margins": 1.3515799045562744, "rewards/rejected": -4.188406944274902, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.5238101308459981, "train_runtime": 42749.2467, "train_samples_per_second": 1.43, "train_steps_per_second": 0.089 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }