diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -27,6348 +27,6348 @@ "epoch": 0.0, "grad_norm": 2.40625, "learning_rate": 1.3054830287206266e-07, - "logits/chosen": -2.2499454021453857, - "logits/rejected": -2.0522336959838867, - "logps/chosen": -279.5985107421875, - "logps/rejected": -245.43223571777344, - "loss": 0.6929, - "rewards/accuracies": 0.4861111044883728, - "rewards/chosen": 0.0009557433077134192, - "rewards/margins": 0.0005407779826782644, - "rewards/rejected": 0.0004149653250351548, + "logits/chosen": -2.2492425441741943, + "logits/rejected": -2.0517687797546387, + "logps/chosen": -279.6344909667969, + "logps/rejected": -245.47564697265625, + "loss": 0.6928, + "rewards/accuracies": 0.4930555522441864, + "rewards/chosen": 0.0005959311965852976, + "rewards/margins": 0.000615339376963675, + "rewards/rejected": -1.9408274965826422e-05, "step": 10 }, { "epoch": 0.01, "grad_norm": 2.5, "learning_rate": 2.610966057441253e-07, - "logits/chosen": -2.2457704544067383, - "logits/rejected": -1.944566011428833, - "logps/chosen": -305.46026611328125, - "logps/rejected": -237.7046356201172, - "loss": 0.6927, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.004223807714879513, - "rewards/margins": 0.0008180936565622687, - "rewards/rejected": 0.0034057139419019222, + "logits/chosen": -2.244947671890259, + "logits/rejected": -1.943969964981079, + "logps/chosen": -305.4734802246094, + "logps/rejected": -237.70083618164062, + "loss": 0.6928, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004091521259397268, + "rewards/margins": 0.000647729029878974, + "rewards/rejected": 0.0034437919966876507, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.3125, "learning_rate": 3.9164490861618804e-07, - "logits/chosen": -2.2053937911987305, - "logits/rejected": -2.1369049549102783, - "logps/chosen": -251.22940063476562, - "logps/rejected": -251.3945770263672, + "logits/chosen": -2.205514907836914, + "logits/rejected": -2.1370320320129395, + "logps/chosen": -251.25662231445312, + "logps/rejected": -251.41213989257812, "loss": 0.6922, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.011935219168663025, - "rewards/margins": 0.001991255208849907, - "rewards/rejected": 0.009943963959813118, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.011662699282169342, + "rewards/margins": 0.0018940108129754663, + "rewards/rejected": 0.00976868998259306, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.9453125, "learning_rate": 5.221932114882506e-07, - "logits/chosen": -2.0609099864959717, - "logits/rejected": -2.0232198238372803, - "logps/chosen": -216.2375030517578, - "logps/rejected": -221.68643188476562, - "loss": 0.6916, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.019067076966166496, - "rewards/margins": 0.0031169778667390347, - "rewards/rejected": 0.015950100496411324, + "logits/chosen": -2.0618391036987305, + "logits/rejected": -2.0241973400115967, + "logps/chosen": -216.21438598632812, + "logps/rejected": -221.6951141357422, + "loss": 0.6915, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.019298259168863297, + "rewards/margins": 0.0034350629430264235, + "rewards/rejected": 0.015863195061683655, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.078125, "learning_rate": 6.527415143603135e-07, - "logits/chosen": -2.1113991737365723, - "logits/rejected": -2.0994935035705566, - "logps/chosen": -266.8785095214844, - "logps/rejected": -234.3098907470703, + "logits/chosen": -2.1124298572540283, + "logits/rejected": -2.1008057594299316, + "logps/chosen": -266.8966064453125, + "logps/rejected": -234.32998657226562, "loss": 0.6906, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.030090373009443283, - "rewards/margins": 0.005098854657262564, - "rewards/rejected": 0.024991516023874283, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.029909158125519753, + "rewards/margins": 0.005118774715811014, + "rewards/rejected": 0.024790380150079727, "step": 50 }, { "epoch": 0.02, "grad_norm": 2.125, "learning_rate": 7.832898172323761e-07, - "logits/chosen": -2.099452257156372, - "logits/rejected": -1.9423996210098267, - "logps/chosen": -252.3367919921875, - "logps/rejected": -226.71066284179688, - "loss": 0.69, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.031631551682949066, - "rewards/margins": 0.006400656886398792, - "rewards/rejected": 0.025230899453163147, + "logits/chosen": -2.099602460861206, + "logits/rejected": -1.9424635171890259, + "logps/chosen": -252.27310180664062, + "logps/rejected": -226.72030639648438, + "loss": 0.6897, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.03226853534579277, + "rewards/margins": 0.007134293206036091, + "rewards/rejected": 0.025134241208434105, "step": 60 }, { "epoch": 0.02, "grad_norm": 2.03125, "learning_rate": 9.138381201044387e-07, - "logits/chosen": -2.243349552154541, - "logits/rejected": -2.035635471343994, - "logps/chosen": -272.0526428222656, - "logps/rejected": -246.71243286132812, - "loss": 0.6878, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.04103558883070946, - "rewards/margins": 0.01095958799123764, - "rewards/rejected": 0.03007599711418152, + "logits/chosen": -2.2440109252929688, + "logits/rejected": -2.036339282989502, + "logps/chosen": -272.09234619140625, + "logps/rejected": -246.6947784423828, + "loss": 0.6881, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.04063863307237625, + "rewards/margins": 0.010386193171143532, + "rewards/rejected": 0.03025243617594242, "step": 70 }, { "epoch": 0.02, - "grad_norm": 2.40625, + "grad_norm": 2.359375, "learning_rate": 1.0443864229765013e-06, - "logits/chosen": -2.154019355773926, - "logits/rejected": -1.9776532649993896, - "logps/chosen": -257.60150146484375, - "logps/rejected": -246.8785858154297, - "loss": 0.6873, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.03827068209648132, - "rewards/margins": 0.01215548999607563, - "rewards/rejected": 0.026115190237760544, + "logits/chosen": -2.1541717052459717, + "logits/rejected": -1.9777501821517944, + "logps/chosen": -257.61871337890625, + "logps/rejected": -246.86483764648438, + "loss": 0.6874, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.038099195808172226, + "rewards/margins": 0.011846454814076424, + "rewards/rejected": 0.026252740994095802, "step": 80 }, { "epoch": 0.02, - "grad_norm": 2.203125, + "grad_norm": 2.1875, "learning_rate": 1.1749347258485642e-06, - "logits/chosen": -2.135911703109741, - "logits/rejected": -2.000063419342041, - "logps/chosen": -250.10903930664062, - "logps/rejected": -234.52127075195312, + "logits/chosen": -2.1348958015441895, + "logits/rejected": -1.998792290687561, + "logps/chosen": -250.1610107421875, + "logps/rejected": -234.56787109375, "loss": 0.6846, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.04227185994386673, - "rewards/margins": 0.017635947093367577, - "rewards/rejected": 0.024635912850499153, + "rewards/chosen": 0.04175186529755592, + "rewards/margins": 0.01758204773068428, + "rewards/rejected": 0.024169817566871643, "step": 90 }, { "epoch": 0.03, - "grad_norm": 2.140625, + "grad_norm": 2.125, "learning_rate": 1.305483028720627e-06, - "logits/chosen": -2.180065631866455, - "logits/rejected": -2.069608449935913, - "logps/chosen": -247.0371551513672, - "logps/rejected": -230.7642822265625, - "loss": 0.6823, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.04732387885451317, - "rewards/margins": 0.022458035498857498, - "rewards/rejected": 0.02486584149301052, + "logits/chosen": -2.1793951988220215, + "logits/rejected": -2.0686168670654297, + "logps/chosen": -247.0215301513672, + "logps/rejected": -230.79537963867188, + "loss": 0.6821, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.04747994989156723, + "rewards/margins": 0.0229250006377697, + "rewards/rejected": 0.024554943665862083, "step": 100 }, { "epoch": 0.03, - "eval_logits/chosen": -2.095273017883301, - "eval_logits/rejected": -1.955887794494629, - "eval_logps/chosen": -259.675048828125, - "eval_logps/rejected": -241.93234252929688, - "eval_loss": 0.6821526885032654, - "eval_rewards/accuracies": 0.6610000133514404, - "eval_rewards/chosen": 0.04976964741945267, - "eval_rewards/margins": 0.02300717867910862, - "eval_rewards/rejected": 0.026762468740344048, - "eval_runtime": 384.6998, - "eval_samples_per_second": 5.199, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -2.0950841903686523, + "eval_logits/rejected": -1.9557065963745117, + "eval_logps/chosen": -259.6705627441406, + "eval_logps/rejected": -241.93917846679688, + "eval_loss": 0.6820979714393616, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": 0.04981444031000137, + "eval_rewards/margins": 0.02312026545405388, + "eval_rewards/rejected": 0.026694171130657196, + "eval_runtime": 385.815, + "eval_samples_per_second": 5.184, + "eval_steps_per_second": 0.648, "step": 100 }, { "epoch": 0.03, "grad_norm": 2.3125, "learning_rate": 1.4360313315926894e-06, - "logits/chosen": -2.14589786529541, - "logits/rejected": -2.0023417472839355, - "logps/chosen": -284.3846130371094, - "logps/rejected": -238.9386444091797, - "loss": 0.6789, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.04979206249117851, - "rewards/margins": 0.02978363260626793, - "rewards/rejected": 0.020008429884910583, + "logits/chosen": -2.146080493927002, + "logits/rejected": -2.002453327178955, + "logps/chosen": -284.4079895019531, + "logps/rejected": -238.9375457763672, + "loss": 0.6791, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04955831170082092, + "rewards/margins": 0.029538575559854507, + "rewards/rejected": 0.020019738003611565, "step": 110 }, { "epoch": 0.03, "grad_norm": 2.15625, "learning_rate": 1.5665796344647521e-06, - "logits/chosen": -2.192373275756836, - "logits/rejected": -2.052762985229492, - "logps/chosen": -287.498291015625, - "logps/rejected": -271.96441650390625, - "loss": 0.6727, + "logits/chosen": -2.1928741931915283, + "logits/rejected": -2.0533928871154785, + "logps/chosen": -287.5110778808594, + "logps/rejected": -271.9446716308594, + "loss": 0.6728, "rewards/accuracies": 0.6875, - "rewards/chosen": 0.05546677112579346, - "rewards/margins": 0.04281745105981827, - "rewards/rejected": 0.012649321928620338, + "rewards/chosen": 0.05533873289823532, + "rewards/margins": 0.04249165579676628, + "rewards/rejected": 0.01284707523882389, "step": 120 }, { "epoch": 0.03, - "grad_norm": 2.640625, + "grad_norm": 2.671875, "learning_rate": 1.6971279373368146e-06, - "logits/chosen": -2.2076873779296875, - "logits/rejected": -2.1181204319000244, - "logps/chosen": -250.1416015625, - "logps/rejected": -252.60836791992188, + "logits/chosen": -2.2082314491271973, + "logits/rejected": -2.118213653564453, + "logps/chosen": -250.14013671875, + "logps/rejected": -252.6034393310547, "loss": 0.6701, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.05010415241122246, - "rewards/margins": 0.04891490936279297, - "rewards/rejected": 0.0011892480542883277, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05011880397796631, + "rewards/margins": 0.048879969865083694, + "rewards/rejected": 0.001238831551745534, "step": 130 }, { "epoch": 0.04, "grad_norm": 2.5, "learning_rate": 1.8276762402088774e-06, - "logits/chosen": -2.2444560527801514, - "logits/rejected": -1.9101192951202393, - "logps/chosen": -270.51544189453125, - "logps/rejected": -226.2876739501953, - "loss": 0.6685, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.04222479462623596, - "rewards/margins": 0.05299673601984978, - "rewards/rejected": -0.010771943256258965, + "logits/chosen": -2.24537992477417, + "logits/rejected": -1.9110206365585327, + "logps/chosen": -270.5356750488281, + "logps/rejected": -226.2827606201172, + "loss": 0.6686, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.04202268272638321, + "rewards/margins": 0.052745603024959564, + "rewards/rejected": -0.010722924955189228, "step": 140 }, { "epoch": 0.04, - "grad_norm": 2.640625, + "grad_norm": 2.65625, "learning_rate": 1.9582245430809403e-06, - "logits/chosen": -2.264216899871826, - "logits/rejected": -2.0381903648376465, - "logps/chosen": -280.3586120605469, - "logps/rejected": -242.8353271484375, - "loss": 0.6677, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.03652295097708702, - "rewards/margins": 0.05572789907455444, - "rewards/rejected": -0.019204948097467422, + "logits/chosen": -2.264875888824463, + "logits/rejected": -2.0387892723083496, + "logps/chosen": -280.36077880859375, + "logps/rejected": -242.8515625, + "loss": 0.6676, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.036501698195934296, + "rewards/margins": 0.05586882680654526, + "rewards/rejected": -0.019367124885320663, "step": 150 }, { "epoch": 0.04, "grad_norm": 2.71875, "learning_rate": 2.0887728459530026e-06, - "logits/chosen": -2.1548912525177, - "logits/rejected": -2.052673816680908, - "logps/chosen": -256.0513610839844, - "logps/rejected": -261.861328125, - "loss": 0.6688, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.007725180592387915, - "rewards/margins": 0.055973686277866364, - "rewards/rejected": -0.04824850708246231, + "logits/chosen": -2.15449595451355, + "logits/rejected": -2.0523486137390137, + "logps/chosen": -256.1204833984375, + "logps/rejected": -261.9712219238281, + "loss": 0.6686, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.007034213747829199, + "rewards/margins": 0.0563817024230957, + "rewards/rejected": -0.04934748262166977, "step": 160 }, { "epoch": 0.04, "grad_norm": 2.890625, "learning_rate": 2.2193211488250653e-06, - "logits/chosen": -2.124185562133789, - "logits/rejected": -1.9691429138183594, - "logps/chosen": -220.9111328125, - "logps/rejected": -228.2654571533203, - "loss": 0.6707, + "logits/chosen": -2.1238508224487305, + "logits/rejected": -1.9688222408294678, + "logps/chosen": -220.9573211669922, + "logps/rejected": -228.40869140625, + "loss": 0.6703, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.0032043636310845613, - "rewards/margins": 0.051164913922548294, - "rewards/rejected": -0.054369281977415085, + "rewards/chosen": -0.003666641190648079, + "rewards/margins": 0.05213465169072151, + "rewards/rejected": -0.05580129101872444, "step": 170 }, { "epoch": 0.05, - "grad_norm": 2.984375, + "grad_norm": 3.28125, "learning_rate": 2.3498694516971284e-06, - "logits/chosen": -2.122745990753174, - "logits/rejected": -1.9872467517852783, - "logps/chosen": -259.26727294921875, - "logps/rejected": -252.7638702392578, - "loss": 0.6637, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.03562777489423752, - "rewards/margins": 0.06830445677042007, - "rewards/rejected": -0.1039322167634964, + "logits/chosen": -2.1223385334014893, + "logits/rejected": -1.9868714809417725, + "logps/chosen": -258.9825134277344, + "logps/rejected": -252.4698944091797, + "loss": 0.6638, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03278004750609398, + "rewards/margins": 0.06821247935295105, + "rewards/rejected": -0.10099252313375473, "step": 180 }, { "epoch": 0.05, - "grad_norm": 3.8125, + "grad_norm": 3.890625, "learning_rate": 2.4804177545691907e-06, - "logits/chosen": -2.2456932067871094, - "logits/rejected": -2.029968738555908, - "logps/chosen": -275.09014892578125, - "logps/rejected": -256.80023193359375, + "logits/chosen": -2.2460696697235107, + "logits/rejected": -2.0304675102233887, + "logps/chosen": -274.5130920410156, + "logps/rejected": -256.2106628417969, "loss": 0.65, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.050059832632541656, - "rewards/margins": 0.10052521526813507, - "rewards/rejected": -0.15058502554893494, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04428885504603386, + "rewards/margins": 0.10040076822042465, + "rewards/rejected": -0.1446896344423294, "step": 190 }, { "epoch": 0.05, - "grad_norm": 3.40625, + "grad_norm": 3.375, "learning_rate": 2.610966057441254e-06, - "logits/chosen": -2.194169759750366, - "logits/rejected": -1.954360008239746, - "logps/chosen": -258.94854736328125, - "logps/rejected": -231.2574920654297, - "loss": 0.6492, + "logits/chosen": -2.1960341930389404, + "logits/rejected": -1.95565927028656, + "logps/chosen": -259.01934814453125, + "logps/rejected": -231.2660369873047, + "loss": 0.6496, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.09239193052053452, - "rewards/margins": 0.10424431413412094, - "rewards/rejected": -0.19663624465465546, + "rewards/chosen": -0.09310005605220795, + "rewards/margins": 0.10362167656421661, + "rewards/rejected": -0.19672173261642456, "step": 200 }, { "epoch": 0.05, - "eval_logits/chosen": -2.0706493854522705, - "eval_logits/rejected": -1.9337996244430542, - "eval_logps/chosen": -269.44000244140625, - "eval_logps/rejected": -259.9606018066406, - "eval_loss": 0.6490568518638611, - "eval_rewards/accuracies": 0.6815000176429749, - "eval_rewards/chosen": -0.04788003861904144, - "eval_rewards/margins": 0.10563990473747253, - "eval_rewards/rejected": -0.15351995825767517, - "eval_runtime": 408.5567, - "eval_samples_per_second": 4.895, - "eval_steps_per_second": 0.612, + "eval_logits/chosen": -2.068035125732422, + "eval_logits/rejected": -1.9312690496444702, + "eval_logps/chosen": -270.0797119140625, + "eval_logps/rejected": -260.6905517578125, + "eval_loss": 0.6486819982528687, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": -0.05427735298871994, + "eval_rewards/margins": 0.10654205083847046, + "eval_rewards/rejected": -0.160819411277771, + "eval_runtime": 385.2774, + "eval_samples_per_second": 5.191, + "eval_steps_per_second": 0.649, "step": 200 }, { "epoch": 0.05, - "grad_norm": 3.359375, + "grad_norm": 3.484375, "learning_rate": 2.741514360313316e-06, - "logits/chosen": -2.196761131286621, - "logits/rejected": -1.9796245098114014, - "logps/chosen": -267.26092529296875, - "logps/rejected": -249.635986328125, - "loss": 0.6334, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.05040832608938217, - "rewards/margins": 0.14044944941997528, - "rewards/rejected": -0.19085776805877686, + "logits/chosen": -2.197986602783203, + "logits/rejected": -1.9808934926986694, + "logps/chosen": -267.27685546875, + "logps/rejected": -249.9297637939453, + "loss": 0.6319, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.05056775361299515, + "rewards/margins": 0.14322780072689056, + "rewards/rejected": -0.1937955617904663, "step": 210 }, { "epoch": 0.06, - "grad_norm": 4.125, + "grad_norm": 5.09375, "learning_rate": 2.872062663185379e-06, - "logits/chosen": -2.0959529876708984, - "logits/rejected": -1.9805145263671875, - "logps/chosen": -265.8911437988281, - "logps/rejected": -252.39990234375, + "logits/chosen": -2.0990307331085205, + "logits/rejected": -1.983565330505371, + "logps/chosen": -270.3437194824219, + "logps/rejected": -256.6988525390625, "loss": 0.6401, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.11875394731760025, - "rewards/margins": 0.1290549784898758, - "rewards/rejected": -0.24780890345573425, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16327962279319763, + "rewards/margins": 0.12751872837543488, + "rewards/rejected": -0.2907983660697937, "step": 220 }, { "epoch": 0.06, - "grad_norm": 4.21875, + "grad_norm": 5.0625, "learning_rate": 3.0026109660574416e-06, - "logits/chosen": -2.239375591278076, - "logits/rejected": -2.0525119304656982, - "logps/chosen": -317.2463684082031, - "logps/rejected": -291.793212890625, - "loss": 0.657, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.11077094078063965, - "rewards/margins": 0.10038264095783234, - "rewards/rejected": -0.21115358173847198, + "logits/chosen": -2.2433676719665527, + "logits/rejected": -2.056224822998047, + "logps/chosen": -314.1068420410156, + "logps/rejected": -288.00250244140625, + "loss": 0.6589, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.07937607169151306, + "rewards/margins": 0.09387041628360748, + "rewards/rejected": -0.17324648797512054, "step": 230 }, { "epoch": 0.06, - "grad_norm": 2.78125, + "grad_norm": 5.6875, "learning_rate": 3.1331592689295043e-06, - "logits/chosen": -2.149392604827881, - "logits/rejected": -1.961692452430725, - "logps/chosen": -313.08331298828125, - "logps/rejected": -309.60198974609375, - "loss": 0.648, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.16783350706100464, - "rewards/margins": 0.11663570255041122, - "rewards/rejected": -0.28446921706199646, + "logits/chosen": -2.1602792739868164, + "logits/rejected": -1.9714686870574951, + "logps/chosen": -310.117919921875, + "logps/rejected": -308.3526916503906, + "loss": 0.6431, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13817985355854034, + "rewards/margins": 0.13379593193531036, + "rewards/rejected": -0.2719758152961731, "step": 240 }, { "epoch": 0.07, - "grad_norm": 2.9375, + "grad_norm": 5.4375, "learning_rate": 3.263707571801567e-06, - "logits/chosen": -2.109787702560425, - "logits/rejected": -2.0100176334381104, - "logps/chosen": -297.9364929199219, - "logps/rejected": -283.5945129394531, - "loss": 0.6475, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.38356590270996094, - "rewards/margins": 0.11117170751094818, - "rewards/rejected": -0.49473756551742554, + "logits/chosen": -2.129748821258545, + "logits/rejected": -2.028604030609131, + "logps/chosen": -282.7078552246094, + "logps/rejected": -272.08837890625, + "loss": 0.6361, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.23127944767475128, + "rewards/margins": 0.14839713275432587, + "rewards/rejected": -0.37967658042907715, "step": 250 }, { "epoch": 0.07, - "grad_norm": 3.125, + "grad_norm": 3.65625, "learning_rate": 3.3942558746736293e-06, - "logits/chosen": -2.1550464630126953, - "logits/rejected": -1.9509170055389404, - "logps/chosen": -313.568115234375, - "logps/rejected": -301.53759765625, - "loss": 0.654, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.4116322994232178, - "rewards/margins": 0.09985023736953735, - "rewards/rejected": -0.5114825367927551, + "logits/chosen": -2.183048725128174, + "logits/rejected": -1.9789161682128906, + "logps/chosen": -281.8155212402344, + "logps/rejected": -272.23956298828125, + "loss": 0.6437, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09410645067691803, + "rewards/margins": 0.12439638376235962, + "rewards/rejected": -0.21850283443927765, "step": 260 }, { "epoch": 0.07, - "grad_norm": 3.328125, + "grad_norm": 3.921875, "learning_rate": 3.524804177545692e-06, - "logits/chosen": -2.0624794960021973, - "logits/rejected": -1.9370944499969482, - "logps/chosen": -292.0105895996094, - "logps/rejected": -279.63177490234375, - "loss": 0.6173, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3133612275123596, - "rewards/margins": 0.190281480550766, - "rewards/rejected": -0.5036426782608032, + "logits/chosen": -2.083225965499878, + "logits/rejected": -1.9568647146224976, + "logps/chosen": -275.0286560058594, + "logps/rejected": -263.38140869140625, + "loss": 0.6139, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.14354154467582703, + "rewards/margins": 0.19759733974933624, + "rewards/rejected": -0.3411388695240021, "step": 270 }, { "epoch": 0.07, - "grad_norm": 4.6875, + "grad_norm": 4.8125, "learning_rate": 3.6553524804177547e-06, - "logits/chosen": -2.13100528717041, - "logits/rejected": -1.9591659307479858, - "logps/chosen": -289.3089904785156, - "logps/rejected": -282.8084411621094, - "loss": 0.641, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.3035983741283417, - "rewards/margins": 0.1496592015028, - "rewards/rejected": -0.45325756072998047, + "logits/chosen": -2.1453604698181152, + "logits/rejected": -1.9743705987930298, + "logps/chosen": -287.78057861328125, + "logps/rejected": -284.1526794433594, + "loss": 0.6277, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.28831422328948975, + "rewards/margins": 0.17838594317436218, + "rewards/rejected": -0.4667002260684967, "step": 280 }, { "epoch": 0.08, - "grad_norm": 4.875, + "grad_norm": 4.15625, "learning_rate": 3.7859007832898174e-06, - "logits/chosen": -2.057344675064087, - "logits/rejected": -1.9588420391082764, - "logps/chosen": -311.67510986328125, - "logps/rejected": -307.47015380859375, - "loss": 0.6204, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.2604138255119324, - "rewards/margins": 0.209051251411438, - "rewards/rejected": -0.46946510672569275, + "logits/chosen": -2.068016529083252, + "logits/rejected": -1.9705654382705688, + "logps/chosen": -315.2586364746094, + "logps/rejected": -313.2366027832031, + "loss": 0.6125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2962488532066345, + "rewards/margins": 0.2308805286884308, + "rewards/rejected": -0.5271294116973877, "step": 290 }, { "epoch": 0.08, - "grad_norm": 5.15625, + "grad_norm": 5.8125, "learning_rate": 3.9164490861618806e-06, - "logits/chosen": -2.0852608680725098, - "logits/rejected": -1.8826076984405518, - "logps/chosen": -279.9902038574219, - "logps/rejected": -289.1649169921875, - "loss": 0.6101, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.35862046480178833, - "rewards/margins": 0.2289685755968094, - "rewards/rejected": -0.5875889658927917, + "logits/chosen": -2.1018004417419434, + "logits/rejected": -1.8998439311981201, + "logps/chosen": -275.9500732421875, + "logps/rejected": -287.0372009277344, + "loss": 0.6042, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.31821924448013306, + "rewards/margins": 0.24809296429157257, + "rewards/rejected": -0.5663121938705444, "step": 300 }, { "epoch": 0.08, - "eval_logits/chosen": -2.0021328926086426, - "eval_logits/rejected": -1.8679821491241455, - "eval_logps/chosen": -298.72515869140625, - "eval_logps/rejected": -299.372802734375, - "eval_loss": 0.6216704249382019, - "eval_rewards/accuracies": 0.6769999861717224, - "eval_rewards/chosen": -0.3407318592071533, - "eval_rewards/margins": 0.20691031217575073, - "eval_rewards/rejected": -0.547642171382904, - "eval_runtime": 385.1163, - "eval_samples_per_second": 5.193, + "eval_logits/chosen": -2.0229153633117676, + "eval_logits/rejected": -1.889541745185852, + "eval_logps/chosen": -295.1513671875, + "eval_logps/rejected": -296.011474609375, + "eval_loss": 0.6216087937355042, + "eval_rewards/accuracies": 0.6729999780654907, + "eval_rewards/chosen": -0.30499377846717834, + "eval_rewards/margins": 0.2090347856283188, + "eval_rewards/rejected": -0.5140285491943359, + "eval_runtime": 385.3276, + "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 300 }, { "epoch": 0.08, - "grad_norm": 4.15625, + "grad_norm": 6.78125, "learning_rate": 4.046997389033943e-06, - "logits/chosen": -2.2290728092193604, - "logits/rejected": -2.029090642929077, - "logps/chosen": -319.5593566894531, - "logps/rejected": -296.0638732910156, - "loss": 0.5797, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.30653077363967896, - "rewards/margins": 0.3014482855796814, - "rewards/rejected": -0.6079790592193604, + "logits/chosen": -2.246411085128784, + "logits/rejected": -2.0464656352996826, + "logps/chosen": -320.37054443359375, + "logps/rejected": -296.6560363769531, + "loss": 0.5823, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.31464242935180664, + "rewards/margins": 0.29925835132598877, + "rewards/rejected": -0.6139007806777954, "step": 310 }, { "epoch": 0.08, - "grad_norm": 4.78125, + "grad_norm": 4.90625, "learning_rate": 4.177545691906005e-06, - "logits/chosen": -2.1043484210968018, - "logits/rejected": -1.9178568124771118, - "logps/chosen": -299.31610107421875, - "logps/rejected": -301.1820068359375, - "loss": 0.6348, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.41052132844924927, - "rewards/margins": 0.18894672393798828, - "rewards/rejected": -0.5994681119918823, + "logits/chosen": -2.1202454566955566, + "logits/rejected": -1.933571457862854, + "logps/chosen": -300.3293151855469, + "logps/rejected": -303.07177734375, + "loss": 0.6333, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.42065340280532837, + "rewards/margins": 0.19771243631839752, + "rewards/rejected": -0.6183657646179199, "step": 320 }, { "epoch": 0.09, - "grad_norm": 4.625, + "grad_norm": 5.65625, "learning_rate": 4.308093994778068e-06, - "logits/chosen": -2.0332717895507812, - "logits/rejected": -1.891761064529419, - "logps/chosen": -292.5372619628906, - "logps/rejected": -289.52130126953125, - "loss": 0.6187, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.3294296860694885, - "rewards/margins": 0.21162664890289307, - "rewards/rejected": -0.5410563945770264, + "logits/chosen": -2.0555598735809326, + "logits/rejected": -1.9103734493255615, + "logps/chosen": -286.83306884765625, + "logps/rejected": -285.3974609375, + "loss": 0.6152, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2723875939846039, + "rewards/margins": 0.22743086516857147, + "rewards/rejected": -0.49981847405433655, "step": 330 }, { "epoch": 0.09, - "grad_norm": 3.875, + "grad_norm": 3.6875, "learning_rate": 4.4386422976501306e-06, - "logits/chosen": -2.096933364868164, - "logits/rejected": -1.9842134714126587, - "logps/chosen": -318.83001708984375, - "logps/rejected": -326.63128662109375, - "loss": 0.5927, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.29144176840782166, - "rewards/margins": 0.2920153737068176, - "rewards/rejected": -0.5834571719169617, + "logits/chosen": -2.1098897457122803, + "logits/rejected": -1.996603012084961, + "logps/chosen": -339.12225341796875, + "logps/rejected": -342.5606994628906, + "loss": 0.611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.49436426162719727, + "rewards/margins": 0.24838733673095703, + "rewards/rejected": -0.7427516579627991, "step": 340 }, { "epoch": 0.09, - "grad_norm": 5.625, + "grad_norm": 4.71875, "learning_rate": 4.569190600522193e-06, - "logits/chosen": -2.0155937671661377, - "logits/rejected": -1.8702361583709717, - "logps/chosen": -324.283447265625, - "logps/rejected": -333.35772705078125, - "loss": 0.6323, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.3502510190010071, - "rewards/margins": 0.20564763247966766, - "rewards/rejected": -0.5558986067771912, + "logits/chosen": -2.0351061820983887, + "logits/rejected": -1.8878052234649658, + "logps/chosen": -342.15667724609375, + "logps/rejected": -348.20281982421875, + "loss": 0.6434, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5289834141731262, + "rewards/margins": 0.1753660887479782, + "rewards/rejected": -0.7043493986129761, "step": 350 }, { "epoch": 0.09, - "grad_norm": 5.84375, + "grad_norm": 3.84375, "learning_rate": 4.699738903394257e-06, - "logits/chosen": -1.9730371236801147, - "logits/rejected": -1.927020788192749, - "logps/chosen": -289.91351318359375, - "logps/rejected": -299.1167907714844, - "loss": 0.5914, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.4892953038215637, - "rewards/margins": 0.3004259467124939, - "rewards/rejected": -0.7897213697433472, + "logits/chosen": -2.014333963394165, + "logits/rejected": -1.9689722061157227, + "logps/chosen": -274.50213623046875, + "logps/rejected": -278.16351318359375, + "loss": 0.6081, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.33518165349960327, + "rewards/margins": 0.24500660598278046, + "rewards/rejected": -0.5801882743835449, "step": 360 }, { "epoch": 0.1, - "grad_norm": 6.0625, + "grad_norm": 5.09375, "learning_rate": 4.8302872062663196e-06, - "logits/chosen": -2.0422720909118652, - "logits/rejected": -1.8679052591323853, - "logps/chosen": -364.3427429199219, - "logps/rejected": -349.3939514160156, - "loss": 0.608, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.8744655847549438, - "rewards/margins": 0.2888025939464569, - "rewards/rejected": -1.1632683277130127, + "logits/chosen": -2.1054439544677734, + "logits/rejected": -1.9295707941055298, + "logps/chosen": -315.6613464355469, + "logps/rejected": -300.81231689453125, + "loss": 0.5976, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.38765162229537964, + "rewards/margins": 0.2898003160953522, + "rewards/rejected": -0.6774519681930542, "step": 370 }, { "epoch": 0.1, - "grad_norm": 4.375, + "grad_norm": 6.375, "learning_rate": 4.9608355091383814e-06, - "logits/chosen": -1.9862794876098633, - "logits/rejected": -1.7763971090316772, - "logps/chosen": -339.8470153808594, - "logps/rejected": -339.43743896484375, - "loss": 0.5892, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.41790348291397095, - "rewards/margins": 0.3297092318534851, - "rewards/rejected": -0.7476127743721008, + "logits/chosen": -2.0662331581115723, + "logits/rejected": -1.8568542003631592, + "logps/chosen": -335.3840637207031, + "logps/rejected": -334.6043395996094, + "loss": 0.5885, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.373274028301239, + "rewards/margins": 0.3260083794593811, + "rewards/rejected": -0.6992824077606201, "step": 380 }, { "epoch": 0.1, - "grad_norm": 5.78125, + "grad_norm": 5.53125, "learning_rate": 4.9999488562447675e-06, - "logits/chosen": -1.9420530796051025, - "logits/rejected": -1.8238258361816406, - "logps/chosen": -318.894775390625, - "logps/rejected": -334.8386535644531, - "loss": 0.5721, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.3784860074520111, - "rewards/margins": 0.35959383845329285, - "rewards/rejected": -0.738079845905304, + "logits/chosen": -2.0750114917755127, + "logits/rejected": -1.9580342769622803, + "logps/chosen": -320.772705078125, + "logps/rejected": -333.44476318359375, + "loss": 0.5855, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.39726486802101135, + "rewards/margins": 0.3268759846687317, + "rewards/rejected": -0.7241408228874207, "step": 390 }, { "epoch": 0.1, - "grad_norm": 6.90625, + "grad_norm": 5.3125, "learning_rate": 4.999698361256577e-06, - "logits/chosen": -1.9580987691879272, - "logits/rejected": -1.7359654903411865, - "logps/chosen": -302.38946533203125, - "logps/rejected": -285.8229675292969, - "loss": 0.6173, + "logits/chosen": -2.0969738960266113, + "logits/rejected": -1.8604263067245483, + "logps/chosen": -311.9226989746094, + "logps/rejected": -294.60662841796875, + "loss": 0.6218, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.36912912130355835, - "rewards/margins": 0.24084654450416565, - "rewards/rejected": -0.6099756956100464, + "rewards/chosen": -0.4644620418548584, + "rewards/margins": 0.23335090279579163, + "rewards/rejected": -0.6978129148483276, "step": 400 }, { "epoch": 0.1, - "eval_logits/chosen": -1.7878212928771973, - "eval_logits/rejected": -1.658238410949707, - "eval_logps/chosen": -314.925048828125, - "eval_logps/rejected": -327.9222412109375, - "eval_loss": 0.5952155590057373, - "eval_rewards/accuracies": 0.6834999918937683, - "eval_rewards/chosen": -0.5027302503585815, - "eval_rewards/margins": 0.33040642738342285, - "eval_rewards/rejected": -0.8331366777420044, - "eval_runtime": 384.9418, - "eval_samples_per_second": 5.196, + "eval_logits/chosen": -1.9431427717208862, + "eval_logits/rejected": -1.8155378103256226, + "eval_logps/chosen": -326.5406799316406, + "eval_logps/rejected": -340.4455261230469, + "eval_loss": 0.5939911007881165, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": -0.6188870072364807, + "eval_rewards/margins": 0.3394821286201477, + "eval_rewards/rejected": -0.9583691358566284, + "eval_runtime": 385.2303, + "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 400 }, { "epoch": 0.11, - "grad_norm": 5.28125, + "grad_norm": 5.59375, "learning_rate": 4.999239142174581e-06, - "logits/chosen": -1.791033148765564, - "logits/rejected": -1.727502465248108, - "logps/chosen": -305.58447265625, - "logps/rejected": -327.44293212890625, - "loss": 0.631, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6326209306716919, - "rewards/margins": 0.2363467961549759, - "rewards/rejected": -0.8689676523208618, + "logits/chosen": -1.9562289714813232, + "logits/rejected": -1.8964239358901978, + "logps/chosen": -315.13616943359375, + "logps/rejected": -334.50677490234375, + "loss": 0.6431, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7281379699707031, + "rewards/margins": 0.2114681750535965, + "rewards/rejected": -0.9396060705184937, "step": 410 }, { "epoch": 0.11, - "grad_norm": 6.09375, + "grad_norm": 6.28125, "learning_rate": 4.99857123734344e-06, - "logits/chosen": -1.758462905883789, - "logits/rejected": -1.625067114830017, - "logps/chosen": -289.7398376464844, - "logps/rejected": -331.71600341796875, - "loss": 0.5301, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.6094896197319031, - "rewards/margins": 0.51044100522995, - "rewards/rejected": -1.119930624961853, + "logits/chosen": -1.9491183757781982, + "logits/rejected": -1.8290717601776123, + "logps/chosen": -280.4700622558594, + "logps/rejected": -309.1809997558594, + "loss": 0.5735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5167919397354126, + "rewards/margins": 0.3777889609336853, + "rewards/rejected": -0.8945809602737427, "step": 420 }, { "epoch": 0.11, - "grad_norm": 8.1875, + "grad_norm": 9.5, "learning_rate": 4.997694702533016e-06, - "logits/chosen": -1.7083183526992798, - "logits/rejected": -1.6363675594329834, - "logps/chosen": -356.60418701171875, - "logps/rejected": -385.64410400390625, - "loss": 0.5625, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.790561318397522, - "rewards/margins": 0.5265924334526062, - "rewards/rejected": -1.317153811454773, + "logits/chosen": -1.9259364604949951, + "logits/rejected": -1.8644450902938843, + "logps/chosen": -345.35797119140625, + "logps/rejected": -365.54449462890625, + "loss": 0.5722, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6780990362167358, + "rewards/margins": 0.4380587637424469, + "rewards/rejected": -1.1161577701568604, "step": 430 }, { "epoch": 0.12, - "grad_norm": 7.09375, + "grad_norm": 7.59375, "learning_rate": 4.996609610933713e-06, - "logits/chosen": -1.8173853158950806, - "logits/rejected": -1.73589289188385, - "logps/chosen": -327.7349548339844, - "logps/rejected": -338.24334716796875, - "loss": 0.5833, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5558608770370483, - "rewards/margins": 0.3886395990848541, - "rewards/rejected": -0.9445004463195801, + "logits/chosen": -2.0121302604675293, + "logits/rejected": -1.9294341802597046, + "logps/chosen": -349.0380554199219, + "logps/rejected": -362.43768310546875, + "loss": 0.5912, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7688915729522705, + "rewards/margins": 0.4175523817539215, + "rewards/rejected": -1.1864439249038696, "step": 440 }, { "epoch": 0.12, - "grad_norm": 7.4375, + "grad_norm": 5.46875, "learning_rate": 4.995316053150366e-06, - "logits/chosen": -1.674541711807251, - "logits/rejected": -1.5631834268569946, - "logps/chosen": -329.3605651855469, - "logps/rejected": -351.2587890625, - "loss": 0.556, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.6152701377868652, - "rewards/margins": 0.4492935538291931, - "rewards/rejected": -1.0645637512207031, + "logits/chosen": -1.889850378036499, + "logits/rejected": -1.7697973251342773, + "logps/chosen": -332.23077392578125, + "logps/rejected": -353.26593017578125, + "loss": 0.5642, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.643971860408783, + "rewards/margins": 0.4406636357307434, + "rewards/rejected": -1.0846354961395264, "step": 450 }, { "epoch": 0.12, - "grad_norm": 12.5, + "grad_norm": 9.875, "learning_rate": 4.9938141371946815e-06, - "logits/chosen": -1.616389513015747, - "logits/rejected": -1.5250511169433594, - "logps/chosen": -378.802978515625, - "logps/rejected": -416.6460876464844, - "loss": 0.5634, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.1370322704315186, - "rewards/margins": 0.523708701133728, - "rewards/rejected": -1.6607410907745361, + "logits/chosen": -1.8695566654205322, + "logits/rejected": -1.7812881469726562, + "logps/chosen": -366.63818359375, + "logps/rejected": -409.49755859375, + "loss": 0.5388, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0153841972351074, + "rewards/margins": 0.5738715529441833, + "rewards/rejected": -1.589255690574646, "step": 460 }, { "epoch": 0.12, - "grad_norm": 6.625, + "grad_norm": 7.46875, "learning_rate": 4.992103988476206e-06, - "logits/chosen": -1.6163583993911743, - "logits/rejected": -1.4710776805877686, - "logps/chosen": -357.6758728027344, - "logps/rejected": -395.6207580566406, - "loss": 0.5805, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1385869979858398, - "rewards/margins": 0.504925012588501, - "rewards/rejected": -1.6435120105743408, + "logits/chosen": -1.8687667846679688, + "logits/rejected": -1.7270047664642334, + "logps/chosen": -376.8227844238281, + "logps/rejected": -413.8404846191406, + "loss": 0.5719, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3300559520721436, + "rewards/margins": 0.49565353989601135, + "rewards/rejected": -1.825709581375122, "step": 470 }, { "epoch": 0.13, - "grad_norm": 5.71875, + "grad_norm": 6.0625, "learning_rate": 4.990185749791866e-06, - "logits/chosen": -1.6142799854278564, - "logits/rejected": -1.484505534172058, - "logps/chosen": -354.9118957519531, - "logps/rejected": -412.96539306640625, - "loss": 0.542, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.980329155921936, - "rewards/margins": 0.5626578330993652, - "rewards/rejected": -1.5429868698120117, + "logits/chosen": -1.8790470361709595, + "logits/rejected": -1.7465674877166748, + "logps/chosen": -361.17974853515625, + "logps/rejected": -419.521484375, + "loss": 0.5472, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0430080890655518, + "rewards/margins": 0.5655397176742554, + "rewards/rejected": -1.6085479259490967, "step": 480 }, { "epoch": 0.13, - "grad_norm": 8.25, + "grad_norm": 6.53125, "learning_rate": 4.9880595813140395e-06, - "logits/chosen": -1.6605031490325928, - "logits/rejected": -1.519315242767334, - "logps/chosen": -388.9188537597656, - "logps/rejected": -418.9161071777344, - "loss": 0.5346, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0402735471725464, - "rewards/margins": 0.603448212146759, - "rewards/rejected": -1.6437218189239502, + "logits/chosen": -1.923179268836975, + "logits/rejected": -1.7839629650115967, + "logps/chosen": -394.8546142578125, + "logps/rejected": -421.29730224609375, + "loss": 0.5317, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0996313095092773, + "rewards/margins": 0.5679025053977966, + "rewards/rejected": -1.6675338745117188, "step": 490 }, { "epoch": 0.13, - "grad_norm": 6.21875, + "grad_norm": 6.25, "learning_rate": 4.985725660577184e-06, - "logits/chosen": -1.6577751636505127, - "logits/rejected": -1.5049307346343994, - "logps/chosen": -385.02813720703125, - "logps/rejected": -409.57379150390625, - "loss": 0.5435, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.11268150806427, - "rewards/margins": 0.6577950119972229, - "rewards/rejected": -1.7704765796661377, + "logits/chosen": -1.887112021446228, + "logits/rejected": -1.7504981756210327, + "logps/chosen": -411.74951171875, + "logps/rejected": -424.2745666503906, + "loss": 0.5674, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3798956871032715, + "rewards/margins": 0.5375889539718628, + "rewards/rejected": -1.9174845218658447, "step": 500 }, { "epoch": 0.13, - "eval_logits/chosen": -1.5543876886367798, - "eval_logits/rejected": -1.427258014678955, - "eval_logps/chosen": -376.1609191894531, - "eval_logps/rejected": -405.3195495605469, - "eval_loss": 0.5753844976425171, - "eval_rewards/accuracies": 0.6890000104904175, - "eval_rewards/chosen": -1.115088939666748, - "eval_rewards/margins": 0.4920206665992737, - "eval_rewards/rejected": -1.607109785079956, - "eval_runtime": 388.587, - "eval_samples_per_second": 5.147, - "eval_steps_per_second": 0.643, + "eval_logits/chosen": -1.7892649173736572, + "eval_logits/rejected": -1.6636674404144287, + "eval_logps/chosen": -421.9456787109375, + "eval_logps/rejected": -449.8769836425781, + "eval_loss": 0.5779695510864258, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -1.572936773300171, + "eval_rewards/margins": 0.4797472655773163, + "eval_rewards/rejected": -2.0526838302612305, + "eval_runtime": 385.3091, + "eval_samples_per_second": 5.191, + "eval_steps_per_second": 0.649, "step": 500 }, { "epoch": 0.13, - "grad_norm": 6.5625, + "grad_norm": 5.40625, "learning_rate": 4.983184182463009e-06, - "logits/chosen": -1.6563122272491455, - "logits/rejected": -1.5474936962127686, - "logps/chosen": -371.3856201171875, - "logps/rejected": -394.072021484375, - "loss": 0.5478, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9312448501586914, - "rewards/margins": 0.5972838401794434, - "rewards/rejected": -1.5285285711288452, + "logits/chosen": -1.853735327720642, + "logits/rejected": -1.7524950504302979, + "logps/chosen": -404.90545654296875, + "logps/rejected": -425.74676513671875, + "loss": 0.5607, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.266443133354187, + "rewards/margins": 0.5788331031799316, + "rewards/rejected": -1.8452762365341187, "step": 510 }, { "epoch": 0.14, - "grad_norm": 10.1875, + "grad_norm": 7.3125, "learning_rate": 4.980435359184203e-06, - "logits/chosen": -1.6844135522842407, - "logits/rejected": -1.6247421503067017, - "logps/chosen": -349.91943359375, - "logps/rejected": -377.5780029296875, - "loss": 0.5953, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.768576443195343, - "rewards/margins": 0.43869537115097046, - "rewards/rejected": -1.2072718143463135, + "logits/chosen": -1.9005975723266602, + "logits/rejected": -1.8376613855361938, + "logps/chosen": -341.048828125, + "logps/rejected": -359.40496826171875, + "loss": 0.6122, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6798708438873291, + "rewards/margins": 0.3456707298755646, + "rewards/rejected": -1.0255415439605713, "step": 520 }, { "epoch": 0.14, - "grad_norm": 10.75, + "grad_norm": 10.875, "learning_rate": 4.9774794202667236e-06, - "logits/chosen": -1.68100106716156, - "logits/rejected": -1.63496994972229, - "logps/chosen": -316.89166259765625, - "logps/rejected": -371.69012451171875, - "loss": 0.577, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.5579014420509338, - "rewards/margins": 0.4634733200073242, - "rewards/rejected": -1.0213747024536133, + "logits/chosen": -1.8874883651733398, + "logits/rejected": -1.8308721780776978, + "logps/chosen": -315.84173583984375, + "logps/rejected": -365.2502746582031, + "loss": 0.5734, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5474014282226562, + "rewards/margins": 0.40957459807395935, + "rewards/rejected": -0.9569761157035828, "step": 530 }, { "epoch": 0.14, - "grad_norm": 10.9375, + "grad_norm": 8.5, "learning_rate": 4.974316612530615e-06, - "logits/chosen": -1.5825644731521606, - "logits/rejected": -1.4188969135284424, - "logps/chosen": -361.13397216796875, - "logps/rejected": -398.6629638671875, - "loss": 0.4772, + "logits/chosen": -1.8144280910491943, + "logits/rejected": -1.657810926437378, + "logps/chosen": -369.9844665527344, + "logps/rejected": -390.8047180175781, + "loss": 0.5011, "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.8265112042427063, - "rewards/margins": 0.8431331515312195, - "rewards/rejected": -1.6696443557739258, + "rewards/chosen": -0.9150163531303406, + "rewards/margins": 0.676045298576355, + "rewards/rejected": -1.5910617113113403, "step": 540 }, { "epoch": 0.14, - "grad_norm": 15.25, + "grad_norm": 12.625, "learning_rate": 4.970947200069416e-06, - "logits/chosen": -1.460860252380371, - "logits/rejected": -1.3948581218719482, - "logps/chosen": -469.55615234375, - "logps/rejected": -493.6904296875, - "loss": 0.6643, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.9104303121566772, - "rewards/margins": 0.43918126821517944, - "rewards/rejected": -2.349611759185791, + "logits/chosen": -1.7606821060180664, + "logits/rejected": -1.7015259265899658, + "logps/chosen": -427.96990966796875, + "logps/rejected": -451.92205810546875, + "loss": 0.6311, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.494568943977356, + "rewards/margins": 0.4373590350151062, + "rewards/rejected": -1.931928038597107, "step": 550 }, { "epoch": 0.15, - "grad_norm": 7.6875, + "grad_norm": 7.78125, "learning_rate": 4.967371464228096e-06, - "logits/chosen": -1.6625244617462158, - "logits/rejected": -1.5688579082489014, - "logps/chosen": -452.31256103515625, - "logps/rejected": -504.4009704589844, - "loss": 0.57, + "logits/chosen": -1.9176502227783203, + "logits/rejected": -1.832397699356079, + "logps/chosen": -372.6578369140625, + "logps/rejected": -429.7704162597656, + "loss": 0.5482, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.948133111000061, - "rewards/margins": 0.5168659687042236, - "rewards/rejected": -2.464999198913574, + "rewards/chosen": -1.1515864133834839, + "rewards/margins": 0.567107081413269, + "rewards/rejected": -1.718693494796753, "step": 560 }, { "epoch": 0.15, - "grad_norm": 9.5, + "grad_norm": 5.53125, "learning_rate": 4.963589703579569e-06, - "logits/chosen": -1.7668651342391968, - "logits/rejected": -1.6291354894638062, - "logps/chosen": -480.91107177734375, - "logps/rejected": -497.85650634765625, - "loss": 0.6004, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.8087114095687866, - "rewards/margins": 0.5162175297737122, - "rewards/rejected": -2.3249289989471436, + "logits/chosen": -1.9988332986831665, + "logits/rejected": -1.8672618865966797, + "logps/chosen": -407.62664794921875, + "logps/rejected": -419.98291015625, + "loss": 0.5754, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.075867772102356, + "rewards/margins": 0.47032594680786133, + "rewards/rejected": -1.5461935997009277, "step": 570 }, { "epoch": 0.15, - "grad_norm": 8.4375, + "grad_norm": 8.5, "learning_rate": 4.9596022338997615e-06, - "logits/chosen": -1.7466462850570679, - "logits/rejected": -1.5261166095733643, - "logps/chosen": -454.7201232910156, - "logps/rejected": -477.67987060546875, - "loss": 0.5409, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.569333791732788, - "rewards/margins": 0.6403428316116333, - "rewards/rejected": -2.209676504135132, + "logits/chosen": -1.9790706634521484, + "logits/rejected": -1.7595863342285156, + "logps/chosen": -397.14752197265625, + "logps/rejected": -413.5733337402344, + "loss": 0.5495, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9936078190803528, + "rewards/margins": 0.5750035047531128, + "rewards/rejected": -1.5686112642288208, "step": 580 }, { "epoch": 0.15, - "grad_norm": 5.8125, + "grad_norm": 7.9375, "learning_rate": 4.955409388141243e-06, - "logits/chosen": -1.620123267173767, - "logits/rejected": -1.504206895828247, - "logps/chosen": -378.0823669433594, - "logps/rejected": -403.84649658203125, - "loss": 0.5869, + "logits/chosen": -1.8258365392684937, + "logits/rejected": -1.7129390239715576, + "logps/chosen": -363.6575622558594, + "logps/rejected": -387.19378662109375, + "loss": 0.6003, "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2078220844268799, - "rewards/margins": 0.49430006742477417, - "rewards/rejected": -1.7021223306655884, + "rewards/chosen": -1.0635744333267212, + "rewards/margins": 0.4720209240913391, + "rewards/rejected": -1.5355952978134155, "step": 590 }, { "epoch": 0.16, - "grad_norm": 4.5, + "grad_norm": 4.84375, "learning_rate": 4.951011516405429e-06, - "logits/chosen": -1.7101682424545288, - "logits/rejected": -1.6404016017913818, - "logps/chosen": -328.59051513671875, - "logps/rejected": -365.19866943359375, - "loss": 0.5547, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7783375978469849, - "rewards/margins": 0.5154664516448975, - "rewards/rejected": -1.2938039302825928, + "logits/chosen": -1.8798444271087646, + "logits/rejected": -1.8100011348724365, + "logps/chosen": -338.61151123046875, + "logps/rejected": -374.54974365234375, + "loss": 0.5632, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8785476684570312, + "rewards/margins": 0.5087668895721436, + "rewards/rejected": -1.3873146772384644, "step": 600 }, { "epoch": 0.16, - "eval_logits/chosen": -1.5725802183151245, - "eval_logits/rejected": -1.439643383026123, - "eval_logps/chosen": -340.6527404785156, - "eval_logps/rejected": -371.2197570800781, - "eval_loss": 0.5694642066955566, - "eval_rewards/accuracies": 0.6984999775886536, - "eval_rewards/chosen": -0.7600072622299194, - "eval_rewards/margins": 0.5061042904853821, - "eval_rewards/rejected": -1.2661116123199463, - "eval_runtime": 384.8138, - "eval_samples_per_second": 5.197, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.778578281402588, + "eval_logits/rejected": -1.6489102840423584, + "eval_logps/chosen": -342.7493896484375, + "eval_logps/rejected": -372.69134521484375, + "eval_loss": 0.5649436712265015, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -0.7809735536575317, + "eval_rewards/margins": 0.49985405802726746, + "eval_rewards/rejected": -1.2808276414871216, + "eval_runtime": 385.3125, + "eval_samples_per_second": 5.191, + "eval_steps_per_second": 0.649, "step": 600 }, { "epoch": 0.16, - "grad_norm": 8.375, + "grad_norm": 6.71875, "learning_rate": 4.946408985913344e-06, - "logits/chosen": -1.6002981662750244, - "logits/rejected": -1.5132791996002197, - "logps/chosen": -320.1644592285156, - "logps/rejected": -367.06707763671875, - "loss": 0.5344, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7430071830749512, - "rewards/margins": 0.6466056108474731, - "rewards/rejected": -1.3896129131317139, + "logits/chosen": -1.8086153268814087, + "logits/rejected": -1.7312501668930054, + "logps/chosen": -321.55279541015625, + "logps/rejected": -367.79229736328125, + "loss": 0.5218, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7568905353546143, + "rewards/margins": 0.6399748921394348, + "rewards/rejected": -1.3968654870986938, "step": 610 }, { "epoch": 0.16, - "grad_norm": 11.5, + "grad_norm": 16.25, "learning_rate": 4.941602180974958e-06, - "logits/chosen": -1.5505564212799072, - "logits/rejected": -1.3113032579421997, - "logps/chosen": -394.12786865234375, - "logps/rejected": -412.055908203125, - "loss": 0.5398, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.0630056858062744, - "rewards/margins": 0.7659745216369629, - "rewards/rejected": -1.8289800882339478, + "logits/chosen": -1.833062767982483, + "logits/rejected": -1.5977442264556885, + "logps/chosen": -380.17169189453125, + "logps/rejected": -390.75848388671875, + "loss": 0.524, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9234441518783569, + "rewards/margins": 0.6925610303878784, + "rewards/rejected": -1.616005301475525, "step": 620 }, { "epoch": 0.16, - "grad_norm": 4.96875, + "grad_norm": 6.71875, "learning_rate": 4.936591502957101e-06, - "logits/chosen": -1.4749863147735596, - "logits/rejected": -1.315354347229004, - "logps/chosen": -395.21185302734375, - "logps/rejected": -471.55157470703125, - "loss": 0.5375, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.470820665359497, - "rewards/margins": 0.8663395643234253, - "rewards/rejected": -2.337160587310791, + "logits/chosen": -1.813197374343872, + "logits/rejected": -1.6430933475494385, + "logps/chosen": -355.9547424316406, + "logps/rejected": -418.7765197753906, + "loss": 0.5344, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.078249216079712, + "rewards/margins": 0.7311606407165527, + "rewards/rejected": -1.809409737586975, "step": 630 }, { "epoch": 0.17, - "grad_norm": 8.6875, + "grad_norm": 7.0, "learning_rate": 4.931377370249946e-06, - "logits/chosen": -1.4970736503601074, - "logits/rejected": -1.2784922122955322, - "logps/chosen": -469.60565185546875, - "logps/rejected": -513.3096923828125, - "loss": 0.5511, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0428223609924316, - "rewards/margins": 0.6945194602012634, - "rewards/rejected": -2.73734188079834, + "logits/chosen": -1.8197021484375, + "logits/rejected": -1.5834531784057617, + "logps/chosen": -435.12738037109375, + "logps/rejected": -468.70501708984375, + "loss": 0.5641, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6980397701263428, + "rewards/margins": 0.593255341053009, + "rewards/rejected": -2.291295289993286, "step": 640 }, { "epoch": 0.17, - "grad_norm": 12.1875, + "grad_norm": 10.0, "learning_rate": 4.925960218232073e-06, - "logits/chosen": -1.5300517082214355, - "logits/rejected": -1.4055113792419434, - "logps/chosen": -411.9557189941406, - "logps/rejected": -478.2559509277344, - "loss": 0.5422, + "logits/chosen": -1.7958835363388062, + "logits/rejected": -1.6748111248016357, + "logps/chosen": -392.5576171875, + "logps/rejected": -455.75811767578125, + "loss": 0.5384, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5665361881256104, - "rewards/margins": 0.7580282092094421, - "rewards/rejected": -2.324564218521118, + "rewards/chosen": -1.372554898262024, + "rewards/margins": 0.7270306348800659, + "rewards/rejected": -2.09958553314209, "step": 650 }, { "epoch": 0.17, - "grad_norm": 8.9375, + "grad_norm": 8.1875, "learning_rate": 4.920340499234116e-06, - "logits/chosen": -1.5464222431182861, - "logits/rejected": -1.322009563446045, - "logps/chosen": -396.75299072265625, - "logps/rejected": -416.83868408203125, - "loss": 0.5751, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.307492733001709, - "rewards/margins": 0.5529674291610718, - "rewards/rejected": -1.8604600429534912, + "logits/chosen": -1.7571017742156982, + "logits/rejected": -1.5184545516967773, + "logps/chosen": -403.0295715332031, + "logps/rejected": -419.2205505371094, + "loss": 0.5787, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3702582120895386, + "rewards/margins": 0.5140202641487122, + "rewards/rejected": -1.884278655052185, "step": 660 }, { "epoch": 0.18, - "grad_norm": 6.875, + "grad_norm": 6.53125, "learning_rate": 4.914518682500995e-06, - "logits/chosen": -1.7627025842666626, - "logits/rejected": -1.554469347000122, - "logps/chosen": -392.9539794921875, - "logps/rejected": -419.94952392578125, - "loss": 0.529, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0941137075424194, - "rewards/margins": 0.6771708726882935, - "rewards/rejected": -1.7712844610214233, + "logits/chosen": -1.9124584197998047, + "logits/rejected": -1.694361925125122, + "logps/chosen": -436.59747314453125, + "logps/rejected": -460.3738708496094, + "loss": 0.5391, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.530548334121704, + "rewards/margins": 0.6449794769287109, + "rewards/rejected": -2.175528049468994, "step": 670 }, { "epoch": 0.18, - "grad_norm": 7.3125, + "grad_norm": 9.5625, "learning_rate": 4.9084952541527315e-06, - "logits/chosen": -1.6612813472747803, - "logits/rejected": -1.459668755531311, - "logps/chosen": -392.9391784667969, - "logps/rejected": -423.2802734375, - "loss": 0.5137, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1960499286651611, - "rewards/margins": 0.774462878704071, - "rewards/rejected": -1.9705129861831665, + "logits/chosen": -1.7815015316009521, + "logits/rejected": -1.5756428241729736, + "logps/chosen": -448.1412658691406, + "logps/rejected": -469.43603515625, + "loss": 0.5139, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7480707168579102, + "rewards/margins": 0.6839998364448547, + "rewards/rejected": -2.432070255279541, "step": 680 }, { "epoch": 0.18, - "grad_norm": 9.75, + "grad_norm": 9.3125, "learning_rate": 4.902270717143858e-06, - "logits/chosen": -1.5917980670928955, - "logits/rejected": -1.4846012592315674, - "logps/chosen": -386.42498779296875, - "logps/rejected": -509.9639587402344, - "loss": 0.4446, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.5019272565841675, - "rewards/margins": 1.1186031103134155, - "rewards/rejected": -2.620530605316162, + "logits/chosen": -1.7120873928070068, + "logits/rejected": -1.6082136631011963, + "logps/chosen": -419.0126037597656, + "logps/rejected": -534.6773681640625, + "loss": 0.4522, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.827803611755371, + "rewards/margins": 1.0398612022399902, + "rewards/rejected": -2.8676648139953613, "step": 690 }, { "epoch": 0.18, - "grad_norm": 10.375, + "grad_norm": 6.8125, "learning_rate": 4.895845591221427e-06, - "logits/chosen": -1.5352580547332764, - "logits/rejected": -1.4566528797149658, - "logps/chosen": -440.7987365722656, - "logps/rejected": -521.3024291992188, - "loss": 0.5282, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.8748830556869507, - "rewards/margins": 0.8497939109802246, - "rewards/rejected": -2.7246768474578857, + "logits/chosen": -1.676849365234375, + "logits/rejected": -1.601438283920288, + "logps/chosen": -455.9642639160156, + "logps/rejected": -528.1475219726562, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0265378952026367, + "rewards/margins": 0.7665891647338867, + "rewards/rejected": -2.7931270599365234, "step": 700 }, { "epoch": 0.18, - "eval_logits/chosen": -1.3803926706314087, - "eval_logits/rejected": -1.2514902353286743, - "eval_logps/chosen": -470.9231262207031, - "eval_logps/rejected": -536.3329467773438, - "eval_loss": 0.5560212135314941, - "eval_rewards/accuracies": 0.7164999842643738, - "eval_rewards/chosen": -2.062711000442505, - "eval_rewards/margins": 0.8545322418212891, - "eval_rewards/rejected": -2.917243242263794, - "eval_runtime": 385.2779, - "eval_samples_per_second": 5.191, + "eval_logits/chosen": -1.5919249057769775, + "eval_logits/rejected": -1.469058632850647, + "eval_logps/chosen": -455.5274963378906, + "eval_logps/rejected": -512.6751098632812, + "eval_loss": 0.560720682144165, + "eval_rewards/accuracies": 0.7059999704360962, + "eval_rewards/chosen": -1.9087554216384888, + "eval_rewards/margins": 0.7719098925590515, + "eval_rewards/rejected": -2.6806650161743164, + "eval_runtime": 385.1228, + "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 700 }, { "epoch": 0.19, - "grad_norm": 11.0, + "grad_norm": 9.5, "learning_rate": 4.8892204128816e-06, - "logits/chosen": -1.5460926294326782, - "logits/rejected": -1.4308459758758545, - "logps/chosen": -454.60797119140625, - "logps/rejected": -516.79296875, - "loss": 0.5244, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8704407215118408, - "rewards/margins": 0.7577110528945923, - "rewards/rejected": -2.6281516551971436, + "logits/chosen": -1.7319362163543701, + "logits/rejected": -1.619175672531128, + "logps/chosen": -431.63232421875, + "logps/rejected": -489.86297607421875, + "loss": 0.5277, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6406848430633545, + "rewards/margins": 0.7181671857833862, + "rewards/rejected": -2.358851909637451, "step": 710 }, { "epoch": 0.19, - "grad_norm": 7.78125, + "grad_norm": 7.28125, "learning_rate": 4.882395735324864e-06, - "logits/chosen": -1.5478827953338623, - "logits/rejected": -1.4022762775421143, - "logps/chosen": -418.3267517089844, - "logps/rejected": -491.538818359375, - "loss": 0.5, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.5224473476409912, - "rewards/margins": 0.9039508104324341, - "rewards/rejected": -2.4263980388641357, + "logits/chosen": -1.6986335515975952, + "logits/rejected": -1.5594747066497803, + "logps/chosen": -427.96978759765625, + "logps/rejected": -497.16766357421875, + "loss": 0.4996, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6188774108886719, + "rewards/margins": 0.8638092875480652, + "rewards/rejected": -2.4826865196228027, "step": 720 }, { "epoch": 0.19, - "grad_norm": 7.28125, + "grad_norm": 7.25, "learning_rate": 4.87537212840983e-06, - "logits/chosen": -1.5722401142120361, - "logits/rejected": -1.4339630603790283, - "logps/chosen": -399.0245056152344, - "logps/rejected": -433.6438903808594, - "loss": 0.5526, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3276931047439575, - "rewards/margins": 0.6516003608703613, - "rewards/rejected": -1.9792934656143188, + "logits/chosen": -1.6116526126861572, + "logits/rejected": -1.474578619003296, + "logps/chosen": -464.416259765625, + "logps/rejected": -503.581787109375, + "loss": 0.576, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.981610894203186, + "rewards/margins": 0.6970613598823547, + "rewards/rejected": -2.6786723136901855, "step": 730 }, { "epoch": 0.19, - "grad_norm": 9.8125, + "grad_norm": 9.1875, "learning_rate": 4.8681501786056545e-06, - "logits/chosen": -1.4682786464691162, - "logits/rejected": -1.3141381740570068, - "logps/chosen": -346.26873779296875, - "logps/rejected": -393.406005859375, - "loss": 0.5037, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.1805267333984375, - "rewards/margins": 0.7937687039375305, - "rewards/rejected": -1.9742956161499023, + "logits/chosen": -1.5888502597808838, + "logits/rejected": -1.4401233196258545, + "logps/chosen": -373.1294860839844, + "logps/rejected": -415.46240234375, + "loss": 0.5066, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.449134111404419, + "rewards/margins": 0.7457250356674194, + "rewards/rejected": -2.194859027862549, "step": 740 }, { "epoch": 0.2, - "grad_norm": 15.0, + "grad_norm": 14.75, "learning_rate": 4.860730488943068e-06, - "logits/chosen": -1.3719309568405151, - "logits/rejected": -1.326554536819458, - "logps/chosen": -396.23095703125, - "logps/rejected": -483.92169189453125, - "loss": 0.4852, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.6170562505722046, - "rewards/margins": 0.8960045576095581, - "rewards/rejected": -2.513061046600342, + "logits/chosen": -1.6056511402130127, + "logits/rejected": -1.5784225463867188, + "logps/chosen": -356.6183166503906, + "logps/rejected": -429.750732421875, + "loss": 0.5024, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2209298610687256, + "rewards/margins": 0.7504220008850098, + "rewards/rejected": -1.971351981163025, "step": 750 }, { "epoch": 0.2, - "grad_norm": 6.59375, + "grad_norm": 6.96875, "learning_rate": 4.853113678964022e-06, - "logits/chosen": -1.3708162307739258, - "logits/rejected": -1.295253038406372, - "logps/chosen": -474.16522216796875, - "logps/rejected": -546.8492431640625, - "loss": 0.5003, + "logits/chosen": -1.6386051177978516, + "logits/rejected": -1.5690464973449707, + "logps/chosen": -394.1507568359375, + "logps/rejected": -469.383056640625, + "loss": 0.4908, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.0177154541015625, - "rewards/margins": 0.8057907223701477, - "rewards/rejected": -2.8235061168670654, + "rewards/chosen": -1.2175710201263428, + "rewards/margins": 0.8312736749649048, + "rewards/rejected": -2.048844575881958, "step": 760 }, { "epoch": 0.2, - "grad_norm": 6.875, + "grad_norm": 15.75, "learning_rate": 4.845300384669958e-06, - "logits/chosen": -1.4265882968902588, - "logits/rejected": -1.292400598526001, - "logps/chosen": -428.4332580566406, - "logps/rejected": -476.52471923828125, - "loss": 0.5517, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.7522939443588257, - "rewards/margins": 0.7091314196586609, - "rewards/rejected": -2.461425304412842, + "logits/chosen": -1.6991758346557617, + "logits/rejected": -1.563987374305725, + "logps/chosen": -405.8094482421875, + "logps/rejected": -445.58209228515625, + "loss": 0.5794, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5260562896728516, + "rewards/margins": 0.6259430050849915, + "rewards/rejected": -2.1519992351531982, "step": 770 }, { "epoch": 0.2, - "grad_norm": 12.3125, + "grad_norm": 8.9375, "learning_rate": 4.837291258468701e-06, - "logits/chosen": -1.4694699048995972, - "logits/rejected": -1.3255608081817627, - "logps/chosen": -471.6322326660156, - "logps/rejected": -542.5789794921875, - "loss": 0.5457, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.8628593683242798, - "rewards/margins": 0.9583638310432434, - "rewards/rejected": -2.821223258972168, + "logits/chosen": -1.7494251728057861, + "logits/rejected": -1.6077022552490234, + "logps/chosen": -431.01519775390625, + "logps/rejected": -486.4640197753906, + "loss": 0.5468, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4566891193389893, + "rewards/margins": 0.8033839464187622, + "rewards/rejected": -2.260073184967041, "step": 780 }, { "epoch": 0.21, - "grad_norm": 9.8125, + "grad_norm": 6.78125, "learning_rate": 4.829086969119984e-06, - "logits/chosen": -1.3006738424301147, - "logits/rejected": -1.2943049669265747, - "logps/chosen": -477.1316833496094, - "logps/rejected": -552.9581298828125, - "loss": 0.6434, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.2640693187713623, - "rewards/margins": 0.740705132484436, - "rewards/rejected": -3.004774570465088, + "logits/chosen": -1.613250732421875, + "logits/rejected": -1.5955699682235718, + "logps/chosen": -397.90008544921875, + "logps/rejected": -463.9117126464844, + "loss": 0.6001, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4717532396316528, + "rewards/margins": 0.64255690574646, + "rewards/rejected": -2.1143100261688232, "step": 790 }, { "epoch": 0.21, - "grad_norm": 9.625, + "grad_norm": 7.5, "learning_rate": 4.820688201679605e-06, - "logits/chosen": -1.5880753993988037, - "logits/rejected": -1.2926288843154907, - "logps/chosen": -464.5062561035156, - "logps/rejected": -476.73968505859375, - "loss": 0.5205, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.0187087059020996, - "rewards/margins": 0.745971143245697, - "rewards/rejected": -2.7646796703338623, + "logits/chosen": -1.8398478031158447, + "logits/rejected": -1.5474001169204712, + "logps/chosen": -399.21368408203125, + "logps/rejected": -416.2703552246094, + "loss": 0.4996, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3657824993133545, + "rewards/margins": 0.7942038774490356, + "rewards/rejected": -2.1599864959716797, "step": 800 }, { "epoch": 0.21, - "eval_logits/chosen": -1.4040697813034058, - "eval_logits/rejected": -1.275604009628296, - "eval_logps/chosen": -434.3306884765625, - "eval_logps/rejected": -484.6470031738281, - "eval_loss": 0.5363825559616089, - "eval_rewards/accuracies": 0.7264999747276306, - "eval_rewards/chosen": -1.6967861652374268, - "eval_rewards/margins": 0.7035976052284241, - "eval_rewards/rejected": -2.400383472442627, - "eval_runtime": 385.6904, - "eval_samples_per_second": 5.186, - "eval_steps_per_second": 0.648, + "eval_logits/chosen": -1.6709563732147217, + "eval_logits/rejected": -1.546115756034851, + "eval_logps/chosen": -409.65435791015625, + "eval_logps/rejected": -460.5684814453125, + "eval_loss": 0.543312132358551, + "eval_rewards/accuracies": 0.7070000171661377, + "eval_rewards/chosen": -1.4500234127044678, + "eval_rewards/margins": 0.7095751166343689, + "eval_rewards/rejected": -2.1595985889434814, + "eval_runtime": 385.2124, + "eval_samples_per_second": 5.192, + "eval_steps_per_second": 0.649, "step": 800 }, { "epoch": 0.21, - "grad_norm": 7.28125, + "grad_norm": 7.34375, "learning_rate": 4.8120956574422315e-06, - "logits/chosen": -1.5584218502044678, - "logits/rejected": -1.5333359241485596, - "logps/chosen": -437.0270080566406, - "logps/rejected": -483.4334411621094, - "loss": 0.5977, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.6310217380523682, - "rewards/margins": 0.5348454117774963, - "rewards/rejected": -2.165867328643799, + "logits/chosen": -1.7810264825820923, + "logits/rejected": -1.7489475011825562, + "logps/chosen": -431.69219970703125, + "logps/rejected": -477.871337890625, + "loss": 0.6275, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5776736736297607, + "rewards/margins": 0.5325725674629211, + "rewards/rejected": -2.110246181488037, "step": 810 }, { "epoch": 0.21, - "grad_norm": 8.3125, + "grad_norm": 13.625, "learning_rate": 4.803310053882831e-06, - "logits/chosen": -1.5471457242965698, - "logits/rejected": -1.5452721118927002, - "logps/chosen": -370.4143981933594, - "logps/rejected": -443.0765686035156, - "loss": 0.5346, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4108860492706299, - "rewards/margins": 0.6031405925750732, - "rewards/rejected": -2.014026641845703, + "logits/chosen": -1.7703052759170532, + "logits/rejected": -1.7803173065185547, + "logps/chosen": -363.9437561035156, + "logps/rejected": -435.0057678222656, + "loss": 0.5542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3461793661117554, + "rewards/margins": 0.5871396064758301, + "rewards/rejected": -1.933318853378296, "step": 820 }, { "epoch": 0.22, - "grad_norm": 7.65625, + "grad_norm": 6.875, "learning_rate": 4.794332124596775e-06, - "logits/chosen": -1.542186975479126, - "logits/rejected": -1.4280331134796143, - "logps/chosen": -428.5491638183594, - "logps/rejected": -479.2059631347656, - "loss": 0.5879, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.6010816097259521, - "rewards/margins": 0.5433088541030884, - "rewards/rejected": -2.144390344619751, + "logits/chosen": -1.8022472858428955, + "logits/rejected": -1.6746841669082642, + "logps/chosen": -397.36090087890625, + "logps/rejected": -445.603759765625, + "loss": 0.5885, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2891987562179565, + "rewards/margins": 0.5191696882247925, + "rewards/rejected": -1.808368444442749, "step": 830 }, { "epoch": 0.22, - "grad_norm": 6.25, + "grad_norm": 9.375, "learning_rate": 4.785162619238575e-06, - "logits/chosen": -1.5202006101608276, - "logits/rejected": -1.357043743133545, - "logps/chosen": -382.9326171875, - "logps/rejected": -429.1929626464844, - "loss": 0.5204, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2908388376235962, - "rewards/margins": 0.7357919216156006, - "rewards/rejected": -2.0266308784484863, + "logits/chosen": -1.7888991832733154, + "logits/rejected": -1.6187770366668701, + "logps/chosen": -355.0903015136719, + "logps/rejected": -387.1643981933594, + "loss": 0.5416, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.012415885925293, + "rewards/margins": 0.5939286947250366, + "rewards/rejected": -1.6063445806503296, "step": 840 }, { "epoch": 0.22, - "grad_norm": 8.375, + "grad_norm": 6.78125, "learning_rate": 4.775802303459288e-06, - "logits/chosen": -1.4242273569107056, - "logits/rejected": -1.3464834690093994, - "logps/chosen": -368.41864013671875, - "logps/rejected": -433.1285705566406, - "loss": 0.5601, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2065433263778687, - "rewards/margins": 0.69434654712677, - "rewards/rejected": -1.9008897542953491, + "logits/chosen": -1.7059911489486694, + "logits/rejected": -1.6270997524261475, + "logps/chosen": -346.2181091308594, + "logps/rejected": -401.40069580078125, + "loss": 0.5465, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9845376014709473, + "rewards/margins": 0.5990740656852722, + "rewards/rejected": -1.5836117267608643, "step": 850 }, { "epoch": 0.23, - "grad_norm": 11.5625, + "grad_norm": 11.125, "learning_rate": 4.766251958842589e-06, - "logits/chosen": -1.3795868158340454, - "logits/rejected": -1.2619365453720093, - "logps/chosen": -401.754150390625, - "logps/rejected": -458.87786865234375, - "loss": 0.5436, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2925317287445068, - "rewards/margins": 0.6759337186813354, - "rewards/rejected": -1.9684655666351318, + "logits/chosen": -1.676922082901001, + "logits/rejected": -1.5429388284683228, + "logps/chosen": -394.45416259765625, + "logps/rejected": -433.03369140625, + "loss": 0.5815, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.219531774520874, + "rewards/margins": 0.49049144983291626, + "rewards/rejected": -1.710023283958435, "step": 860 }, { "epoch": 0.23, - "grad_norm": 7.375, + "grad_norm": 6.78125, "learning_rate": 4.7565123828395066e-06, - "logits/chosen": -1.2665306329727173, - "logits/rejected": -1.1762630939483643, - "logps/chosen": -408.20220947265625, - "logps/rejected": -490.75634765625, - "loss": 0.5077, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.522733449935913, - "rewards/margins": 0.8316558599472046, - "rewards/rejected": -2.3543894290924072, + "logits/chosen": -1.5784261226654053, + "logits/rejected": -1.5068719387054443, + "logps/chosen": -391.16192626953125, + "logps/rejected": -455.4800720214844, + "loss": 0.531, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3523309230804443, + "rewards/margins": 0.6492956280708313, + "rewards/rejected": -2.001626491546631, "step": 870 }, { "epoch": 0.23, - "grad_norm": 7.71875, + "grad_norm": 6.65625, "learning_rate": 4.746584388701831e-06, - "logits/chosen": -1.2851665019989014, - "logits/rejected": -1.2308191061019897, - "logps/chosen": -443.7979431152344, - "logps/rejected": -522.8231201171875, - "loss": 0.5035, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.8055862188339233, - "rewards/margins": 0.9323374629020691, - "rewards/rejected": -2.7379238605499268, + "logits/chosen": -1.6509666442871094, + "logits/rejected": -1.5814907550811768, + "logps/chosen": -408.57598876953125, + "logps/rejected": -468.8497619628906, + "loss": 0.5239, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4533666372299194, + "rewards/margins": 0.7448235750198364, + "rewards/rejected": -2.198190212249756, "step": 880 }, { "epoch": 0.23, - "grad_norm": 10.5625, + "grad_norm": 9.5625, "learning_rate": 4.736468805414218e-06, - "logits/chosen": -1.250001072883606, - "logits/rejected": -1.2183704376220703, - "logps/chosen": -407.4109802246094, - "logps/rejected": -504.15130615234375, - "loss": 0.5632, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5997565984725952, - "rewards/margins": 0.8288668394088745, - "rewards/rejected": -2.428623676300049, + "logits/chosen": -1.6324241161346436, + "logits/rejected": -1.6051101684570312, + "logps/chosen": -362.0763244628906, + "logps/rejected": -444.11077880859375, + "loss": 0.5667, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1464101076126099, + "rewards/margins": 0.6818080544471741, + "rewards/rejected": -1.8282181024551392, "step": 890 }, { "epoch": 0.24, - "grad_norm": 10.375, + "grad_norm": 12.125, "learning_rate": 4.7261664776249595e-06, - "logits/chosen": -1.1423993110656738, - "logits/rejected": -1.050307273864746, - "logps/chosen": -386.7989807128906, - "logps/rejected": -467.6982421875, - "loss": 0.4983, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5788599252700806, - "rewards/margins": 0.8989045023918152, - "rewards/rejected": -2.477764368057251, + "logits/chosen": -1.5433322191238403, + "logits/rejected": -1.4583094120025635, + "logps/chosen": -336.41778564453125, + "logps/rejected": -401.85772705078125, + "loss": 0.514, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0750483274459839, + "rewards/margins": 0.7443105578422546, + "rewards/rejected": -1.8193588256835938, "step": 900 }, { "epoch": 0.24, - "eval_logits/chosen": -1.2160884141921997, - "eval_logits/rejected": -1.095556616783142, - "eval_logps/chosen": -432.6338806152344, - "eval_logps/rejected": -489.990966796875, - "eval_loss": 0.5329138040542603, - "eval_rewards/accuracies": 0.7204999923706055, - "eval_rewards/chosen": -1.679819107055664, - "eval_rewards/margins": 0.7740048170089722, - "eval_rewards/rejected": -2.4538238048553467, - "eval_runtime": 384.8785, - "eval_samples_per_second": 5.196, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.621368169784546, + "eval_logits/rejected": -1.5014086961746216, + "eval_logps/chosen": -391.2229919433594, + "eval_logps/rejected": -436.3040771484375, + "eval_loss": 0.5440120697021484, + "eval_rewards/accuracies": 0.718999981880188, + "eval_rewards/chosen": -1.2657097578048706, + "eval_rewards/margins": 0.6512450575828552, + "eval_rewards/rejected": -1.916954755783081, + "eval_runtime": 385.3527, + "eval_samples_per_second": 5.19, + "eval_steps_per_second": 0.649, "step": 900 }, { "epoch": 0.24, - "grad_norm": 8.8125, + "grad_norm": 7.8125, "learning_rate": 4.715678265575463e-06, - "logits/chosen": -1.3864091634750366, - "logits/rejected": -1.188279390335083, - "logps/chosen": -431.4292907714844, - "logps/rejected": -437.1918029785156, - "loss": 0.5621, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4715720415115356, - "rewards/margins": 0.6131043434143066, - "rewards/rejected": -2.0846762657165527, + "logits/chosen": -1.7400833368301392, + "logits/rejected": -1.5401082038879395, + "logps/chosen": -410.2032775878906, + "logps/rejected": -411.843994140625, + "loss": 0.556, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2593110799789429, + "rewards/margins": 0.5718873739242554, + "rewards/rejected": -1.8311984539031982, "step": 910 }, { "epoch": 0.24, - "grad_norm": 9.1875, + "grad_norm": 9.3125, "learning_rate": 4.705005045028415e-06, - "logits/chosen": -1.3089066743850708, - "logits/rejected": -1.1869691610336304, - "logps/chosen": -409.567626953125, - "logps/rejected": -467.461181640625, - "loss": 0.5395, + "logits/chosen": -1.6306053400039673, + "logits/rejected": -1.5210235118865967, + "logps/chosen": -400.1542053222656, + "logps/rejected": -448.408447265625, + "loss": 0.5563, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.4643157720565796, - "rewards/margins": 0.7529044151306152, - "rewards/rejected": -2.2172200679779053, + "rewards/chosen": -1.370181679725647, + "rewards/margins": 0.6565110087394714, + "rewards/rejected": -2.0266928672790527, "step": 920 }, { "epoch": 0.24, - "grad_norm": 10.1875, + "grad_norm": 10.4375, "learning_rate": 4.694147707194659e-06, - "logits/chosen": -1.3738486766815186, - "logits/rejected": -1.295534372329712, - "logps/chosen": -467.11102294921875, - "logps/rejected": -519.7848510742188, - "loss": 0.5253, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.9523290395736694, - "rewards/margins": 0.7249565720558167, - "rewards/rejected": -2.677285671234131, + "logits/chosen": -1.6995433568954468, + "logits/rejected": -1.6389293670654297, + "logps/chosen": -427.10137939453125, + "logps/rejected": -471.07952880859375, + "loss": 0.5469, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5522325038909912, + "rewards/margins": 0.6380002498626709, + "rewards/rejected": -2.190232753753662, "step": 930 }, { "epoch": 0.25, - "grad_norm": 8.5, + "grad_norm": 6.65625, "learning_rate": 4.683107158658782e-06, - "logits/chosen": -1.2867457866668701, - "logits/rejected": -1.202335000038147, - "logps/chosen": -514.80126953125, - "logps/rejected": -559.9244384765625, - "loss": 0.5364, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.245605707168579, - "rewards/margins": 0.7386397123336792, - "rewards/rejected": -2.9842450618743896, + "logits/chosen": -1.6130173206329346, + "logits/rejected": -1.5491468906402588, + "logps/chosen": -439.54248046875, + "logps/rejected": -480.834228515625, + "loss": 0.512, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4930182695388794, + "rewards/margins": 0.7003245949745178, + "rewards/rejected": -2.193342924118042, "step": 940 }, { "epoch": 0.25, - "grad_norm": 7.9375, + "grad_norm": 11.3125, "learning_rate": 4.671884321303407e-06, - "logits/chosen": -1.4214376211166382, - "logits/rejected": -1.286566972732544, - "logps/chosen": -444.1116638183594, - "logps/rejected": -498.4622497558594, - "loss": 0.5246, + "logits/chosen": -1.6797221899032593, + "logits/rejected": -1.5230547189712524, + "logps/chosen": -394.5656433105469, + "logps/rejected": -453.25946044921875, + "loss": 0.5134, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.9778563976287842, - "rewards/margins": 0.6870826482772827, - "rewards/rejected": -2.6649391651153564, + "rewards/chosen": -1.4823963642120361, + "rewards/margins": 0.7305151224136353, + "rewards/rejected": -2.212911605834961, "step": 950 }, { "epoch": 0.25, - "grad_norm": 7.625, + "grad_norm": 7.9375, "learning_rate": 4.660480132232224e-06, - "logits/chosen": -1.5147006511688232, - "logits/rejected": -1.4103442430496216, - "logps/chosen": -423.77252197265625, - "logps/rejected": -456.38330078125, - "loss": 0.5784, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.5195940732955933, - "rewards/margins": 0.5671976804733276, - "rewards/rejected": -2.086791753768921, + "logits/chosen": -1.7173080444335938, + "logits/rejected": -1.60665762424469, + "logps/chosen": -406.39117431640625, + "logps/rejected": -445.9922790527344, + "loss": 0.5666, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3457807302474976, + "rewards/margins": 0.637101411819458, + "rewards/rejected": -1.9828822612762451, "step": 960 }, { "epoch": 0.25, - "grad_norm": 14.4375, + "grad_norm": 8.5625, "learning_rate": 4.6488955436917414e-06, - "logits/chosen": -1.5512803792953491, - "logits/rejected": -1.333134412765503, - "logps/chosen": -416.9090270996094, - "logps/rejected": -457.83563232421875, - "loss": 0.5199, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.369279384613037, - "rewards/margins": 0.875058650970459, - "rewards/rejected": -2.244338274002075, + "logits/chosen": -1.7457382678985596, + "logits/rejected": -1.5430558919906616, + "logps/chosen": -429.39300537109375, + "logps/rejected": -465.61224365234375, + "loss": 0.5461, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.49411940574646, + "rewards/margins": 0.8279851078987122, + "rewards/rejected": -2.3221046924591064, "step": 970 }, { "epoch": 0.26, - "grad_norm": 5.71875, + "grad_norm": 7.03125, "learning_rate": 4.6371315229917644e-06, - "logits/chosen": -1.4992878437042236, - "logits/rejected": -1.369624376296997, - "logps/chosen": -448.16314697265625, - "logps/rejected": -506.52191162109375, - "loss": 0.527, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.6474645137786865, - "rewards/margins": 0.7875878810882568, - "rewards/rejected": -2.4350523948669434, + "logits/chosen": -1.7286710739135742, + "logits/rejected": -1.5955041646957397, + "logps/chosen": -443.83270263671875, + "logps/rejected": -498.59967041015625, + "loss": 0.5188, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6041603088378906, + "rewards/margins": 0.7516692876815796, + "rewards/rejected": -2.3558297157287598, "step": 980 }, { "epoch": 0.26, - "grad_norm": 11.9375, + "grad_norm": 10.625, "learning_rate": 4.625189052424638e-06, - "logits/chosen": -1.4010595083236694, - "logits/rejected": -1.259636402130127, - "logps/chosen": -442.61590576171875, - "logps/rejected": -522.6851196289062, - "loss": 0.4455, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.035126209259033, - "rewards/margins": 1.0280003547668457, - "rewards/rejected": -3.063126802444458, + "logits/chosen": -1.6606595516204834, + "logits/rejected": -1.5426713228225708, + "logps/chosen": -412.0262145996094, + "logps/rejected": -478.1866149902344, + "loss": 0.4696, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7292293310165405, + "rewards/margins": 0.888912558555603, + "rewards/rejected": -2.6181421279907227, "step": 990 }, { "epoch": 0.26, - "grad_norm": 9.5, + "grad_norm": 7.46875, "learning_rate": 4.613069129183218e-06, - "logits/chosen": -1.4701205492019653, - "logits/rejected": -1.3206748962402344, - "logps/chosen": -506.3350524902344, - "logps/rejected": -548.3522338867188, - "loss": 0.5443, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.02931809425354, - "rewards/margins": 0.7819973826408386, - "rewards/rejected": -2.8113150596618652, + "logits/chosen": -1.7503217458724976, + "logits/rejected": -1.6148483753204346, + "logps/chosen": -452.8263244628906, + "logps/rejected": -481.222900390625, + "loss": 0.5468, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4942306280136108, + "rewards/margins": 0.6457923054695129, + "rewards/rejected": -2.1400229930877686, "step": 1000 }, { "epoch": 0.26, - "eval_logits/chosen": -1.2533063888549805, - "eval_logits/rejected": -1.1263666152954102, - "eval_logps/chosen": -454.4657897949219, - "eval_logps/rejected": -521.2656860351562, - "eval_loss": 0.5279005169868469, - "eval_rewards/accuracies": 0.7239999771118164, - "eval_rewards/chosen": -1.8981376886367798, - "eval_rewards/margins": 0.8684335947036743, - "eval_rewards/rejected": -2.766571283340454, - "eval_runtime": 396.7737, - "eval_samples_per_second": 5.041, - "eval_steps_per_second": 0.63, + "eval_logits/chosen": -1.5656111240386963, + "eval_logits/rejected": -1.4448813199996948, + "eval_logps/chosen": -401.67669677734375, + "eval_logps/rejected": -451.64080810546875, + "eval_loss": 0.5418093204498291, + "eval_rewards/accuracies": 0.7174999713897705, + "eval_rewards/chosen": -1.3702467679977417, + "eval_rewards/margins": 0.7000752091407776, + "eval_rewards/rejected": -2.070322036743164, + "eval_runtime": 385.2164, + "eval_samples_per_second": 5.192, + "eval_steps_per_second": 0.649, "step": 1000 }, { "epoch": 0.26, - "grad_norm": 8.8125, + "grad_norm": 7.8125, "learning_rate": 4.600772765277607e-06, - "logits/chosen": -1.295363187789917, - "logits/rejected": -1.21110200881958, - "logps/chosen": -406.8030700683594, - "logps/rejected": -491.1182556152344, - "loss": 0.4907, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.7324583530426025, - "rewards/margins": 0.8947044610977173, - "rewards/rejected": -2.6271629333496094, + "logits/chosen": -1.531764268875122, + "logits/rejected": -1.4728986024856567, + "logps/chosen": -375.1974792480469, + "logps/rejected": -444.0108337402344, + "loss": 0.5138, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4164024591445923, + "rewards/margins": 0.7396863698959351, + "rewards/rejected": -2.1560888290405273, "step": 1010 }, { "epoch": 0.27, - "grad_norm": 10.5625, + "grad_norm": 16.25, "learning_rate": 4.588300987450652e-06, - "logits/chosen": -1.3897264003753662, - "logits/rejected": -1.2775170803070068, - "logps/chosen": -400.9536437988281, - "logps/rejected": -446.50244140625, - "loss": 0.5402, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4720966815948486, - "rewards/margins": 0.7910727262496948, - "rewards/rejected": -2.263169050216675, + "logits/chosen": -1.6515556573867798, + "logits/rejected": -1.547123670578003, + "logps/chosen": -394.8990173339844, + "logps/rejected": -431.17645263671875, + "loss": 0.5418, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4115506410598755, + "rewards/margins": 0.6983591318130493, + "rewards/rejected": -2.1099095344543457, "step": 1020 }, { "epoch": 0.27, - "grad_norm": 6.5, + "grad_norm": 6.875, "learning_rate": 4.5756548370922136e-06, - "logits/chosen": -1.3535692691802979, - "logits/rejected": -1.2674891948699951, - "logps/chosen": -364.35394287109375, - "logps/rejected": -435.6012268066406, - "loss": 0.5053, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2365007400512695, - "rewards/margins": 0.8022448420524597, - "rewards/rejected": -2.038745641708374, + "logits/chosen": -1.6495920419692993, + "logits/rejected": -1.5659213066101074, + "logps/chosen": -351.00146484375, + "logps/rejected": -412.519287109375, + "loss": 0.5127, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1029760837554932, + "rewards/margins": 0.7049504518508911, + "rewards/rejected": -1.8079265356063843, "step": 1030 }, { "epoch": 0.27, - "grad_norm": 10.9375, + "grad_norm": 9.0625, "learning_rate": 4.562835370152206e-06, - "logits/chosen": -1.4244534969329834, - "logits/rejected": -1.2097485065460205, - "logps/chosen": -458.898681640625, - "logps/rejected": -542.5496215820312, - "loss": 0.4608, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.5766918659210205, - "rewards/margins": 1.1474573612213135, - "rewards/rejected": -2.724148988723755, + "logits/chosen": -1.7497320175170898, + "logits/rejected": -1.5089380741119385, + "logps/chosen": -426.8157653808594, + "logps/rejected": -491.87860107421875, + "loss": 0.473, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2558623552322388, + "rewards/margins": 0.9615765810012817, + "rewards/rejected": -2.2174386978149414, "step": 1040 }, { "epoch": 0.27, - "grad_norm": 6.8125, + "grad_norm": 7.0625, "learning_rate": 4.54984365705243e-06, - "logits/chosen": -1.3653185367584229, - "logits/rejected": -1.2626806497573853, - "logps/chosen": -441.8795471191406, - "logps/rejected": -547.1009521484375, - "loss": 0.5005, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.7340948581695557, - "rewards/margins": 1.117974042892456, - "rewards/rejected": -2.8520689010620117, + "logits/chosen": -1.6929643154144287, + "logits/rejected": -1.5880625247955322, + "logps/chosen": -421.701416015625, + "logps/rejected": -518.8970947265625, + "loss": 0.4784, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5323131084442139, + "rewards/margins": 1.0377166271209717, + "rewards/rejected": -2.5700297355651855, "step": 1050 }, { "epoch": 0.28, - "grad_norm": 10.1875, + "grad_norm": 12.25, "learning_rate": 4.536680782597191e-06, - "logits/chosen": -1.2812556028366089, - "logits/rejected": -1.2028696537017822, - "logps/chosen": -422.71746826171875, - "logps/rejected": -500.39276123046875, - "loss": 0.5823, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.8669233322143555, - "rewards/margins": 0.8329373598098755, - "rewards/rejected": -2.6998608112335205, + "logits/chosen": -1.5793530941009521, + "logits/rejected": -1.503025770187378, + "logps/chosen": -413.30792236328125, + "logps/rejected": -483.18048095703125, + "loss": 0.5921, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7728277444839478, + "rewards/margins": 0.7549096345901489, + "rewards/rejected": -2.5277373790740967, "step": 1060 }, { "epoch": 0.28, - "grad_norm": 20.625, + "grad_norm": 15.3125, "learning_rate": 4.523347845882718e-06, - "logits/chosen": -1.4049158096313477, - "logits/rejected": -1.2168524265289307, - "logps/chosen": -458.36956787109375, - "logps/rejected": -519.8746337890625, - "loss": 0.4512, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7019942998886108, - "rewards/margins": 1.1078611612319946, - "rewards/rejected": -2.8098552227020264, + "logits/chosen": -1.6937191486358643, + "logits/rejected": -1.5083749294281006, + "logps/chosen": -422.14447021484375, + "logps/rejected": -479.6094665527344, + "loss": 0.4495, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3397438526153564, + "rewards/margins": 1.0674594640731812, + "rewards/rejected": -2.407203197479248, "step": 1070 }, { "epoch": 0.28, - "grad_norm": 7.3125, + "grad_norm": 5.375, "learning_rate": 4.50984596020539e-06, - "logits/chosen": -1.2350142002105713, - "logits/rejected": -1.166898488998413, - "logps/chosen": -446.97833251953125, - "logps/rejected": -499.6363220214844, - "loss": 0.5662, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.6682651042938232, - "rewards/margins": 0.8490194082260132, - "rewards/rejected": -2.517284393310547, + "logits/chosen": -1.544276475906372, + "logits/rejected": -1.4562034606933594, + "logps/chosen": -403.8301696777344, + "logps/rejected": -444.5962829589844, + "loss": 0.573, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.236783504486084, + "rewards/margins": 0.7301002740859985, + "rewards/rejected": -1.9668840169906616, "step": 1080 }, { "epoch": 0.29, - "grad_norm": 7.125, + "grad_norm": 7.40625, "learning_rate": 4.4961762529687745e-06, - "logits/chosen": -1.427812933921814, - "logits/rejected": -1.3044004440307617, - "logps/chosen": -407.113525390625, - "logps/rejected": -485.9717712402344, - "loss": 0.4963, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.4671194553375244, - "rewards/margins": 0.9535934329032898, - "rewards/rejected": -2.420712947845459, + "logits/chosen": -1.6948843002319336, + "logits/rejected": -1.5669870376586914, + "logps/chosen": -365.44342041015625, + "logps/rejected": -436.5625915527344, + "loss": 0.5044, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0504177808761597, + "rewards/margins": 0.8762027621269226, + "rewards/rejected": -1.9266207218170166, "step": 1090 }, { "epoch": 0.29, - "grad_norm": 6.96875, + "grad_norm": 9.6875, "learning_rate": 4.482339865589492e-06, - "logits/chosen": -1.4146265983581543, - "logits/rejected": -1.2579143047332764, - "logps/chosen": -434.85498046875, - "logps/rejected": -476.2469787597656, - "loss": 0.565, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.685683012008667, - "rewards/margins": 0.8535755276679993, - "rewards/rejected": -2.5392584800720215, + "logits/chosen": -1.6588748693466187, + "logits/rejected": -1.5048010349273682, + "logps/chosen": -401.0564270019531, + "logps/rejected": -414.84466552734375, + "loss": 0.569, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3476970195770264, + "rewards/margins": 0.5775381922721863, + "rewards/rejected": -1.925235390663147, "step": 1100 }, { "epoch": 0.29, - "eval_logits/chosen": -1.271475911140442, - "eval_logits/rejected": -1.1444941759109497, - "eval_logps/chosen": -415.9482727050781, - "eval_logps/rejected": -478.28485107421875, - "eval_loss": 0.5207434296607971, - "eval_rewards/accuracies": 0.7289999723434448, - "eval_rewards/chosen": -1.5129626989364624, - "eval_rewards/margins": 0.8237999677658081, - "eval_rewards/rejected": -2.3367626667022705, - "eval_runtime": 400.4213, - "eval_samples_per_second": 4.995, - "eval_steps_per_second": 0.624, + "eval_logits/chosen": -1.5524324178695679, + "eval_logits/rejected": -1.427809476852417, + "eval_logps/chosen": -378.61767578125, + "eval_logps/rejected": -430.84136962890625, + "eval_loss": 0.5299040675163269, + "eval_rewards/accuracies": 0.7210000157356262, + "eval_rewards/chosen": -1.1396570205688477, + "eval_rewards/margins": 0.7226706147193909, + "eval_rewards/rejected": -1.8623274564743042, + "eval_runtime": 385.4496, + "eval_samples_per_second": 5.189, + "eval_steps_per_second": 0.649, "step": 1100 }, { "epoch": 0.29, - "grad_norm": 7.1875, + "grad_norm": 5.6875, "learning_rate": 4.468337953401909e-06, - "logits/chosen": -1.4210216999053955, - "logits/rejected": -1.3622697591781616, - "logps/chosen": -422.3374938964844, - "logps/rejected": -482.82489013671875, - "loss": 0.5625, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.5198637247085571, - "rewards/margins": 0.6534841656684875, - "rewards/rejected": -2.1733479499816895, + "logits/chosen": -1.661257028579712, + "logits/rejected": -1.5975781679153442, + "logps/chosen": -380.5933837890625, + "logps/rejected": -433.12139892578125, + "loss": 0.5657, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1024227142333984, + "rewards/margins": 0.5738898515701294, + "rewards/rejected": -1.6763126850128174, "step": 1110 }, { "epoch": 0.29, - "grad_norm": 8.5, + "grad_norm": 7.34375, "learning_rate": 4.45417168556166e-06, - "logits/chosen": -1.328687310218811, - "logits/rejected": -1.2253139019012451, - "logps/chosen": -382.0760192871094, - "logps/rejected": -463.2789001464844, - "loss": 0.5021, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4092481136322021, - "rewards/margins": 0.8184933662414551, - "rewards/rejected": -2.2277414798736572, + "logits/chosen": -1.5824635028839111, + "logits/rejected": -1.4781149625778198, + "logps/chosen": -340.6497497558594, + "logps/rejected": -407.69293212890625, + "loss": 0.5255, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9949854016304016, + "rewards/margins": 0.6768967509269714, + "rewards/rejected": -1.6718822717666626, "step": 1120 }, { "epoch": 0.3, - "grad_norm": 9.25, + "grad_norm": 10.5625, "learning_rate": 4.439842244948036e-06, - "logits/chosen": -1.2945640087127686, - "logits/rejected": -1.1504161357879639, - "logps/chosen": -432.81524658203125, - "logps/rejected": -501.5387268066406, - "loss": 0.5614, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.7175318002700806, - "rewards/margins": 0.742792010307312, - "rewards/rejected": -2.4603238105773926, + "logits/chosen": -1.5540910959243774, + "logits/rejected": -1.4291226863861084, + "logps/chosen": -390.7538757324219, + "logps/rejected": -446.49310302734375, + "loss": 0.5752, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.296918511390686, + "rewards/margins": 0.6129493117332458, + "rewards/rejected": -1.9098678827285767, "step": 1130 }, { "epoch": 0.3, - "grad_norm": 17.75, + "grad_norm": 14.125, "learning_rate": 4.425350828065204e-06, - "logits/chosen": -1.3531075716018677, - "logits/rejected": -1.145227074623108, - "logps/chosen": -450.9183654785156, - "logps/rejected": -492.70623779296875, - "loss": 0.495, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6683238744735718, - "rewards/margins": 0.8965514898300171, - "rewards/rejected": -2.564875364303589, + "logits/chosen": -1.6088273525238037, + "logits/rejected": -1.3946729898452759, + "logps/chosen": -412.3367614746094, + "logps/rejected": -442.0401916503906, + "loss": 0.5089, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2825069427490234, + "rewards/margins": 0.7757080793380737, + "rewards/rejected": -2.0582151412963867, "step": 1140 }, { "epoch": 0.3, - "grad_norm": 10.375, + "grad_norm": 7.875, "learning_rate": 4.410698644942303e-06, - "logits/chosen": -1.3738346099853516, - "logits/rejected": -1.2316032648086548, - "logps/chosen": -427.7021484375, - "logps/rejected": -499.776611328125, - "loss": 0.4885, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5538952350616455, - "rewards/margins": 0.9599501490592957, - "rewards/rejected": -2.513845443725586, + "logits/chosen": -1.6174886226654053, + "logits/rejected": -1.4844688177108765, + "logps/chosen": -402.29486083984375, + "logps/rejected": -463.25689697265625, + "loss": 0.4913, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.299822211265564, + "rewards/margins": 0.8488262891769409, + "rewards/rejected": -2.148648738861084, "step": 1150 }, { "epoch": 0.3, - "grad_norm": 11.8125, + "grad_norm": 11.125, "learning_rate": 4.395886919032406e-06, - "logits/chosen": -1.240241527557373, - "logits/rejected": -1.113989233970642, - "logps/chosen": -439.04095458984375, - "logps/rejected": -497.86297607421875, - "loss": 0.5348, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.7520458698272705, - "rewards/margins": 0.8531185984611511, - "rewards/rejected": -2.6051642894744873, + "logits/chosen": -1.4636362791061401, + "logits/rejected": -1.3575894832611084, + "logps/chosen": -405.80010986328125, + "logps/rejected": -456.88641357421875, + "loss": 0.5316, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4196369647979736, + "rewards/margins": 0.7757617235183716, + "rewards/rejected": -2.1953988075256348, "step": 1160 }, { "epoch": 0.31, - "grad_norm": 8.3125, + "grad_norm": 6.6875, "learning_rate": 4.380916887110366e-06, - "logits/chosen": -1.3933711051940918, - "logits/rejected": -1.2005256414413452, - "logps/chosen": -448.8775329589844, - "logps/rejected": -503.5738220214844, - "loss": 0.5142, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.897091269493103, - "rewards/margins": 0.9505263566970825, - "rewards/rejected": -2.8476176261901855, + "logits/chosen": -1.6339868307113647, + "logits/rejected": -1.4374290704727173, + "logps/chosen": -406.9070739746094, + "logps/rejected": -451.3981018066406, + "loss": 0.5169, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.477386474609375, + "rewards/margins": 0.8484745025634766, + "rewards/rejected": -2.3258609771728516, "step": 1170 }, { "epoch": 0.31, - "grad_norm": 6.71875, + "grad_norm": 6.84375, "learning_rate": 4.365789799169539e-06, - "logits/chosen": -1.2259958982467651, - "logits/rejected": -1.2687208652496338, - "logps/chosen": -432.62786865234375, - "logps/rejected": -522.6715698242188, - "loss": 0.5228, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8601690530776978, - "rewards/margins": 0.8469558954238892, - "rewards/rejected": -2.707124710083008, + "logits/chosen": -1.4347012042999268, + "logits/rejected": -1.4834723472595215, + "logps/chosen": -395.71014404296875, + "logps/rejected": -475.1640625, + "loss": 0.5232, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4909913539886475, + "rewards/margins": 0.7410578727722168, + "rewards/rejected": -2.2320492267608643, "step": 1180 }, { "epoch": 0.31, - "grad_norm": 10.0625, + "grad_norm": 6.78125, "learning_rate": 4.350506918317416e-06, - "logits/chosen": -1.4284965991973877, - "logits/rejected": -1.2679976224899292, - "logps/chosen": -413.3136291503906, - "logps/rejected": -491.58721923828125, - "loss": 0.4982, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6571779251098633, - "rewards/margins": 0.8545892834663391, - "rewards/rejected": -2.5117671489715576, + "logits/chosen": -1.6247329711914062, + "logits/rejected": -1.4631903171539307, + "logps/chosen": -389.4300842285156, + "logps/rejected": -455.31573486328125, + "loss": 0.5133, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4183425903320312, + "rewards/margins": 0.7307096719741821, + "rewards/rejected": -2.149052143096924, "step": 1190 }, { "epoch": 0.31, - "grad_norm": 8.5625, + "grad_norm": 6.15625, "learning_rate": 4.335069520670149e-06, - "logits/chosen": -1.2467167377471924, - "logits/rejected": -1.170364260673523, - "logps/chosen": -406.6671447753906, - "logps/rejected": -487.4442443847656, - "loss": 0.5837, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.7695426940917969, - "rewards/margins": 0.7259476780891418, - "rewards/rejected": -2.495490312576294, + "logits/chosen": -1.4696300029754639, + "logits/rejected": -1.3941162824630737, + "logps/chosen": -352.44671630859375, + "logps/rejected": -424.1249084472656, + "loss": 0.5732, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2273385524749756, + "rewards/margins": 0.6349586248397827, + "rewards/rejected": -1.8622970581054688, "step": 1200 }, { "epoch": 0.31, - "eval_logits/chosen": -1.2314388751983643, - "eval_logits/rejected": -1.10646390914917, - "eval_logps/chosen": -431.9436950683594, - "eval_logps/rejected": -498.354736328125, - "eval_loss": 0.5104092359542847, - "eval_rewards/accuracies": 0.7354999780654907, - "eval_rewards/chosen": -1.6729168891906738, - "eval_rewards/margins": 0.8645446300506592, - "eval_rewards/rejected": -2.537461280822754, - "eval_runtime": 385.0952, + "eval_logits/chosen": -1.4804484844207764, + "eval_logits/rejected": -1.3595802783966064, + "eval_logps/chosen": -375.21826171875, + "eval_logps/rejected": -427.4810485839844, + "eval_loss": 0.5184832811355591, + "eval_rewards/accuracies": 0.7250000238418579, + "eval_rewards/chosen": -1.1056623458862305, + "eval_rewards/margins": 0.7230623364448547, + "eval_rewards/rejected": -1.82872474193573, + "eval_runtime": 385.0476, "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 1200 }, { "epoch": 0.32, - "grad_norm": 8.375, + "grad_norm": 9.8125, "learning_rate": 4.319478895246e-06, - "logits/chosen": -1.3048737049102783, - "logits/rejected": -1.1300756931304932, - "logps/chosen": -410.984375, - "logps/rejected": -473.4725036621094, - "loss": 0.4842, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.638689637184143, - "rewards/margins": 0.871795654296875, - "rewards/rejected": -2.5104851722717285, + "logits/chosen": -1.5287452936172485, + "logits/rejected": -1.3607852458953857, + "logps/chosen": -350.2371520996094, + "logps/rejected": -398.1286315917969, + "loss": 0.5104, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0312172174453735, + "rewards/margins": 0.7258288264274597, + "rewards/rejected": -1.757046103477478, "step": 1210 }, { "epoch": 0.32, - "grad_norm": 17.875, + "grad_norm": 13.0, "learning_rate": 4.303736343857704e-06, - "logits/chosen": -1.2923145294189453, - "logits/rejected": -1.1951522827148438, - "logps/chosen": -442.15472412109375, - "logps/rejected": -570.807861328125, - "loss": 0.4982, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.8504784107208252, - "rewards/margins": 1.1673939228057861, - "rewards/rejected": -3.0178723335266113, + "logits/chosen": -1.5342817306518555, + "logits/rejected": -1.4489666223526, + "logps/chosen": -372.7054138183594, + "logps/rejected": -475.97601318359375, + "loss": 0.5008, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1559852361679077, + "rewards/margins": 0.9135689735412598, + "rewards/rejected": -2.069554328918457, "step": 1220 }, { "epoch": 0.32, - "grad_norm": 6.03125, + "grad_norm": 8.125, "learning_rate": 4.287843181003772e-06, - "logits/chosen": -1.347501277923584, - "logits/rejected": -1.1933776140213013, - "logps/chosen": -496.73077392578125, - "logps/rejected": -529.7448120117188, - "loss": 0.5797, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.02878999710083, - "rewards/margins": 0.7935463190078735, - "rewards/rejected": -2.822335958480835, + "logits/chosen": -1.5427916049957275, + "logits/rejected": -1.3855717182159424, + "logps/chosen": -458.01641845703125, + "logps/rejected": -475.8519592285156, + "loss": 0.5884, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6416466236114502, + "rewards/margins": 0.6417607665061951, + "rewards/rejected": -2.283407211303711, "step": 1230 }, { "epoch": 0.32, - "grad_norm": 7.90625, + "grad_norm": 6.46875, "learning_rate": 4.27180073375873e-06, - "logits/chosen": -1.411339521408081, - "logits/rejected": -1.2728986740112305, - "logps/chosen": -444.9125061035156, - "logps/rejected": -488.64794921875, - "loss": 0.5208, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5754519701004028, - "rewards/margins": 0.8528854250907898, - "rewards/rejected": -2.428337574005127, + "logits/chosen": -1.5489182472229004, + "logits/rejected": -1.402178168296814, + "logps/chosen": -442.7936096191406, + "logps/rejected": -477.34515380859375, + "loss": 0.5287, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5542631149291992, + "rewards/margins": 0.7610459327697754, + "rewards/rejected": -2.3153088092803955, "step": 1240 }, { "epoch": 0.33, - "grad_norm": 5.59375, + "grad_norm": 4.4375, "learning_rate": 4.255610341662304e-06, - "logits/chosen": -1.4314241409301758, - "logits/rejected": -1.2228209972381592, - "logps/chosen": -408.0103454589844, - "logps/rejected": -470.0995178222656, - "loss": 0.5539, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5272233486175537, - "rewards/margins": 0.8200508952140808, - "rewards/rejected": -2.3472743034362793, + "logits/chosen": -1.6110093593597412, + "logits/rejected": -1.398992896080017, + "logps/chosen": -380.208740234375, + "logps/rejected": -425.40838623046875, + "loss": 0.5553, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2492074966430664, + "rewards/margins": 0.6511562466621399, + "rewards/rejected": -1.900363564491272, "step": 1250 }, { "epoch": 0.33, - "grad_norm": 9.1875, + "grad_norm": 6.84375, "learning_rate": 4.2392733566075764e-06, - "logits/chosen": -1.4001052379608154, - "logits/rejected": -1.2643131017684937, - "logps/chosen": -446.30975341796875, - "logps/rejected": -492.1767578125, - "loss": 0.5838, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.9030348062515259, - "rewards/margins": 0.6141453981399536, - "rewards/rejected": -2.5171799659729004, + "logits/chosen": -1.59576416015625, + "logits/rejected": -1.4599517583847046, + "logps/chosen": -401.14984130859375, + "logps/rejected": -438.921630859375, + "loss": 0.591, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4514347314834595, + "rewards/margins": 0.5331937670707703, + "rewards/rejected": -1.984628438949585, "step": 1260 }, { "epoch": 0.33, - "grad_norm": 8.4375, + "grad_norm": 9.875, "learning_rate": 4.2227911427280975e-06, - "logits/chosen": -1.3647363185882568, - "logits/rejected": -1.1842234134674072, - "logps/chosen": -421.44952392578125, - "logps/rejected": -471.9312438964844, - "loss": 0.5187, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.711439847946167, - "rewards/margins": 0.8332679867744446, - "rewards/rejected": -2.544707775115967, + "logits/chosen": -1.5509364604949951, + "logits/rejected": -1.3630738258361816, + "logps/chosen": -384.2834777832031, + "logps/rejected": -420.7542419433594, + "loss": 0.5353, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.339779257774353, + "rewards/margins": 0.6931589841842651, + "rewards/rejected": -2.0329384803771973, "step": 1270 }, { "epoch": 0.33, - "grad_norm": 11.1875, + "grad_norm": 9.1875, "learning_rate": 4.206165076283983e-06, - "logits/chosen": -1.4086225032806396, - "logits/rejected": -1.258502721786499, - "logps/chosen": -433.7823791503906, - "logps/rejected": -515.7772827148438, - "loss": 0.4621, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.878491759300232, - "rewards/margins": 1.0307425260543823, - "rewards/rejected": -2.9092345237731934, + "logits/chosen": -1.5844643115997314, + "logits/rejected": -1.4324209690093994, + "logps/chosen": -375.78973388671875, + "logps/rejected": -440.9784240722656, + "loss": 0.4792, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.29856538772583, + "rewards/margins": 0.8626803159713745, + "rewards/rejected": -2.161245584487915, "step": 1280 }, { "epoch": 0.34, - "grad_norm": 12.1875, + "grad_norm": 10.5625, "learning_rate": 4.189396545546995e-06, - "logits/chosen": -1.3694835901260376, - "logits/rejected": -1.2587060928344727, - "logps/chosen": -476.4090270996094, - "logps/rejected": -558.2841796875, - "loss": 0.5179, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.272345781326294, - "rewards/margins": 1.0181748867034912, - "rewards/rejected": -3.290520429611206, + "logits/chosen": -1.5281752347946167, + "logits/rejected": -1.4283504486083984, + "logps/chosen": -397.5606384277344, + "logps/rejected": -468.21002197265625, + "loss": 0.5202, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4838616847991943, + "rewards/margins": 0.9059172868728638, + "rewards/rejected": -2.3897788524627686, "step": 1290 }, { "epoch": 0.34, - "grad_norm": 11.875, + "grad_norm": 10.9375, "learning_rate": 4.172486950684627e-06, - "logits/chosen": -1.3458369970321655, - "logits/rejected": -1.2686779499053955, - "logps/chosen": -503.9125061035156, - "logps/rejected": -597.0067138671875, - "loss": 0.5342, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.502702236175537, - "rewards/margins": 0.962365448474884, - "rewards/rejected": -3.4650676250457764, + "logits/chosen": -1.480257511138916, + "logits/rejected": -1.4012019634246826, + "logps/chosen": -429.61181640625, + "logps/rejected": -510.66522216796875, + "loss": 0.5332, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7596956491470337, + "rewards/margins": 0.8419567942619324, + "rewards/rejected": -2.6016526222229004, "step": 1300 }, { "epoch": 0.34, - "eval_logits/chosen": -1.1852304935455322, - "eval_logits/rejected": -1.065573811531067, - "eval_logps/chosen": -541.4911499023438, - "eval_logps/rejected": -629.0701293945312, - "eval_loss": 0.5145753026008606, + "eval_logits/chosen": -1.30724036693573, + "eval_logits/rejected": -1.1976608037948608, + "eval_logps/chosen": -478.32550048828125, + "eval_logps/rejected": -549.7024536132812, + "eval_loss": 0.5315085053443909, "eval_rewards/accuracies": 0.7239999771118164, - "eval_rewards/chosen": -2.7683911323547363, - "eval_rewards/margins": 1.0762238502502441, - "eval_rewards/rejected": -3.8446152210235596, - "eval_runtime": 384.8604, - "eval_samples_per_second": 5.197, - "eval_steps_per_second": 0.65, + "eval_rewards/chosen": -2.1367344856262207, + "eval_rewards/margins": 0.9142037630081177, + "eval_rewards/rejected": -3.050938367843628, + "eval_runtime": 385.0593, + "eval_samples_per_second": 5.194, + "eval_steps_per_second": 0.649, "step": 1300 }, { "epoch": 0.34, - "grad_norm": 10.875, + "grad_norm": 12.6875, "learning_rate": 4.155437703643182e-06, - "logits/chosen": -1.3424588441848755, - "logits/rejected": -1.1757004261016846, - "logps/chosen": -503.0860900878906, - "logps/rejected": -576.0645751953125, - "loss": 0.5175, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.563424587249756, - "rewards/margins": 1.0206806659698486, - "rewards/rejected": -3.5841050148010254, + "logits/chosen": -1.4552199840545654, + "logits/rejected": -1.306873083114624, + "logps/chosen": -439.85382080078125, + "logps/rejected": -500.3904724121094, + "loss": 0.5037, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9311021566390991, + "rewards/margins": 0.8962618708610535, + "rewards/rejected": -2.827363967895508, "step": 1310 }, { "epoch": 0.35, - "grad_norm": 12.5625, + "grad_norm": 12.8125, "learning_rate": 4.138250228029882e-06, - "logits/chosen": -1.2897417545318604, - "logits/rejected": -1.2073286771774292, - "logps/chosen": -513.29541015625, - "logps/rejected": -622.9282836914062, - "loss": 0.4944, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.5636470317840576, - "rewards/margins": 1.061576008796692, - "rewards/rejected": -3.625222682952881, + "logits/chosen": -1.482912302017212, + "logits/rejected": -1.403141736984253, + "logps/chosen": -424.140380859375, + "logps/rejected": -514.3765869140625, + "loss": 0.5066, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6720972061157227, + "rewards/margins": 0.8676088452339172, + "rewards/rejected": -2.539705753326416, "step": 1320 }, { "epoch": 0.35, - "grad_norm": 6.40625, + "grad_norm": 6.9375, "learning_rate": 4.120925958993994e-06, - "logits/chosen": -1.2789089679718018, - "logits/rejected": -1.1816047430038452, - "logps/chosen": -455.1483459472656, - "logps/rejected": -547.6185302734375, - "loss": 0.5614, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.212470293045044, - "rewards/margins": 0.9680255055427551, - "rewards/rejected": -3.180495500564575, + "logits/chosen": -1.4682929515838623, + "logits/rejected": -1.3645504713058472, + "logps/chosen": -376.16033935546875, + "logps/rejected": -447.6339416503906, + "loss": 0.5583, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4225904941558838, + "rewards/margins": 0.7580591440200806, + "rewards/rejected": -2.180649757385254, "step": 1330 }, { "epoch": 0.35, - "grad_norm": 10.875, + "grad_norm": 10.8125, "learning_rate": 4.103466343106999e-06, - "logits/chosen": -1.451428771018982, - "logits/rejected": -1.3242452144622803, - "logps/chosen": -471.908203125, - "logps/rejected": -530.3082275390625, - "loss": 0.531, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.0252251625061035, - "rewards/margins": 0.8427697420120239, - "rewards/rejected": -2.867994785308838, + "logits/chosen": -1.5599358081817627, + "logits/rejected": -1.4370046854019165, + "logps/chosen": -424.14849853515625, + "logps/rejected": -472.6615295410156, + "loss": 0.5315, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5476281642913818, + "rewards/margins": 0.743899941444397, + "rewards/rejected": -2.2915279865264893, "step": 1340 }, { "epoch": 0.35, - "grad_norm": 13.375, + "grad_norm": 8.625, "learning_rate": 4.085872838241797e-06, - "logits/chosen": -1.3740856647491455, - "logits/rejected": -1.2394291162490845, - "logps/chosen": -466.03948974609375, - "logps/rejected": -517.3243408203125, - "loss": 0.5872, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.0367541313171387, - "rewards/margins": 0.7136788368225098, - "rewards/rejected": -2.7504329681396484, + "logits/chosen": -1.464450716972351, + "logits/rejected": -1.3373545408248901, + "logps/chosen": -405.13262939453125, + "logps/rejected": -447.93994140625, + "loss": 0.5899, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.427685022354126, + "rewards/margins": 0.6289039850234985, + "rewards/rejected": -2.056588649749756, "step": 1350 }, { "epoch": 0.36, - "grad_norm": 11.625, + "grad_norm": 9.75, "learning_rate": 4.06814691345098e-06, - "logits/chosen": -1.3596174716949463, - "logits/rejected": -1.1983596086502075, - "logps/chosen": -435.5516052246094, - "logps/rejected": -502.25762939453125, - "loss": 0.4841, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.8000373840332031, - "rewards/margins": 0.8877509832382202, - "rewards/rejected": -2.687788486480713, + "logits/chosen": -1.452643871307373, + "logits/rejected": -1.2927871942520142, + "logps/chosen": -378.2747497558594, + "logps/rejected": -437.4178161621094, + "loss": 0.4989, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2272692918777466, + "rewards/margins": 0.8121210336685181, + "rewards/rejected": -2.0393900871276855, "step": 1360 }, { "epoch": 0.36, - "grad_norm": 14.8125, + "grad_norm": 10.4375, "learning_rate": 4.050290048844171e-06, - "logits/chosen": -1.4539238214492798, - "logits/rejected": -1.3574968576431274, - "logps/chosen": -447.8072204589844, - "logps/rejected": -525.6444091796875, - "loss": 0.5237, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.7919375896453857, - "rewards/margins": 0.829770565032959, - "rewards/rejected": -2.6217081546783447, + "logits/chosen": -1.572665810585022, + "logits/rejected": -1.4710958003997803, + "logps/chosen": -398.8462829589844, + "logps/rejected": -468.70794677734375, + "loss": 0.5368, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.30232834815979, + "rewards/margins": 0.750015139579773, + "rewards/rejected": -2.0523436069488525, "step": 1370 }, { "epoch": 0.36, - "grad_norm": 6.9375, + "grad_norm": 7.46875, "learning_rate": 4.032303735464422e-06, - "logits/chosen": -1.498198390007019, - "logits/rejected": -1.3165781497955322, - "logps/chosen": -449.3099670410156, - "logps/rejected": -533.7264404296875, - "loss": 0.4426, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.7732903957366943, - "rewards/margins": 1.075021505355835, - "rewards/rejected": -2.8483121395111084, + "logits/chosen": -1.6318330764770508, + "logits/rejected": -1.4836442470550537, + "logps/chosen": -405.6830749511719, + "logps/rejected": -475.7168884277344, + "loss": 0.4568, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.337021827697754, + "rewards/margins": 0.9311949610710144, + "rewards/rejected": -2.268216609954834, "step": 1380 }, { "epoch": 0.36, - "grad_norm": 8.0625, + "grad_norm": 9.6875, "learning_rate": 4.014189475163727e-06, - "logits/chosen": -1.2981091737747192, - "logits/rejected": -1.1843674182891846, - "logps/chosen": -436.0013732910156, - "logps/rejected": -536.7470703125, - "loss": 0.4771, + "logits/chosen": -1.4534022808074951, + "logits/rejected": -1.3461982011795044, + "logps/chosen": -380.7342224121094, + "logps/rejected": -464.63916015625, + "loss": 0.4968, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.8134443759918213, - "rewards/margins": 1.0886361598968506, - "rewards/rejected": -2.902080535888672, + "rewards/chosen": -1.2607730627059937, + "rewards/margins": 0.9202286005020142, + "rewards/rejected": -2.181001663208008, "step": 1390 }, { "epoch": 0.37, - "grad_norm": 16.5, + "grad_norm": 15.0625, "learning_rate": 3.995948780477605e-06, - "logits/chosen": -1.4245179891586304, - "logits/rejected": -1.2672098875045776, - "logps/chosen": -442.8516540527344, - "logps/rejected": -501.25775146484375, - "loss": 0.5287, + "logits/chosen": -1.5742177963256836, + "logits/rejected": -1.410463809967041, + "logps/chosen": -382.19830322265625, + "logps/rejected": -427.6187438964844, + "loss": 0.5431, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.7409827709197998, - "rewards/margins": 0.8305804133415222, - "rewards/rejected": -2.571563243865967, + "rewards/chosen": -1.1344490051269531, + "rewards/margins": 0.7007244825363159, + "rewards/rejected": -1.8351733684539795, "step": 1400 }, { "epoch": 0.37, - "eval_logits/chosen": -1.2506229877471924, - "eval_logits/rejected": -1.1252646446228027, - "eval_logps/chosen": -455.32855224609375, - "eval_logps/rejected": -530.7439575195312, - "eval_loss": 0.5196547508239746, - "eval_rewards/accuracies": 0.7235000133514404, - "eval_rewards/chosen": -1.9067655801773071, - "eval_rewards/margins": 0.9545875191688538, - "eval_rewards/rejected": -2.8613533973693848, - "eval_runtime": 384.8833, - "eval_samples_per_second": 5.196, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.431371808052063, + "eval_logits/rejected": -1.3129903078079224, + "eval_logps/chosen": -390.28460693359375, + "eval_logps/rejected": -454.35223388671875, + "eval_loss": 0.521051287651062, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.2563258409500122, + "eval_rewards/margins": 0.841110348701477, + "eval_rewards/rejected": -2.09743595123291, + "eval_runtime": 385.3298, + "eval_samples_per_second": 5.19, + "eval_steps_per_second": 0.649, "step": 1400 }, { "epoch": 0.37, - "grad_norm": 10.0, + "grad_norm": 10.875, "learning_rate": 3.977583174498816e-06, - "logits/chosen": -1.296602487564087, - "logits/rejected": -1.1704097986221313, - "logps/chosen": -470.2802734375, - "logps/rejected": -584.6675415039062, - "loss": 0.3797, - "rewards/accuracies": 0.84375, - "rewards/chosen": -2.065967082977295, - "rewards/margins": 1.363657832145691, - "rewards/rejected": -3.4296250343322754, + "logits/chosen": -1.4515248537063599, + "logits/rejected": -1.3351339101791382, + "logps/chosen": -412.0836486816406, + "logps/rejected": -511.7002868652344, + "loss": 0.3984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4840004444122314, + "rewards/margins": 1.215951681137085, + "rewards/rejected": -2.6999518871307373, "step": 1410 }, { "epoch": 0.37, - "grad_norm": 12.75, + "grad_norm": 10.375, "learning_rate": 3.959094190750172e-06, - "logits/chosen": -1.2688853740692139, - "logits/rejected": -1.1229419708251953, - "logps/chosen": -514.8965454101562, - "logps/rejected": -601.8798217773438, - "loss": 0.5131, + "logits/chosen": -1.4154666662216187, + "logits/rejected": -1.2808506488800049, + "logps/chosen": -463.95367431640625, + "logps/rejected": -530.5446166992188, + "loss": 0.5238, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.297360897064209, - "rewards/margins": 1.1389044523239136, - "rewards/rejected": -3.436265468597412, + "rewards/chosen": -1.7879329919815063, + "rewards/margins": 0.9349812269210815, + "rewards/rejected": -2.722913980484009, "step": 1420 }, { "epoch": 0.37, - "grad_norm": 23.625, + "grad_norm": 14.625, "learning_rate": 3.9404833730564975e-06, - "logits/chosen": -1.12982177734375, - "logits/rejected": -1.0124092102050781, - "logps/chosen": -485.57647705078125, - "logps/rejected": -589.0986328125, - "loss": 0.509, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.324758529663086, - "rewards/margins": 1.1509692668914795, - "rewards/rejected": -3.4757277965545654, + "logits/chosen": -1.3400425910949707, + "logits/rejected": -1.2239243984222412, + "logps/chosen": -414.04168701171875, + "logps/rejected": -493.5077209472656, + "loss": 0.5162, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6094110012054443, + "rewards/margins": 0.910406768321991, + "rewards/rejected": -2.51981782913208, "step": 1430 }, { "epoch": 0.38, - "grad_norm": 18.5, + "grad_norm": 13.0, "learning_rate": 3.921752275415712e-06, - "logits/chosen": -1.1547746658325195, - "logits/rejected": -1.0837663412094116, - "logps/chosen": -519.43994140625, - "logps/rejected": -648.2784423828125, - "loss": 0.4382, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.689040184020996, - "rewards/margins": 1.4942899942398071, - "rewards/rejected": -4.1833295822143555, + "logits/chosen": -1.4123733043670654, + "logits/rejected": -1.379097580909729, + "logps/chosen": -400.0645751953125, + "logps/rejected": -482.3004455566406, + "loss": 0.455, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.495286464691162, + "rewards/margins": 1.028262734413147, + "rewards/rejected": -2.5235490798950195, "step": 1440 }, { "epoch": 0.38, - "grad_norm": 7.125, + "grad_norm": 6.40625, "learning_rate": 3.902902461869079e-06, - "logits/chosen": -1.122580885887146, - "logits/rejected": -0.9964879155158997, - "logps/chosen": -557.9876098632812, - "logps/rejected": -675.2171630859375, - "loss": 0.5761, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.184739351272583, - "rewards/margins": 1.3369649648666382, - "rewards/rejected": -4.52170467376709, + "logits/chosen": -1.3998125791549683, + "logits/rejected": -1.2797114849090576, + "logps/chosen": -421.95794677734375, + "logps/rejected": -507.14361572265625, + "loss": 0.5415, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8244426250457764, + "rewards/margins": 1.0165250301361084, + "rewards/rejected": -2.840967893600464, "step": 1450 }, { "epoch": 0.38, - "grad_norm": 14.4375, + "grad_norm": 15.8125, "learning_rate": 3.883935506370605e-06, - "logits/chosen": -1.2318613529205322, - "logits/rejected": -1.1085087060928345, - "logps/chosen": -504.7203674316406, - "logps/rejected": -563.6139526367188, - "loss": 0.5652, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.5147933959960938, - "rewards/margins": 0.8672431111335754, - "rewards/rejected": -3.3820366859436035, + "logits/chosen": -1.4051461219787598, + "logits/rejected": -1.2663236856460571, + "logps/chosen": -432.9169921875, + "logps/rejected": -484.6170349121094, + "loss": 0.5752, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7967593669891357, + "rewards/margins": 0.7953070402145386, + "rewards/rejected": -2.592066526412964, "step": 1460 }, { "epoch": 0.38, - "grad_norm": 4.46875, + "grad_norm": 5.71875, "learning_rate": 3.864852992655617e-06, - "logits/chosen": -1.3699067831039429, - "logits/rejected": -1.2695183753967285, - "logps/chosen": -446.06903076171875, - "logps/rejected": -533.8704833984375, - "loss": 0.4493, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.9743432998657227, - "rewards/margins": 1.0008807182312012, - "rewards/rejected": -2.975224256515503, + "logits/chosen": -1.5188504457473755, + "logits/rejected": -1.4224086999893188, + "logps/chosen": -385.0553283691406, + "logps/rejected": -460.64166259765625, + "loss": 0.4617, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.364206075668335, + "rewards/margins": 0.8787292242050171, + "rewards/rejected": -2.2429351806640625, "step": 1470 }, { "epoch": 0.39, - "grad_norm": 7.40625, + "grad_norm": 6.0, "learning_rate": 3.845656514108516e-06, - "logits/chosen": -1.3080308437347412, - "logits/rejected": -1.1480329036712646, - "logps/chosen": -499.15087890625, - "logps/rejected": -540.1708984375, - "loss": 0.5052, + "logits/chosen": -1.4730474948883057, + "logits/rejected": -1.3063628673553467, + "logps/chosen": -420.05364990234375, + "logps/rejected": -448.61663818359375, + "loss": 0.4919, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.400202512741089, - "rewards/margins": 0.9600592851638794, - "rewards/rejected": -3.360261917114258, + "rewards/chosen": -1.609230637550354, + "rewards/margins": 0.8354890942573547, + "rewards/rejected": -2.4447197914123535, "step": 1480 }, { "epoch": 0.39, - "grad_norm": 10.0625, + "grad_norm": 15.6875, "learning_rate": 3.826347673629738e-06, - "logits/chosen": -1.3206231594085693, - "logits/rejected": -1.1406716108322144, - "logps/chosen": -456.264404296875, - "logps/rejected": -542.2021484375, - "loss": 0.4827, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.029609203338623, - "rewards/margins": 1.1219456195831299, - "rewards/rejected": -3.151554822921753, + "logits/chosen": -1.447205901145935, + "logits/rejected": -1.2630943059921265, + "logps/chosen": -382.7901916503906, + "logps/rejected": -455.2850036621094, + "loss": 0.4846, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2948672771453857, + "rewards/margins": 0.9875162243843079, + "rewards/rejected": -2.282383441925049, "step": 1490 }, { "epoch": 0.39, - "grad_norm": 16.375, + "grad_norm": 13.4375, "learning_rate": 3.8069280835019062e-06, - "logits/chosen": -1.3296412229537964, - "logits/rejected": -1.1789557933807373, - "logps/chosen": -472.45135498046875, - "logps/rejected": -577.9127197265625, - "loss": 0.4634, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.0813910961151123, - "rewards/margins": 1.246517300605774, - "rewards/rejected": -3.3279082775115967, + "logits/chosen": -1.4306355714797974, + "logits/rejected": -1.2892208099365234, + "logps/chosen": -402.4939880371094, + "logps/rejected": -487.1109313964844, + "loss": 0.4862, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.38181734085083, + "rewards/margins": 1.0380725860595703, + "rewards/rejected": -2.4198899269104004, "step": 1500 }, { "epoch": 0.39, - "eval_logits/chosen": -1.2696473598480225, - "eval_logits/rejected": -1.1408381462097168, - "eval_logps/chosen": -478.6543884277344, - "eval_logps/rejected": -568.5231323242188, - "eval_loss": 0.516459047794342, - "eval_rewards/accuracies": 0.7294999957084656, - "eval_rewards/chosen": -2.1400234699249268, - "eval_rewards/margins": 1.0991216897964478, - "eval_rewards/rejected": -3.239145040512085, - "eval_runtime": 384.85, - "eval_samples_per_second": 5.197, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.4015111923217773, + "eval_logits/rejected": -1.2794849872589111, + "eval_logps/chosen": -401.4261779785156, + "eval_logps/rejected": -472.0146179199219, + "eval_loss": 0.5161935091018677, + "eval_rewards/accuracies": 0.7354999780654907, + "eval_rewards/chosen": -1.3677420616149902, + "eval_rewards/margins": 0.9063177704811096, + "eval_rewards/rejected": -2.274059534072876, + "eval_runtime": 384.9141, + "eval_samples_per_second": 5.196, + "eval_steps_per_second": 0.649, "step": 1500 }, { "epoch": 0.4, - "grad_norm": 18.75, + "grad_norm": 10.0625, "learning_rate": 3.7873993652552077e-06, - "logits/chosen": -1.2893366813659668, - "logits/rejected": -1.1984317302703857, - "logps/chosen": -447.75994873046875, - "logps/rejected": -531.8748779296875, - "loss": 0.6089, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.140458583831787, - "rewards/margins": 0.8995217084884644, - "rewards/rejected": -3.039980173110962, + "logits/chosen": -1.4077152013778687, + "logits/rejected": -1.3199503421783447, + "logps/chosen": -359.19488525390625, + "logps/rejected": -424.65576171875, + "loss": 0.6047, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2548080682754517, + "rewards/margins": 0.7129807472229004, + "rewards/rejected": -1.9677889347076416, "step": 1510 }, { "epoch": 0.4, - "grad_norm": 9.375, + "grad_norm": 8.1875, "learning_rate": 3.7677631495319953e-06, - "logits/chosen": -1.4299360513687134, - "logits/rejected": -1.3164293766021729, - "logps/chosen": -420.2286682128906, - "logps/rejected": -484.6932067871094, - "loss": 0.5232, + "logits/chosen": -1.5366017818450928, + "logits/rejected": -1.420841932296753, + "logps/chosen": -355.3591613769531, + "logps/rejected": -406.9905700683594, + "loss": 0.5263, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.5654377937316895, - "rewards/margins": 0.824730396270752, - "rewards/rejected": -2.3901684284210205, + "rewards/chosen": -0.9167426228523254, + "rewards/margins": 0.6963993310928345, + "rewards/rejected": -1.6131420135498047, "step": 1520 }, { "epoch": 0.4, - "grad_norm": 6.65625, + "grad_norm": 6.15625, "learning_rate": 3.748021075950633e-06, - "logits/chosen": -1.4940413236618042, - "logits/rejected": -1.3882437944412231, - "logps/chosen": -430.9405822753906, - "logps/rejected": -477.81365966796875, - "loss": 0.5819, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.5629723072052002, - "rewards/margins": 0.5750766396522522, - "rewards/rejected": -2.1380488872528076, + "logits/chosen": -1.5663089752197266, + "logits/rejected": -1.4497790336608887, + "logps/chosen": -371.51312255859375, + "logps/rejected": -410.3604431152344, + "loss": 0.5946, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9686979055404663, + "rewards/margins": 0.49481868743896484, + "rewards/rejected": -1.4635167121887207, "step": 1530 }, { "epoch": 0.4, - "grad_norm": 9.5625, + "grad_norm": 9.9375, "learning_rate": 3.7281747929685824e-06, - "logits/chosen": -1.3339357376098633, - "logits/rejected": -1.1862207651138306, - "logps/chosen": -417.4947814941406, - "logps/rejected": -475.46160888671875, - "loss": 0.5335, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.7983973026275635, - "rewards/margins": 0.7366180419921875, - "rewards/rejected": -2.535015106201172, + "logits/chosen": -1.4247326850891113, + "logits/rejected": -1.265855073928833, + "logps/chosen": -353.05194091796875, + "logps/rejected": -399.3148498535156, + "loss": 0.548, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1539690494537354, + "rewards/margins": 0.6195784211158752, + "rewards/rejected": -1.7735474109649658, "step": 1540 }, { "epoch": 0.41, - "grad_norm": 7.5625, + "grad_norm": 7.59375, "learning_rate": 3.7082259577447604e-06, - "logits/chosen": -1.4374284744262695, - "logits/rejected": -1.3284789323806763, - "logps/chosen": -462.6572265625, - "logps/rejected": -520.775390625, - "loss": 0.4858, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.9146639108657837, - "rewards/margins": 0.7811635732650757, - "rewards/rejected": -2.6958274841308594, + "logits/chosen": -1.5184131860733032, + "logits/rejected": -1.4079376459121704, + "logps/chosen": -389.82550048828125, + "logps/rejected": -445.1400451660156, + "loss": 0.4885, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1863467693328857, + "rewards/margins": 0.7531275749206543, + "rewards/rejected": -1.9394744634628296, "step": 1550 }, { "epoch": 0.41, - "grad_norm": 9.875, + "grad_norm": 8.5625, "learning_rate": 3.6881762360011688e-06, - "logits/chosen": -1.45357346534729, - "logits/rejected": -1.2578237056732178, - "logps/chosen": -482.557861328125, - "logps/rejected": -538.22705078125, - "loss": 0.5034, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.010615110397339, - "rewards/margins": 0.9266406297683716, - "rewards/rejected": -2.937255620956421, + "logits/chosen": -1.5098861455917358, + "logits/rejected": -1.317479133605957, + "logps/chosen": -411.7156677246094, + "logps/rejected": -458.67218017578125, + "loss": 0.5111, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3021931648254395, + "rewards/margins": 0.8395140767097473, + "rewards/rejected": -2.141706943511963, "step": 1560 }, { "epoch": 0.41, - "grad_norm": 9.3125, + "grad_norm": 10.8125, "learning_rate": 3.668027301883802e-06, - "logits/chosen": -1.3933765888214111, - "logits/rejected": -1.2354159355163574, - "logps/chosen": -465.87628173828125, - "logps/rejected": -557.3919067382812, - "loss": 0.4966, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.145137310028076, - "rewards/margins": 1.0642468929290771, - "rewards/rejected": -3.209383726119995, + "logits/chosen": -1.4269211292266846, + "logits/rejected": -1.2615479230880737, + "logps/chosen": -402.62603759765625, + "logps/rejected": -482.184326171875, + "loss": 0.511, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5126349925994873, + "rewards/margins": 0.9446732401847839, + "rewards/rejected": -2.457308053970337, "step": 1570 }, { "epoch": 0.41, - "grad_norm": 6.3125, + "grad_norm": 6.09375, "learning_rate": 3.64778083782286e-06, - "logits/chosen": -1.283038854598999, - "logits/rejected": -1.264458417892456, - "logps/chosen": -504.4510803222656, - "logps/rejected": -629.853271484375, - "loss": 0.5256, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.5184431076049805, - "rewards/margins": 0.9640257954597473, - "rewards/rejected": -3.482469081878662, + "logits/chosen": -1.2994117736816406, + "logits/rejected": -1.2819687128067017, + "logps/chosen": -454.22711181640625, + "logps/rejected": -568.9495239257812, + "loss": 0.5489, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0162034034729004, + "rewards/margins": 0.8572282791137695, + "rewards/rejected": -2.87343168258667, "step": 1580 }, { "epoch": 0.42, - "grad_norm": 12.1875, + "grad_norm": 8.125, "learning_rate": 3.627438534392268e-06, - "logits/chosen": -1.3820140361785889, - "logits/rejected": -1.3510745763778687, - "logps/chosen": -486.08721923828125, - "logps/rejected": -596.9356689453125, - "loss": 0.5014, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.4804224967956543, - "rewards/margins": 1.0413092374801636, - "rewards/rejected": -3.5217316150665283, + "logits/chosen": -1.4073131084442139, + "logits/rejected": -1.3753129243850708, + "logps/chosen": -438.55255126953125, + "logps/rejected": -532.357421875, + "loss": 0.4994, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0050759315490723, + "rewards/margins": 0.8708721399307251, + "rewards/rejected": -2.875947952270508, "step": 1590 }, { "epoch": 0.42, - "grad_norm": 7.9375, + "grad_norm": 10.5625, "learning_rate": 3.607002090168506e-06, - "logits/chosen": -1.2637202739715576, - "logits/rejected": -1.1885454654693604, - "logps/chosen": -528.8330688476562, - "logps/rejected": -598.1657104492188, - "loss": 0.5551, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.6452443599700928, - "rewards/margins": 0.9074187278747559, - "rewards/rejected": -3.5526630878448486, + "logits/chosen": -1.2787964344024658, + "logits/rejected": -1.2062056064605713, + "logps/chosen": -478.2181091308594, + "logps/rejected": -532.1177978515625, + "loss": 0.5858, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.139094829559326, + "rewards/margins": 0.7530891299247742, + "rewards/rejected": -2.892183780670166, "step": 1600 }, { "epoch": 0.42, - "eval_logits/chosen": -1.2402483224868774, - "eval_logits/rejected": -1.1162159442901611, - "eval_logps/chosen": -512.13427734375, - "eval_logps/rejected": -599.2672119140625, - "eval_loss": 0.5056775212287903, - "eval_rewards/accuracies": 0.7310000061988831, - "eval_rewards/chosen": -2.474822521209717, - "eval_rewards/margins": 1.0717631578445435, - "eval_rewards/rejected": -3.5465855598449707, - "eval_runtime": 384.9561, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.2717995643615723, + "eval_logits/rejected": -1.1533604860305786, + "eval_logps/chosen": -445.6515197753906, + "eval_logps/rejected": -514.567138671875, + "eval_loss": 0.5072752833366394, + "eval_rewards/accuracies": 0.7365000247955322, + "eval_rewards/chosen": -1.809995174407959, + "eval_rewards/margins": 0.889590322971344, + "eval_rewards/rejected": -2.6995856761932373, + "eval_runtime": 385.2379, + "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 1600 }, { "epoch": 0.42, - "grad_norm": 4.5625, + "grad_norm": 4.8125, "learning_rate": 3.586473211588787e-06, - "logits/chosen": -1.3531196117401123, - "logits/rejected": -1.2485313415527344, - "logps/chosen": -474.69183349609375, - "logps/rejected": -598.9015502929688, - "loss": 0.4461, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.299967050552368, - "rewards/margins": 1.1753101348876953, - "rewards/rejected": -3.4752774238586426, + "logits/chosen": -1.3733545541763306, + "logits/rejected": -1.2681185007095337, + "logps/chosen": -407.07623291015625, + "logps/rejected": -509.69683837890625, + "loss": 0.4615, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6238105297088623, + "rewards/margins": 0.9594193696975708, + "rewards/rejected": -2.5832300186157227, "step": 1610 }, { "epoch": 0.42, - "grad_norm": 16.125, + "grad_norm": 11.125, "learning_rate": 3.5658536128085623e-06, - "logits/chosen": -1.397863745689392, - "logits/rejected": -1.2203514575958252, - "logps/chosen": -538.4635009765625, - "logps/rejected": -599.5867919921875, - "loss": 0.5809, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.7555203437805176, - "rewards/margins": 0.9067786931991577, - "rewards/rejected": -3.6622989177703857, + "logits/chosen": -1.3982038497924805, + "logits/rejected": -1.2271344661712646, + "logps/chosen": -460.24951171875, + "logps/rejected": -503.8080139160156, + "loss": 0.595, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.9733803272247314, + "rewards/margins": 0.7311316728591919, + "rewards/rejected": -2.704512119293213, "step": 1620 }, { "epoch": 0.43, - "grad_norm": 8.0, + "grad_norm": 8.625, "learning_rate": 3.545145015558399e-06, - "logits/chosen": -1.198150873184204, - "logits/rejected": -1.1703031063079834, - "logps/chosen": -492.5457458496094, - "logps/rejected": -591.6585083007812, - "loss": 0.484, + "logits/chosen": -1.1945741176605225, + "logits/rejected": -1.1713488101959229, + "logps/chosen": -412.6747131347656, + "logps/rejected": -492.372802734375, + "loss": 0.5028, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.609083652496338, - "rewards/margins": 1.1332842111587524, - "rewards/rejected": -3.7423675060272217, + "rewards/chosen": -1.8103736639022827, + "rewards/margins": 0.9391372799873352, + "rewards/rejected": -2.7495107650756836, "step": 1630 }, { "epoch": 0.43, - "grad_norm": 5.375, + "grad_norm": 8.9375, "learning_rate": 3.5243491490002056e-06, - "logits/chosen": -1.3408269882202148, - "logits/rejected": -1.2534765005111694, - "logps/chosen": -525.4117431640625, - "logps/rejected": -613.4912109375, - "loss": 0.5814, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.717186450958252, - "rewards/margins": 0.9326400756835938, - "rewards/rejected": -3.649826765060425, + "logits/chosen": -1.3308615684509277, + "logits/rejected": -1.2446686029434204, + "logps/chosen": -433.484375, + "logps/rejected": -507.3377990722656, + "loss": 0.5688, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7979129552841187, + "rewards/margins": 0.7903792262077332, + "rewards/rejected": -2.588292360305786, "step": 1640 }, { "epoch": 0.43, - "grad_norm": 10.5, + "grad_norm": 7.8125, "learning_rate": 3.503467749582857e-06, - "logits/chosen": -1.4275528192520142, - "logits/rejected": -1.233086109161377, - "logps/chosen": -476.9230041503906, - "logps/rejected": -512.470703125, - "loss": 0.594, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.256998300552368, - "rewards/margins": 0.6955486536026001, - "rewards/rejected": -2.9525468349456787, + "logits/chosen": -1.378259301185608, + "logits/rejected": -1.1882727146148682, + "logps/chosen": -412.93560791015625, + "logps/rejected": -446.2088317871094, + "loss": 0.5722, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6171245574951172, + "rewards/margins": 0.6728037595748901, + "rewards/rejected": -2.2899281978607178, "step": 1650 }, { "epoch": 0.43, - "grad_norm": 11.375, + "grad_norm": 14.125, "learning_rate": 3.4825025608971947e-06, - "logits/chosen": -1.347503662109375, - "logits/rejected": -1.2736327648162842, - "logps/chosen": -408.91351318359375, - "logps/rejected": -484.05926513671875, - "loss": 0.5229, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.8696982860565186, - "rewards/margins": 0.7134873270988464, - "rewards/rejected": -2.583185911178589, + "logits/chosen": -1.2760392427444458, + "logits/rejected": -1.2017720937728882, + "logps/chosen": -379.2555847167969, + "logps/rejected": -455.60791015625, + "loss": 0.5323, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.573118805885315, + "rewards/margins": 0.7255537509918213, + "rewards/rejected": -2.2986724376678467, "step": 1660 }, { "epoch": 0.44, - "grad_norm": 6.375, + "grad_norm": 7.34375, "learning_rate": 3.4614553335304407e-06, - "logits/chosen": -1.3952078819274902, - "logits/rejected": -1.1842278242111206, - "logps/chosen": -461.0892028808594, - "logps/rejected": -531.4512939453125, - "loss": 0.4605, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9217849969863892, - "rewards/margins": 1.02143132686615, - "rewards/rejected": -2.9432168006896973, + "logits/chosen": -1.3151836395263672, + "logits/rejected": -1.113488793373108, + "logps/chosen": -440.9947814941406, + "logps/rejected": -505.5106506347656, + "loss": 0.4714, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7208404541015625, + "rewards/margins": 0.9629707336425781, + "rewards/rejected": -2.6838109493255615, "step": 1670 }, { "epoch": 0.44, - "grad_norm": 10.875, + "grad_norm": 7.28125, "learning_rate": 3.4403278249200222e-06, - "logits/chosen": -1.3681023120880127, - "logits/rejected": -1.159461259841919, - "logps/chosen": -495.4026794433594, - "logps/rejected": -574.2462768554688, - "loss": 0.4479, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.127955913543701, - "rewards/margins": 1.2076514959335327, - "rewards/rejected": -3.3356070518493652, + "logits/chosen": -1.289880633354187, + "logits/rejected": -1.0922951698303223, + "logps/chosen": -470.2997131347656, + "logps/rejected": -540.0161743164062, + "loss": 0.4409, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8769254684448242, + "rewards/margins": 1.1163800954818726, + "rewards/rejected": -2.9933059215545654, "step": 1680 }, { "epoch": 0.44, - "grad_norm": 15.1875, + "grad_norm": 15.75, "learning_rate": 3.4191217992068293e-06, - "logits/chosen": -1.4043166637420654, - "logits/rejected": -1.2147849798202515, - "logps/chosen": -536.7705688476562, - "logps/rejected": -593.9422607421875, - "loss": 0.5436, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.586280345916748, - "rewards/margins": 1.0274693965911865, - "rewards/rejected": -3.6137497425079346, + "logits/chosen": -1.3650540113449097, + "logits/rejected": -1.1904373168945312, + "logps/chosen": -491.87060546875, + "logps/rejected": -539.9581298828125, + "loss": 0.5323, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.137281656265259, + "rewards/margins": 0.9366267323493958, + "rewards/rejected": -3.0739083290100098, "step": 1690 }, { "epoch": 0.44, - "grad_norm": 11.5625, + "grad_norm": 11.8125, "learning_rate": 3.3978390270879056e-06, - "logits/chosen": -1.2982449531555176, - "logits/rejected": -1.202523946762085, - "logps/chosen": -495.7208557128906, - "logps/rejected": -613.0609130859375, - "loss": 0.5183, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.8570523262023926, - "rewards/margins": 1.105515480041504, - "rewards/rejected": -3.9625678062438965, + "logits/chosen": -1.273272156715393, + "logits/rejected": -1.1826374530792236, + "logps/chosen": -441.7779235839844, + "logps/rejected": -540.5211791992188, + "loss": 0.5147, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3176229000091553, + "rewards/margins": 0.9195470809936523, + "rewards/rejected": -3.2371699810028076, "step": 1700 }, { "epoch": 0.44, - "eval_logits/chosen": -1.2783551216125488, - "eval_logits/rejected": -1.1493244171142578, - "eval_logps/chosen": -543.2153930664062, - "eval_logps/rejected": -629.583251953125, - "eval_loss": 0.49932965636253357, - "eval_rewards/accuracies": 0.7390000224113464, - "eval_rewards/chosen": -2.7856333255767822, - "eval_rewards/margins": 1.0641134977340698, - "eval_rewards/rejected": -3.8497471809387207, - "eval_runtime": 384.9856, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.269109845161438, + "eval_logits/rejected": -1.146828293800354, + "eval_logps/chosen": -491.4620666503906, + "eval_logps/rejected": -566.2828979492188, + "eval_loss": 0.5000255107879639, + "eval_rewards/accuracies": 0.734000027179718, + "eval_rewards/chosen": -2.2681005001068115, + "eval_rewards/margins": 0.9486428499221802, + "eval_rewards/rejected": -3.2167434692382812, + "eval_runtime": 385.0866, + "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 1700 }, { "epoch": 0.45, - "grad_norm": 12.3125, + "grad_norm": 11.6875, "learning_rate": 3.3764812856685995e-06, - "logits/chosen": -1.3749181032180786, - "logits/rejected": -1.3575098514556885, - "logps/chosen": -486.59808349609375, - "logps/rejected": -601.6511840820312, - "loss": 0.5154, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.6204190254211426, - "rewards/margins": 1.015373945236206, - "rewards/rejected": -3.6357929706573486, + "logits/chosen": -1.3418161869049072, + "logits/rejected": -1.3261712789535522, + "logps/chosen": -440.2900390625, + "logps/rejected": -541.0260009765625, + "loss": 0.5252, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1573386192321777, + "rewards/margins": 0.8722022175788879, + "rewards/rejected": -3.029540777206421, "step": 1710 }, { "epoch": 0.45, - "grad_norm": 9.5625, + "grad_norm": 10.875, "learning_rate": 3.3550503583141726e-06, - "logits/chosen": -1.4978001117706299, - "logits/rejected": -1.3547831773757935, - "logps/chosen": -509.7710876464844, - "logps/rejected": -603.9198608398438, - "loss": 0.4785, + "logits/chosen": -1.4454267024993896, + "logits/rejected": -1.311650276184082, + "logps/chosen": -485.03411865234375, + "logps/rejected": -569.6810913085938, + "loss": 0.4849, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.409210205078125, - "rewards/margins": 1.0928349494934082, - "rewards/rejected": -3.5020453929901123, + "rewards/chosen": -2.1618409156799316, + "rewards/margins": 0.9978164434432983, + "rewards/rejected": -3.1596572399139404, "step": 1720 }, { "epoch": 0.45, - "grad_norm": 9.8125, + "grad_norm": 8.375, "learning_rate": 3.3335480345008907e-06, - "logits/chosen": -1.359413743019104, - "logits/rejected": -1.2617199420928955, - "logps/chosen": -466.9374084472656, - "logps/rejected": -555.6993408203125, - "loss": 0.4525, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.067884922027588, - "rewards/margins": 1.134508728981018, - "rewards/rejected": -3.2023932933807373, + "logits/chosen": -1.2839902639389038, + "logits/rejected": -1.1861859560012817, + "logps/chosen": -466.77850341796875, + "logps/rejected": -553.5386352539062, + "loss": 0.4622, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0662953853607178, + "rewards/margins": 1.1144917011260986, + "rewards/rejected": -3.1807870864868164, "step": 1730 }, { "epoch": 0.46, - "grad_norm": 7.875, + "grad_norm": 8.0, "learning_rate": 3.3119761096666055e-06, - "logits/chosen": -1.4102518558502197, - "logits/rejected": -1.2553739547729492, - "logps/chosen": -483.46356201171875, - "logps/rejected": -535.0630493164062, - "loss": 0.5686, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0844430923461914, - "rewards/margins": 0.8192558288574219, - "rewards/rejected": -2.9036991596221924, + "logits/chosen": -1.3106259107589722, + "logits/rejected": -1.1651959419250488, + "logps/chosen": -498.0738830566406, + "logps/rejected": -552.7239379882812, + "loss": 0.5547, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.230546236038208, + "rewards/margins": 0.8497620820999146, + "rewards/rejected": -3.080308437347412, "step": 1740 }, { "epoch": 0.46, - "grad_norm": 8.875, + "grad_norm": 7.6875, "learning_rate": 3.290336385060832e-06, - "logits/chosen": -1.573932409286499, - "logits/rejected": -1.365490436553955, - "logps/chosen": -463.2930603027344, - "logps/rejected": -536.6669921875, - "loss": 0.5262, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.1443819999694824, - "rewards/margins": 0.9354622960090637, - "rewards/rejected": -3.0798439979553223, + "logits/chosen": -1.493554949760437, + "logits/rejected": -1.2929532527923584, + "logps/chosen": -479.22259521484375, + "logps/rejected": -548.3055419921875, + "loss": 0.55, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3036773204803467, + "rewards/margins": 0.8925528526306152, + "rewards/rejected": -3.196229934692383, "step": 1750 }, { "epoch": 0.46, - "grad_norm": 10.125, + "grad_norm": 9.3125, "learning_rate": 3.268630667594348e-06, - "logits/chosen": -1.4119102954864502, - "logits/rejected": -1.375628113746643, - "logps/chosen": -442.0873107910156, - "logps/rejected": -511.404296875, - "loss": 0.5134, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.8643802404403687, - "rewards/margins": 0.9355432391166687, - "rewards/rejected": -2.7999236583709717, + "logits/chosen": -1.355196237564087, + "logits/rejected": -1.3183298110961914, + "logps/chosen": -460.26336669921875, + "logps/rejected": -523.685546875, + "loss": 0.5176, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.046140670776367, + "rewards/margins": 0.8765950202941895, + "rewards/rejected": -2.9227356910705566, "step": 1760 }, { "epoch": 0.46, - "grad_norm": 8.5625, + "grad_norm": 11.125, "learning_rate": 3.2468607696883147e-06, - "logits/chosen": -1.4483537673950195, - "logits/rejected": -1.3931195735931396, - "logps/chosen": -452.58056640625, - "logps/rejected": -550.0877075195312, - "loss": 0.4914, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.9940522909164429, - "rewards/margins": 0.9315992593765259, - "rewards/rejected": -2.9256515502929688, + "logits/chosen": -1.3625749349594116, + "logits/rejected": -1.311535358428955, + "logps/chosen": -489.18017578125, + "logps/rejected": -587.8863525390625, + "loss": 0.4934, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.360048294067383, + "rewards/margins": 0.9435898065567017, + "rewards/rejected": -3.303637742996216, "step": 1770 }, { "epoch": 0.47, - "grad_norm": 6.40625, + "grad_norm": 7.78125, "learning_rate": 3.225028509122944e-06, - "logits/chosen": -1.5014052391052246, - "logits/rejected": -1.3815762996673584, - "logps/chosen": -431.58447265625, - "logps/rejected": -498.1240234375, - "loss": 0.5295, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.9506728649139404, - "rewards/margins": 0.7785587310791016, - "rewards/rejected": -2.729231595993042, + "logits/chosen": -1.397005319595337, + "logits/rejected": -1.2728253602981567, + "logps/chosen": -486.8643493652344, + "logps/rejected": -561.2532958984375, + "loss": 0.5211, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.503471851348877, + "rewards/margins": 0.8570526838302612, + "rewards/rejected": -3.3605244159698486, "step": 1780 }, { "epoch": 0.47, - "grad_norm": 11.8125, + "grad_norm": 11.6875, "learning_rate": 3.2031357088857083e-06, - "logits/chosen": -1.4752798080444336, - "logits/rejected": -1.3948938846588135, - "logps/chosen": -494.77056884765625, - "logps/rejected": -584.5908813476562, - "loss": 0.5126, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.1956658363342285, - "rewards/margins": 0.9330615997314453, - "rewards/rejected": -3.128727674484253, + "logits/chosen": -1.3312914371490479, + "logits/rejected": -1.2595702409744263, + "logps/chosen": -561.8858032226562, + "logps/rejected": -660.8182373046875, + "loss": 0.5043, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8668177127838135, + "rewards/margins": 1.0241832733154297, + "rewards/rejected": -3.8910012245178223, "step": 1790 }, { "epoch": 0.47, - "grad_norm": 14.125, + "grad_norm": 12.8125, "learning_rate": 3.181184197019127e-06, - "logits/chosen": -1.227853775024414, - "logits/rejected": -1.1199489831924438, - "logps/chosen": -480.1280212402344, - "logps/rejected": -640.9280395507812, - "loss": 0.478, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.4991328716278076, - "rewards/margins": 1.3459218740463257, - "rewards/rejected": -3.845055103302002, + "logits/chosen": -1.1215088367462158, + "logits/rejected": -1.0118662118911743, + "logps/chosen": -525.9521484375, + "logps/rejected": -697.3963623046875, + "loss": 0.4809, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.957373857498169, + "rewards/margins": 1.4523636102676392, + "rewards/rejected": -4.409738063812256, "step": 1800 }, { "epoch": 0.47, - "eval_logits/chosen": -1.2418839931488037, - "eval_logits/rejected": -1.1179745197296143, - "eval_logps/chosen": -533.2012329101562, - "eval_logps/rejected": -618.8510131835938, - "eval_loss": 0.5059686303138733, - "eval_rewards/accuracies": 0.7390000224113464, - "eval_rewards/chosen": -2.685492753982544, - "eval_rewards/margins": 1.0569311380386353, - "eval_rewards/rejected": -3.7424237728118896, - "eval_runtime": 385.0443, - "eval_samples_per_second": 5.194, - "eval_steps_per_second": 0.649, + "eval_logits/chosen": -1.1786177158355713, + "eval_logits/rejected": -1.0616753101348877, + "eval_logps/chosen": -557.43115234375, + "eval_logps/rejected": -643.640869140625, + "eval_loss": 0.5022104382514954, + "eval_rewards/accuracies": 0.7404999732971191, + "eval_rewards/chosen": -2.9277913570404053, + "eval_rewards/margins": 1.062530517578125, + "eval_rewards/rejected": -3.9903218746185303, + "eval_runtime": 384.8251, + "eval_samples_per_second": 5.197, + "eval_steps_per_second": 0.65, "step": 1800 }, { "epoch": 0.47, - "grad_norm": 22.25, + "grad_norm": 18.125, "learning_rate": 3.159175806468126e-06, - "logits/chosen": -1.206802248954773, - "logits/rejected": -1.0140331983566284, - "logps/chosen": -536.1395263671875, - "logps/rejected": -618.5433959960938, - "loss": 0.4884, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.874638795852661, - "rewards/margins": 1.125908374786377, - "rewards/rejected": -4.000546932220459, + "logits/chosen": -1.1367595195770264, + "logits/rejected": -0.9490365982055664, + "logps/chosen": -545.4899291992188, + "logps/rejected": -620.2122192382812, + "loss": 0.5001, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9681437015533447, + "rewards/margins": 1.0490918159484863, + "rewards/rejected": -4.01723575592041, "step": 1810 }, { "epoch": 0.48, - "grad_norm": 18.125, + "grad_norm": 13.375, "learning_rate": 3.1371123749269804e-06, - "logits/chosen": -1.27443265914917, - "logits/rejected": -1.2016910314559937, - "logps/chosen": -608.0801391601562, - "logps/rejected": -680.9607543945312, - "loss": 0.6001, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.246840000152588, - "rewards/margins": 0.8893483877182007, - "rewards/rejected": -4.13618803024292, + "logits/chosen": -1.2076561450958252, + "logits/rejected": -1.135667085647583, + "logps/chosen": -596.1036376953125, + "logps/rejected": -664.8118896484375, + "loss": 0.5596, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.1270740032196045, + "rewards/margins": 0.847625732421875, + "rewards/rejected": -3.9746997356414795, "step": 1820 }, { "epoch": 0.48, - "grad_norm": 10.125, + "grad_norm": 11.6875, "learning_rate": 3.114995744685877e-06, - "logits/chosen": -1.238593339920044, - "logits/rejected": -1.2031556367874146, - "logps/chosen": -542.8019409179688, - "logps/rejected": -626.592041015625, - "loss": 0.5158, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.95475697517395, - "rewards/margins": 0.9683350324630737, - "rewards/rejected": -3.9230918884277344, + "logits/chosen": -1.1738382577896118, + "logits/rejected": -1.146437644958496, + "logps/chosen": -529.6216430664062, + "logps/rejected": -603.9373168945312, + "loss": 0.5267, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.8229541778564453, + "rewards/margins": 0.8735902905464172, + "rewards/rejected": -3.696544647216797, "step": 1830 }, { "epoch": 0.48, - "grad_norm": 5.8125, + "grad_norm": 6.34375, "learning_rate": 3.0928277624770743e-06, - "logits/chosen": -1.4476687908172607, - "logits/rejected": -1.2830606698989868, - "logps/chosen": -545.8017578125, - "logps/rejected": -637.268798828125, - "loss": 0.5053, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.61057710647583, - "rewards/margins": 1.1988862752914429, - "rewards/rejected": -3.8094639778137207, + "logits/chosen": -1.3653886318206787, + "logits/rejected": -1.2098249197006226, + "logps/chosen": -532.6870727539062, + "logps/rejected": -613.7505493164062, + "loss": 0.5049, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4794299602508545, + "rewards/margins": 1.0948512554168701, + "rewards/rejected": -3.5742812156677246, "step": 1840 }, { "epoch": 0.48, - "grad_norm": 6.0625, + "grad_norm": 6.625, "learning_rate": 3.070610279320708e-06, - "logits/chosen": -1.469509243965149, - "logits/rejected": -1.2941926717758179, - "logps/chosen": -530.1407470703125, - "logps/rejected": -617.324462890625, - "loss": 0.4534, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.4305920600891113, - "rewards/margins": 1.112181544303894, - "rewards/rejected": -3.542773723602295, + "logits/chosen": -1.3816752433776855, + "logits/rejected": -1.2150719165802002, + "logps/chosen": -521.9651489257812, + "logps/rejected": -601.0781860351562, + "loss": 0.4669, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3488364219665527, + "rewards/margins": 1.0314748287200928, + "rewards/rejected": -3.3803107738494873, "step": 1850 }, { "epoch": 0.49, - "grad_norm": 6.125, + "grad_norm": 5.71875, "learning_rate": 3.0483451503702264e-06, - "logits/chosen": -1.3953410387039185, - "logits/rejected": -1.319723129272461, - "logps/chosen": -553.1173706054688, - "logps/rejected": -624.0015258789062, - "loss": 0.5575, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.6702237129211426, - "rewards/margins": 0.9092646837234497, - "rewards/rejected": -3.5794882774353027, + "logits/chosen": -1.3038969039916992, + "logits/rejected": -1.2319445610046387, + "logps/chosen": -547.4259033203125, + "logps/rejected": -617.3253784179688, + "loss": 0.5618, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.6133077144622803, + "rewards/margins": 0.8994197845458984, + "rewards/rejected": -3.5127272605895996, "step": 1860 }, { "epoch": 0.49, - "grad_norm": 7.40625, + "grad_norm": 8.1875, "learning_rate": 3.0260342347574916e-06, - "logits/chosen": -1.366349697113037, - "logits/rejected": -1.2165526151657104, - "logps/chosen": -525.2376098632812, - "logps/rejected": -642.2418823242188, - "loss": 0.4285, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -2.52901291847229, - "rewards/margins": 1.3603582382202148, - "rewards/rejected": -3.889371156692505, + "logits/chosen": -1.2965396642684937, + "logits/rejected": -1.1523797512054443, + "logps/chosen": -519.9957275390625, + "logps/rejected": -625.9295654296875, + "loss": 0.4402, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4765942096710205, + "rewards/margins": 1.2496535778045654, + "rewards/rejected": -3.726247787475586, "step": 1870 }, { "epoch": 0.49, - "grad_norm": 11.0625, + "grad_norm": 9.8125, "learning_rate": 3.0036793954375358e-06, - "logits/chosen": -1.3298574686050415, - "logits/rejected": -1.175115942955017, - "logps/chosen": -573.416015625, - "logps/rejected": -657.1756591796875, - "loss": 0.4426, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -3.033194065093994, - "rewards/margins": 1.259937047958374, - "rewards/rejected": -4.293131351470947, + "logits/chosen": -1.2782443761825562, + "logits/rejected": -1.1259523630142212, + "logps/chosen": -547.7828979492188, + "logps/rejected": -630.2535400390625, + "loss": 0.4395, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.7768635749816895, + "rewards/margins": 1.2470468282699585, + "rewards/rejected": -4.0239105224609375, "step": 1880 }, { "epoch": 0.49, - "grad_norm": 16.625, + "grad_norm": 11.5, "learning_rate": 2.981282499033009e-06, - "logits/chosen": -1.3132747411727905, - "logits/rejected": -1.1860190629959106, - "logps/chosen": -596.8779296875, - "logps/rejected": -696.3871459960938, - "loss": 0.5166, + "logits/chosen": -1.278181791305542, + "logits/rejected": -1.1554654836654663, + "logps/chosen": -553.5909423828125, + "logps/rejected": -634.082275390625, + "loss": 0.5183, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.2053794860839844, - "rewards/margins": 1.2494663000106812, - "rewards/rejected": -4.454846382141113, + "rewards/chosen": -2.772510051727295, + "rewards/margins": 1.059287667274475, + "rewards/rejected": -3.8317978382110596, "step": 1890 }, { "epoch": 0.5, - "grad_norm": 14.5, + "grad_norm": 8.0625, "learning_rate": 2.9588454156783163e-06, - "logits/chosen": -1.3445230722427368, - "logits/rejected": -1.1744747161865234, - "logps/chosen": -582.3587646484375, - "logps/rejected": -712.7427978515625, - "loss": 0.4325, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.0100293159484863, - "rewards/margins": 1.5214197635650635, - "rewards/rejected": -4.531449317932129, + "logits/chosen": -1.327986717224121, + "logits/rejected": -1.165433645248413, + "logps/chosen": -511.99090576171875, + "logps/rejected": -616.3585815429688, + "loss": 0.46, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3063502311706543, + "rewards/margins": 1.2612559795379639, + "rewards/rejected": -3.5676064491271973, "step": 1900 }, { "epoch": 0.5, - "eval_logits/chosen": -1.2515246868133545, - "eval_logits/rejected": -1.1244795322418213, - "eval_logps/chosen": -567.7128295898438, - "eval_logps/rejected": -665.8478393554688, - "eval_loss": 0.4995792806148529, - "eval_rewards/accuracies": 0.7369999885559082, - "eval_rewards/chosen": -3.0306081771850586, - "eval_rewards/margins": 1.1817845106124878, - "eval_rewards/rejected": -4.2123918533325195, - "eval_runtime": 384.958, - "eval_samples_per_second": 5.195, - "eval_steps_per_second": 0.649, + "eval_logits/chosen": -1.2252681255340576, + "eval_logits/rejected": -1.1040537357330322, + "eval_logps/chosen": -507.9823303222656, + "eval_logps/rejected": -594.7523193359375, + "eval_loss": 0.5002806782722473, + "eval_rewards/accuracies": 0.7354999780654907, + "eval_rewards/chosen": -2.433302879333496, + "eval_rewards/margins": 1.0681343078613281, + "eval_rewards/rejected": -3.501437187194824, + "eval_runtime": 384.8766, + "eval_samples_per_second": 5.196, + "eval_steps_per_second": 0.65, "step": 1900 }, { "epoch": 0.5, - "grad_norm": 10.6875, + "grad_norm": 10.0625, "learning_rate": 2.9363700188634597e-06, - "logits/chosen": -1.3585935831069946, - "logits/rejected": -1.2228021621704102, - "logps/chosen": -564.982666015625, - "logps/rejected": -625.9069213867188, - "loss": 0.5244, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.999126434326172, - "rewards/margins": 0.9794772863388062, - "rewards/rejected": -3.9786033630371094, + "logits/chosen": -1.2988775968551636, + "logits/rejected": -1.167811632156372, + "logps/chosen": -534.3869018554688, + "logps/rejected": -595.1586303710938, + "loss": 0.518, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6931681632995605, + "rewards/margins": 0.9779523611068726, + "rewards/rejected": -3.6711204051971436, "step": 1910 }, { "epoch": 0.5, - "grad_norm": 10.625, + "grad_norm": 11.3125, "learning_rate": 2.9138581852776053e-06, - "logits/chosen": -1.378015160560608, - "logits/rejected": -1.2671245336532593, - "logps/chosen": -547.2576904296875, - "logps/rejected": -642.3396606445312, - "loss": 0.5133, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.8754656314849854, - "rewards/margins": 1.0803815126419067, - "rewards/rejected": -3.9558472633361816, + "logits/chosen": -1.2570379972457886, + "logits/rejected": -1.1531012058258057, + "logps/chosen": -555.2855834960938, + "logps/rejected": -654.2891845703125, + "loss": 0.508, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.955744981765747, + "rewards/margins": 1.1195967197418213, + "rewards/rejected": -4.075342178344727, "step": 1920 }, { "epoch": 0.51, - "grad_norm": 6.71875, + "grad_norm": 7.8125, "learning_rate": 2.8913117946523805e-06, - "logits/chosen": -1.4040184020996094, - "logits/rejected": -1.2129814624786377, - "logps/chosen": -538.7586669921875, - "logps/rejected": -607.2654418945312, - "loss": 0.4589, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.72344970703125, - "rewards/margins": 1.065987229347229, - "rewards/rejected": -3.7894368171691895, + "logits/chosen": -1.280539631843567, + "logits/rejected": -1.100694179534912, + "logps/chosen": -573.8317260742188, + "logps/rejected": -636.535400390625, + "loss": 0.4979, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.0741798877716064, + "rewards/margins": 1.0079572200775146, + "rewards/rejected": -4.082137107849121, "step": 1930 }, { "epoch": 0.51, - "grad_norm": 12.1875, + "grad_norm": 9.9375, "learning_rate": 2.8687327296049126e-06, - "logits/chosen": -1.3795297145843506, - "logits/rejected": -1.2698510885238647, - "logps/chosen": -529.3519897460938, - "logps/rejected": -627.5960693359375, - "loss": 0.5143, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.702291965484619, - "rewards/margins": 1.0797038078308105, - "rewards/rejected": -3.781996250152588, + "logits/chosen": -1.2726280689239502, + "logits/rejected": -1.171382188796997, + "logps/chosen": -556.9118041992188, + "logps/rejected": -646.0772705078125, + "loss": 0.5218, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.977890968322754, + "rewards/margins": 0.9889172315597534, + "rewards/rejected": -3.966808319091797, "step": 1940 }, { "epoch": 0.51, - "grad_norm": 17.0, + "grad_norm": 13.25, "learning_rate": 2.8461228754806376e-06, - "logits/chosen": -1.4074313640594482, - "logits/rejected": -1.231386661529541, - "logps/chosen": -545.1365966796875, - "logps/rejected": -611.1114501953125, - "loss": 0.5387, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.6934621334075928, - "rewards/margins": 0.9177573323249817, - "rewards/rejected": -3.611219882965088, + "logits/chosen": -1.3368163108825684, + "logits/rejected": -1.172978401184082, + "logps/chosen": -542.0377807617188, + "logps/rejected": -597.8560180664062, + "loss": 0.5274, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6624741554260254, + "rewards/margins": 0.8161913752555847, + "rewards/rejected": -3.478665590286255, "step": 1950 }, { "epoch": 0.51, - "grad_norm": 8.0625, + "grad_norm": 8.5625, "learning_rate": 2.823484120195865e-06, - "logits/chosen": -1.51656174659729, - "logits/rejected": -1.2999058961868286, - "logps/chosen": -509.6175231933594, - "logps/rejected": -593.9664306640625, - "loss": 0.4301, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.3180630207061768, - "rewards/margins": 1.187303900718689, - "rewards/rejected": -3.5053672790527344, + "logits/chosen": -1.4352657794952393, + "logits/rejected": -1.227199912071228, + "logps/chosen": -520.835693359375, + "logps/rejected": -587.2822265625, + "loss": 0.4585, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.4302444458007812, + "rewards/margins": 1.0082801580429077, + "rewards/rejected": -3.4385247230529785, "step": 1960 }, { "epoch": 0.52, - "grad_norm": 8.8125, + "grad_norm": 10.3125, "learning_rate": 2.8008183540801486e-06, - "logits/chosen": -1.366084337234497, - "logits/rejected": -1.211987853050232, - "logps/chosen": -522.1681518554688, - "logps/rejected": -573.3563232421875, - "loss": 0.5128, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.4853363037109375, - "rewards/margins": 0.965847373008728, - "rewards/rejected": -3.451184034347534, + "logits/chosen": -1.293084979057312, + "logits/rejected": -1.148153305053711, + "logps/chosen": -520.2894897460938, + "logps/rejected": -565.23681640625, + "loss": 0.4997, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4665493965148926, + "rewards/margins": 0.9034391641616821, + "rewards/rejected": -3.369988203048706, "step": 1970 }, { "epoch": 0.52, - "grad_norm": 14.3125, + "grad_norm": 13.1875, "learning_rate": 2.7781274697184353e-06, - "logits/chosen": -1.2134374380111694, - "logits/rejected": -1.2537566423416138, - "logps/chosen": -503.17462158203125, - "logps/rejected": -637.951171875, - "loss": 0.5379, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.738171100616455, - "rewards/margins": 1.1446306705474854, - "rewards/rejected": -3.8828015327453613, + "logits/chosen": -1.1424802541732788, + "logits/rejected": -1.187720775604248, + "logps/chosen": -492.9554138183594, + "logps/rejected": -617.7970581054688, + "loss": 0.5349, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.6359786987304688, + "rewards/margins": 1.0452814102172852, + "rewards/rejected": -3.681259870529175, "step": 1980 }, { "epoch": 0.52, - "grad_norm": 6.96875, + "grad_norm": 9.375, "learning_rate": 2.7554133617930397e-06, - "logits/chosen": -1.3368358612060547, - "logits/rejected": -1.2081658840179443, - "logps/chosen": -505.2743225097656, - "logps/rejected": -597.1849365234375, - "loss": 0.5015, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.5743813514709473, - "rewards/margins": 1.0698320865631104, - "rewards/rejected": -3.644213914871216, + "logits/chosen": -1.2500625848770142, + "logits/rejected": -1.1256784200668335, + "logps/chosen": -501.5577087402344, + "logps/rejected": -588.8922119140625, + "loss": 0.5168, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.537215232849121, + "rewards/margins": 1.0240715742111206, + "rewards/rejected": -3.5612869262695312, "step": 1990 }, { "epoch": 0.52, - "grad_norm": 10.125, + "grad_norm": 11.625, "learning_rate": 2.7326779269254363e-06, - "logits/chosen": -1.5063138008117676, - "logits/rejected": -1.330679178237915, - "logps/chosen": -549.5060424804688, - "logps/rejected": -596.6401977539062, - "loss": 0.4926, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.6017112731933594, - "rewards/margins": 1.0307661294937134, - "rewards/rejected": -3.6324775218963623, + "logits/chosen": -1.436962366104126, + "logits/rejected": -1.266498327255249, + "logps/chosen": -528.1736450195312, + "logps/rejected": -578.77734375, + "loss": 0.477, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.388388156890869, + "rewards/margins": 1.0654609203338623, + "rewards/rejected": -3.4538490772247314, "step": 2000 }, { "epoch": 0.52, - "eval_logits/chosen": -1.2879465818405151, - "eval_logits/rejected": -1.1607391834259033, - "eval_logps/chosen": -531.1353759765625, - "eval_logps/rejected": -612.32275390625, - "eval_loss": 0.4934316873550415, - "eval_rewards/accuracies": 0.7404999732971191, - "eval_rewards/chosen": -2.664834499359131, - "eval_rewards/margins": 1.0123074054718018, - "eval_rewards/rejected": -3.6771416664123535, - "eval_runtime": 385.0864, - "eval_samples_per_second": 5.194, + "eval_logits/chosen": -1.2391676902770996, + "eval_logits/rejected": -1.1185089349746704, + "eval_logps/chosen": -503.76922607421875, + "eval_logps/rejected": -583.5771484375, + "eval_loss": 0.4988709092140198, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -2.39117169380188, + "eval_rewards/margins": 0.9985132813453674, + "eval_rewards/rejected": -3.3896851539611816, + "eval_runtime": 385.1549, + "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 2000 }, { "epoch": 0.53, - "grad_norm": 12.0625, + "grad_norm": 10.375, "learning_rate": 2.7099230635178954e-06, - "logits/chosen": -1.32606840133667, - "logits/rejected": -1.284746766090393, - "logps/chosen": -530.9010009765625, - "logps/rejected": -617.3204956054688, - "loss": 0.5175, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.6605124473571777, - "rewards/margins": 0.9288894534111023, - "rewards/rejected": -3.589401960372925, + "logits/chosen": -1.280256748199463, + "logits/rejected": -1.239262342453003, + "logps/chosen": -499.21240234375, + "logps/rejected": -584.2531127929688, + "loss": 0.5227, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.3436267375946045, + "rewards/margins": 0.9151015281677246, + "rewards/rejected": -3.25872802734375, "step": 2010 }, { "epoch": 0.53, - "grad_norm": 12.6875, + "grad_norm": 10.625, "learning_rate": 2.6871506715949608e-06, - "logits/chosen": -1.4429805278778076, - "logits/rejected": -1.309429407119751, - "logps/chosen": -509.5052795410156, - "logps/rejected": -599.0560913085938, - "loss": 0.4724, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.565361976623535, - "rewards/margins": 1.0760364532470703, - "rewards/rejected": -3.6413981914520264, + "logits/chosen": -1.4013721942901611, + "logits/rejected": -1.2793995141983032, + "logps/chosen": -463.5269470214844, + "logps/rejected": -541.9952392578125, + "loss": 0.4813, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1055784225463867, + "rewards/margins": 0.965211033821106, + "rewards/rejected": -3.070789337158203, "step": 2020 }, { "epoch": 0.53, - "grad_norm": 11.125, + "grad_norm": 10.25, "learning_rate": 2.6643626526448063e-06, - "logits/chosen": -1.516875982284546, - "logits/rejected": -1.3450555801391602, - "logps/chosen": -554.38427734375, - "logps/rejected": -638.1011352539062, - "loss": 0.4402, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.594825267791748, - "rewards/margins": 1.260938048362732, - "rewards/rejected": -3.8557631969451904, + "logits/chosen": -1.4540785551071167, + "logits/rejected": -1.2950793504714966, + "logps/chosen": -521.9766235351562, + "logps/rejected": -591.3455810546875, + "loss": 0.4591, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.2707479000091553, + "rewards/margins": 1.1174595355987549, + "rewards/rejected": -3.388207197189331, "step": 2030 }, { "epoch": 0.53, - "grad_norm": 9.375, + "grad_norm": 12.375, "learning_rate": 2.6415609094604562e-06, - "logits/chosen": -1.331624150276184, - "logits/rejected": -1.273493766784668, - "logps/chosen": -572.4739990234375, - "logps/rejected": -668.96484375, - "loss": 0.4552, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.0190746784210205, - "rewards/margins": 1.1551822423934937, - "rewards/rejected": -4.174256801605225, + "logits/chosen": -1.2611262798309326, + "logits/rejected": -1.2067164182662964, + "logps/chosen": -539.65869140625, + "logps/rejected": -629.5203857421875, + "loss": 0.4464, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6909213066101074, + "rewards/margins": 1.0888901948928833, + "rewards/rejected": -3.7798118591308594, "step": 2040 }, { "epoch": 0.54, - "grad_norm": 15.5, + "grad_norm": 14.375, "learning_rate": 2.618747345980904e-06, - "logits/chosen": -1.3320283889770508, - "logits/rejected": -1.138301134109497, - "logps/chosen": -604.2484130859375, - "logps/rejected": -656.2549438476562, - "loss": 0.5591, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -3.5585105419158936, - "rewards/margins": 1.0249335765838623, - "rewards/rejected": -4.583444118499756, + "logits/chosen": -1.2389599084854126, + "logits/rejected": -1.0517133474349976, + "logps/chosen": -593.0328369140625, + "logps/rejected": -635.1866455078125, + "loss": 0.5624, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.446354627609253, + "rewards/margins": 0.9264065027236938, + "rewards/rejected": -4.3727617263793945, "step": 2050 }, { "epoch": 0.54, - "grad_norm": 5.78125, + "grad_norm": 6.5, "learning_rate": 2.595923867132136e-06, - "logits/chosen": -1.3874107599258423, - "logits/rejected": -1.25830078125, - "logps/chosen": -597.45751953125, - "logps/rejected": -698.0640258789062, - "loss": 0.482, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -3.1653714179992676, - "rewards/margins": 1.2528908252716064, - "rewards/rejected": -4.418262481689453, + "logits/chosen": -1.2825162410736084, + "logits/rejected": -1.1602712869644165, + "logps/chosen": -608.6810302734375, + "logps/rejected": -699.3939819335938, + "loss": 0.5003, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.277606964111328, + "rewards/margins": 1.153955101966858, + "rewards/rejected": -4.4315619468688965, "step": 2060 }, { "epoch": 0.54, - "grad_norm": 7.625, + "grad_norm": 8.75, "learning_rate": 2.5730923786680672e-06, - "logits/chosen": -1.3206942081451416, - "logits/rejected": -1.2852368354797363, - "logps/chosen": -542.5220336914062, - "logps/rejected": -643.3800048828125, - "loss": 0.5358, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.9099090099334717, - "rewards/margins": 0.932235836982727, - "rewards/rejected": -3.842144727706909, + "logits/chosen": -1.2274243831634521, + "logits/rejected": -1.191007375717163, + "logps/chosen": -544.41259765625, + "logps/rejected": -638.9494018554688, + "loss": 0.5467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.92881441116333, + "rewards/margins": 0.869024932384491, + "rewards/rejected": -3.7978389263153076, "step": 2070 }, { "epoch": 0.54, - "grad_norm": 9.75, + "grad_norm": 7.875, "learning_rate": 2.5502547870114137e-06, - "logits/chosen": -1.3924452066421509, - "logits/rejected": -1.2653155326843262, - "logps/chosen": -522.9793090820312, - "logps/rejected": -588.5538940429688, - "loss": 0.5191, + "logits/chosen": -1.3184901475906372, + "logits/rejected": -1.196045994758606, + "logps/chosen": -512.1152954101562, + "logps/rejected": -571.6860961914062, + "loss": 0.5238, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.6759915351867676, - "rewards/margins": 0.9582807421684265, - "rewards/rejected": -3.634272336959839, + "rewards/chosen": -2.5673513412475586, + "rewards/margins": 0.8982425928115845, + "rewards/rejected": -3.4655938148498535, "step": 2080 }, { "epoch": 0.55, - "grad_norm": 11.5, + "grad_norm": 10.5625, "learning_rate": 2.527412999094507e-06, - "logits/chosen": -1.3764816522598267, - "logits/rejected": -1.2036128044128418, - "logps/chosen": -582.7906494140625, - "logps/rejected": -684.7365112304688, - "loss": 0.4795, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.890341281890869, - "rewards/margins": 1.1449689865112305, - "rewards/rejected": -4.0353102684021, + "logits/chosen": -1.3197405338287354, + "logits/rejected": -1.1518932580947876, + "logps/chosen": -544.2307739257812, + "logps/rejected": -638.2955932617188, + "loss": 0.4778, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.50474214553833, + "rewards/margins": 1.0661590099334717, + "rewards/rejected": -3.5709011554718018, "step": 2090 }, { "epoch": 0.55, - "grad_norm": 11.0625, + "grad_norm": 10.0625, "learning_rate": 2.504568922200064e-06, - "logits/chosen": -1.3423351049423218, - "logits/rejected": -1.1891554594039917, - "logps/chosen": -519.73388671875, - "logps/rejected": -617.8820190429688, - "loss": 0.5009, + "logits/chosen": -1.283879041671753, + "logits/rejected": -1.1339181661605835, + "logps/chosen": -479.8946838378906, + "logps/rejected": -564.1932373046875, + "loss": 0.5068, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.7839534282684326, - "rewards/margins": 1.1430864334106445, - "rewards/rejected": -3.9270401000976562, + "rewards/chosen": -2.385560989379883, + "rewards/margins": 1.0045907497406006, + "rewards/rejected": -3.3901519775390625, "step": 2100 }, { "epoch": 0.55, - "eval_logits/chosen": -1.3098708391189575, - "eval_logits/rejected": -1.1825140714645386, - "eval_logps/chosen": -547.086669921875, - "eval_logps/rejected": -630.5530395507812, - "eval_loss": 0.4915066063404083, - "eval_rewards/accuracies": 0.7509999871253967, - "eval_rewards/chosen": -2.8243465423583984, - "eval_rewards/margins": 1.0350984334945679, - "eval_rewards/rejected": -3.859445095062256, - "eval_runtime": 384.9833, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.2462238073349, + "eval_logits/rejected": -1.125494360923767, + "eval_logps/chosen": -512.4297485351562, + "eval_logps/rejected": -591.323974609375, + "eval_loss": 0.4939311146736145, + "eval_rewards/accuracies": 0.7429999709129333, + "eval_rewards/chosen": -2.4777767658233643, + "eval_rewards/margins": 0.9893770217895508, + "eval_rewards/rejected": -3.467153787612915, + "eval_runtime": 385.17, + "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 2100 }, { "epoch": 0.55, - "grad_norm": 7.25, + "grad_norm": 9.6875, "learning_rate": 2.4817244638019333e-06, - "logits/chosen": -1.4199883937835693, - "logits/rejected": -1.2606356143951416, - "logps/chosen": -566.4400634765625, - "logps/rejected": -621.7224731445312, - "loss": 0.5296, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.906280040740967, - "rewards/margins": 0.9649537205696106, - "rewards/rejected": -3.8712337017059326, + "logits/chosen": -1.3495204448699951, + "logits/rejected": -1.1980758905410767, + "logps/chosen": -514.2600708007812, + "logps/rejected": -565.2801513671875, + "loss": 0.5135, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.384481906890869, + "rewards/margins": 0.922328770160675, + "rewards/rejected": -3.3068108558654785, "step": 2110 }, { "epoch": 0.55, - "grad_norm": 14.75, + "grad_norm": 14.3125, "learning_rate": 2.4588815314058155e-06, - "logits/chosen": -1.3702977895736694, - "logits/rejected": -1.307600736618042, - "logps/chosen": -536.4708862304688, - "logps/rejected": -595.4000244140625, - "loss": 0.4918, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.909069776535034, - "rewards/margins": 0.9321239590644836, - "rewards/rejected": -3.841193675994873, + "logits/chosen": -1.3099550008773804, + "logits/rejected": -1.2511496543884277, + "logps/chosen": -468.06011962890625, + "logps/rejected": -523.5824584960938, + "loss": 0.4817, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.224961757659912, + "rewards/margins": 0.8980560302734375, + "rewards/rejected": -3.1230177879333496, "step": 2120 }, { "epoch": 0.56, - "grad_norm": 11.9375, + "grad_norm": 9.75, "learning_rate": 2.4360420323899922e-06, - "logits/chosen": -1.4435946941375732, - "logits/rejected": -1.3134028911590576, - "logps/chosen": -554.8719482421875, - "logps/rejected": -603.7503051757812, - "loss": 0.5711, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.7905924320220947, - "rewards/margins": 0.8145803213119507, - "rewards/rejected": -3.605172634124756, + "logits/chosen": -1.353991985321045, + "logits/rejected": -1.2306454181671143, + "logps/chosen": -505.89434814453125, + "logps/rejected": -550.9930419921875, + "loss": 0.5674, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.3008170127868652, + "rewards/margins": 0.7767833471298218, + "rewards/rejected": -3.0776004791259766, "step": 2130 }, { "epoch": 0.56, - "grad_norm": 8.5625, + "grad_norm": 8.0625, "learning_rate": 2.4132078738460585e-06, - "logits/chosen": -1.4984867572784424, - "logits/rejected": -1.3388562202453613, - "logps/chosen": -511.22344970703125, - "logps/rejected": -575.4027099609375, - "loss": 0.4819, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.4675207138061523, - "rewards/margins": 1.027344822883606, - "rewards/rejected": -3.4948654174804688, + "logits/chosen": -1.3921695947647095, + "logits/rejected": -1.2415539026260376, + "logps/chosen": -491.42401123046875, + "logps/rejected": -556.8810424804688, + "loss": 0.4726, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2695276737213135, + "rewards/margins": 1.0401204824447632, + "rewards/rejected": -3.309648036956787, "step": 2140 }, { "epoch": 0.56, - "grad_norm": 13.8125, + "grad_norm": 13.4375, "learning_rate": 2.3903809624196826e-06, - "logits/chosen": -1.4616576433181763, - "logits/rejected": -1.3179928064346313, - "logps/chosen": -468.075439453125, - "logps/rejected": -520.423095703125, - "loss": 0.5464, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.3437161445617676, - "rewards/margins": 0.8483073115348816, - "rewards/rejected": -3.192023515701294, + "logits/chosen": -1.3411505222320557, + "logits/rejected": -1.2057361602783203, + "logps/chosen": -456.32452392578125, + "logps/rejected": -508.445068359375, + "loss": 0.5549, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2262067794799805, + "rewards/margins": 0.8460358381271362, + "rewards/rejected": -3.072242498397827, "step": 2150 }, { "epoch": 0.57, - "grad_norm": 12.375, + "grad_norm": 12.8125, "learning_rate": 2.3675632041513978e-06, - "logits/chosen": -1.5807464122772217, - "logits/rejected": -1.3334547281265259, - "logps/chosen": -532.7571411132812, - "logps/rejected": -571.2667846679688, - "loss": 0.4903, + "logits/chosen": -1.4614931344985962, + "logits/rejected": -1.2260310649871826, + "logps/chosen": -524.8610229492188, + "logps/rejected": -565.4326171875, + "loss": 0.4894, "rewards/accuracies": 0.75, - "rewards/chosen": -2.4575352668762207, - "rewards/margins": 1.0318888425827026, - "rewards/rejected": -3.489424228668213, + "rewards/chosen": -2.378574848175049, + "rewards/margins": 1.0525071620941162, + "rewards/rejected": -3.431082248687744, "step": 2160 }, { "epoch": 0.57, - "grad_norm": 14.125, + "grad_norm": 12.1875, "learning_rate": 2.3447565043174533e-06, - "logits/chosen": -1.42782461643219, - "logits/rejected": -1.2667099237442017, - "logps/chosen": -528.2996826171875, - "logps/rejected": -579.03173828125, - "loss": 0.525, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.6998863220214844, - "rewards/margins": 0.8944419026374817, - "rewards/rejected": -3.5943286418914795, + "logits/chosen": -1.3028042316436768, + "logits/rejected": -1.1499183177947998, + "logps/chosen": -515.6001586914062, + "logps/rejected": -565.5277099609375, + "loss": 0.5241, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5728909969329834, + "rewards/margins": 0.8863977193832397, + "rewards/rejected": -3.4592888355255127, "step": 2170 }, { "epoch": 0.57, - "grad_norm": 9.9375, + "grad_norm": 10.3125, "learning_rate": 2.321962767270724e-06, - "logits/chosen": -1.4696300029754639, - "logits/rejected": -1.3142036199569702, - "logps/chosen": -505.57843017578125, - "logps/rejected": -546.5269165039062, - "loss": 0.5521, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.556256055831909, - "rewards/margins": 0.7811610698699951, - "rewards/rejected": -3.3374171257019043, + "logits/chosen": -1.3512235879898071, + "logits/rejected": -1.2086089849472046, + "logps/chosen": -495.2906188964844, + "logps/rejected": -538.8243408203125, + "loss": 0.5573, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4533779621124268, + "rewards/margins": 0.8070129156112671, + "rewards/rejected": -3.2603907585144043, "step": 2180 }, { "epoch": 0.57, - "grad_norm": 14.1875, + "grad_norm": 10.0, "learning_rate": 2.299183896281692e-06, - "logits/chosen": -1.4029340744018555, - "logits/rejected": -1.2631093263626099, - "logps/chosen": -490.07208251953125, - "logps/rejected": -572.7850952148438, - "loss": 0.5183, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3514010906219482, - "rewards/margins": 0.850308895111084, - "rewards/rejected": -3.2017102241516113, + "logits/chosen": -1.301710844039917, + "logits/rejected": -1.1697108745574951, + "logps/chosen": -466.3893127441406, + "logps/rejected": -546.2555541992188, + "loss": 0.524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1145732402801514, + "rewards/margins": 0.8218411207199097, + "rewards/rejected": -2.9364142417907715, "step": 2190 }, { "epoch": 0.58, - "grad_norm": 8.125, + "grad_norm": 7.34375, "learning_rate": 2.2764217933795297e-06, - "logits/chosen": -1.497179627418518, - "logits/rejected": -1.3680561780929565, - "logps/chosen": -482.985595703125, - "logps/rejected": -569.66552734375, - "loss": 0.4777, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.2155346870422363, - "rewards/margins": 1.0720819234848022, - "rewards/rejected": -3.2876172065734863, + "logits/chosen": -1.4019851684570312, + "logits/rejected": -1.2783384323120117, + "logps/chosen": -460.39227294921875, + "logps/rejected": -538.6397705078125, + "loss": 0.4832, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9896026849746704, + "rewards/margins": 0.9877565503120422, + "rewards/rejected": -2.9773590564727783, "step": 2200 }, { "epoch": 0.58, - "eval_logits/chosen": -1.377992033958435, - "eval_logits/rejected": -1.2484394311904907, - "eval_logps/chosen": -498.22637939453125, - "eval_logps/rejected": -575.8182983398438, - "eval_loss": 0.49137604236602783, - "eval_rewards/accuracies": 0.7475000023841858, - "eval_rewards/chosen": -2.3357439041137695, - "eval_rewards/margins": 0.9763532876968384, - "eval_rewards/rejected": -3.3120970726013184, - "eval_runtime": 384.8882, - "eval_samples_per_second": 5.196, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.289854884147644, + "eval_logits/rejected": -1.1670362949371338, + "eval_logps/chosen": -477.1521911621094, + "eval_logps/rejected": -549.7868041992188, + "eval_loss": 0.49245789647102356, + "eval_rewards/accuracies": 0.7425000071525574, + "eval_rewards/chosen": -2.125001907348633, + "eval_rewards/margins": 0.9267801642417908, + "eval_rewards/rejected": -3.05178165435791, + "eval_runtime": 385.1303, + "eval_samples_per_second": 5.193, + "eval_steps_per_second": 0.649, "step": 2200 }, { "epoch": 0.58, - "grad_norm": 8.1875, + "grad_norm": 5.1875, "learning_rate": 2.2536783591932786e-06, - "logits/chosen": -1.5414104461669922, - "logits/rejected": -1.380723237991333, - "logps/chosen": -521.5967407226562, - "logps/rejected": -590.3738403320312, - "loss": 0.5316, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.460937023162842, - "rewards/margins": 0.8691738843917847, - "rewards/rejected": -3.330111026763916, + "logits/chosen": -1.4467527866363525, + "logits/rejected": -1.2898051738739014, + "logps/chosen": -501.9493103027344, + "logps/rejected": -568.07080078125, + "loss": 0.5262, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.26446270942688, + "rewards/margins": 0.842617392539978, + "rewards/rejected": -3.1070799827575684, "step": 2210 }, { "epoch": 0.58, - "grad_norm": 7.71875, + "grad_norm": 7.84375, "learning_rate": 2.230955492793149e-06, - "logits/chosen": -1.3435004949569702, - "logits/rejected": -1.285659909248352, - "logps/chosen": -540.8585815429688, - "logps/rejected": -614.8424072265625, - "loss": 0.591, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.6706583499908447, - "rewards/margins": 0.8687313199043274, - "rewards/rejected": -3.5393898487091064, + "logits/chosen": -1.2303822040557861, + "logits/rejected": -1.1834524869918823, + "logps/chosen": -536.91796875, + "logps/rejected": -603.58203125, + "loss": 0.5935, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6312527656555176, + "rewards/margins": 0.7955335378646851, + "rewards/rejected": -3.4267868995666504, "step": 2220 }, { "epoch": 0.58, - "grad_norm": 6.53125, + "grad_norm": 5.6875, "learning_rate": 2.208255091531947e-06, - "logits/chosen": -1.3562614917755127, - "logits/rejected": -1.267709493637085, - "logps/chosen": -524.9986572265625, - "logps/rejected": -600.77880859375, - "loss": 0.4914, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.480012893676758, - "rewards/margins": 1.107474684715271, - "rewards/rejected": -3.5874874591827393, + "logits/chosen": -1.2445331811904907, + "logits/rejected": -1.1615046262741089, + "logps/chosen": -523.9738159179688, + "logps/rejected": -601.7839965820312, + "loss": 0.4818, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.469764232635498, + "rewards/margins": 1.127774953842163, + "rewards/rejected": -3.597539186477661, "step": 2230 }, { "epoch": 0.59, - "grad_norm": 12.25, + "grad_norm": 11.75, "learning_rate": 2.1855790508866435e-06, - "logits/chosen": -1.4271819591522217, - "logits/rejected": -1.3109095096588135, - "logps/chosen": -548.445556640625, - "logps/rejected": -636.0255126953125, - "loss": 0.4951, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.51424241065979, - "rewards/margins": 1.0523946285247803, - "rewards/rejected": -3.5666370391845703, + "logits/chosen": -1.3009926080703735, + "logits/rejected": -1.1936320066452026, + "logps/chosen": -551.2839965820312, + "logps/rejected": -635.8419799804688, + "loss": 0.5122, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.5426268577575684, + "rewards/margins": 1.0221750736236572, + "rewards/rejected": -3.5648021697998047, "step": 2240 }, { "epoch": 0.59, - "grad_norm": 10.0, + "grad_norm": 7.0, "learning_rate": 2.162929264300107e-06, - "logits/chosen": -1.4263137578964233, - "logits/rejected": -1.329329252243042, - "logps/chosen": -499.01947021484375, - "logps/rejected": -603.9342041015625, - "loss": 0.4131, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.300555944442749, - "rewards/margins": 1.2727267742156982, - "rewards/rejected": -3.5732827186584473, + "logits/chosen": -1.313072919845581, + "logits/rejected": -1.2196762561798096, + "logps/chosen": -495.29840087890625, + "logps/rejected": -598.8929443359375, + "loss": 0.4195, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2633450031280518, + "rewards/margins": 1.2595245838165283, + "rewards/rejected": -3.52286958694458, "step": 2250 }, { "epoch": 0.59, "grad_norm": 12.0, "learning_rate": 2.1403076230230006e-06, - "logits/chosen": -1.3517494201660156, - "logits/rejected": -1.2308024168014526, - "logps/chosen": -553.6866455078125, - "logps/rejected": -609.8203125, - "loss": 0.5893, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.825040817260742, - "rewards/margins": 0.8394268155097961, - "rewards/rejected": -3.6644675731658936, + "logits/chosen": -1.2646925449371338, + "logits/rejected": -1.1446959972381592, + "logps/chosen": -531.4093017578125, + "logps/rejected": -583.4620971679688, + "loss": 0.587, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6022684574127197, + "rewards/margins": 0.7986178994178772, + "rewards/rejected": -3.4008865356445312, "step": 2260 }, { "epoch": 0.59, - "grad_norm": 6.90625, + "grad_norm": 11.625, "learning_rate": 2.11771601595586e-06, - "logits/chosen": -1.420016884803772, - "logits/rejected": -1.302247405052185, - "logps/chosen": -558.3585205078125, - "logps/rejected": -605.810791015625, - "loss": 0.5185, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.7631030082702637, - "rewards/margins": 0.9972630739212036, - "rewards/rejected": -3.760366439819336, + "logits/chosen": -1.3460241556167603, + "logits/rejected": -1.232742428779602, + "logps/chosen": -530.1009521484375, + "logps/rejected": -569.1173095703125, + "loss": 0.5295, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4805283546447754, + "rewards/margins": 0.9129024744033813, + "rewards/rejected": -3.3934311866760254, "step": 2270 }, { "epoch": 0.6, - "grad_norm": 10.1875, + "grad_norm": 16.625, "learning_rate": 2.0951563294913737e-06, - "logits/chosen": -1.4174630641937256, - "logits/rejected": -1.2032339572906494, - "logps/chosen": -525.8551025390625, - "logps/rejected": -591.8328247070312, - "loss": 0.4771, + "logits/chosen": -1.344582438468933, + "logits/rejected": -1.1410505771636963, + "logps/chosen": -493.6297912597656, + "logps/rejected": -556.1669921875, + "loss": 0.4651, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.643219232559204, - "rewards/margins": 0.9647111892700195, - "rewards/rejected": -3.6079304218292236, + "rewards/chosen": -2.3209660053253174, + "rewards/margins": 0.930306613445282, + "rewards/rejected": -3.251272678375244, "step": 2280 }, { "epoch": 0.6, - "grad_norm": 8.9375, + "grad_norm": 7.59375, "learning_rate": 2.0726304473568693e-06, - "logits/chosen": -1.3942341804504395, - "logits/rejected": -1.2679177522659302, - "logps/chosen": -523.4191284179688, - "logps/rejected": -589.2734985351562, - "loss": 0.492, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.6448018550872803, - "rewards/margins": 0.9818207621574402, - "rewards/rejected": -3.6266231536865234, + "logits/chosen": -1.3250610828399658, + "logits/rejected": -1.207024097442627, + "logps/chosen": -501.9657287597656, + "logps/rejected": -565.3271484375, + "loss": 0.4841, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.4302685260772705, + "rewards/margins": 0.9568912386894226, + "rewards/rejected": -3.387159824371338, "step": 2290 }, { "epoch": 0.6, - "grad_norm": 9.1875, + "grad_norm": 10.25, "learning_rate": 2.050140250457023e-06, - "logits/chosen": -1.4893324375152588, - "logits/rejected": -1.2684142589569092, - "logps/chosen": -571.5191650390625, - "logps/rejected": -653.9939575195312, - "loss": 0.4655, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.9395713806152344, - "rewards/margins": 1.165346384048462, - "rewards/rejected": -4.104917526245117, + "logits/chosen": -1.4138681888580322, + "logits/rejected": -1.1992824077606201, + "logps/chosen": -557.7728881835938, + "logps/rejected": -629.088623046875, + "loss": 0.4731, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.802109479904175, + "rewards/margins": 1.0537548065185547, + "rewards/rejected": -3.8558642864227295, "step": 2300 }, { "epoch": 0.6, - "eval_logits/chosen": -1.2897459268569946, - "eval_logits/rejected": -1.1627777814865112, - "eval_logps/chosen": -571.7406616210938, - "eval_logps/rejected": -672.1651000976562, - "eval_loss": 0.4928034543991089, - "eval_rewards/accuracies": 0.7450000047683716, - "eval_rewards/chosen": -3.0708866119384766, - "eval_rewards/margins": 1.2046781778335571, - "eval_rewards/rejected": -4.2755656242370605, - "eval_runtime": 384.9685, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.2155396938323975, + "eval_logits/rejected": -1.095304250717163, + "eval_logps/chosen": -552.5741577148438, + "eval_logps/rejected": -645.44482421875, + "eval_loss": 0.49232217669487, + "eval_rewards/accuracies": 0.7434999942779541, + "eval_rewards/chosen": -2.8792214393615723, + "eval_rewards/margins": 1.129140853881836, + "eval_rewards/rejected": -4.008362770080566, + "eval_runtime": 385.2143, + "eval_samples_per_second": 5.192, "eval_steps_per_second": 0.649, "step": 2300 }, { "epoch": 0.6, - "grad_norm": 17.75, + "grad_norm": 14.0625, "learning_rate": 2.0276876167168042e-06, - "logits/chosen": -1.2386648654937744, - "logits/rejected": -1.1422879695892334, - "logps/chosen": -528.5611572265625, - "logps/rejected": -600.717529296875, - "loss": 0.5928, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.040616273880005, - "rewards/margins": 1.0473132133483887, - "rewards/rejected": -4.087929725646973, + "logits/chosen": -1.1646645069122314, + "logits/rejected": -1.0743215084075928, + "logps/chosen": -514.7222900390625, + "logps/rejected": -580.4427490234375, + "loss": 0.5834, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9022274017333984, + "rewards/margins": 0.9829545021057129, + "rewards/rejected": -3.8851819038391113, "step": 2310 }, { "epoch": 0.61, - "grad_norm": 8.875, + "grad_norm": 8.8125, "learning_rate": 2.0052744209246682e-06, - "logits/chosen": -1.3995951414108276, - "logits/rejected": -1.2767788171768188, - "logps/chosen": -552.1273193359375, - "logps/rejected": -622.7492065429688, - "loss": 0.5088, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.9678916931152344, - "rewards/margins": 1.0349812507629395, - "rewards/rejected": -4.002872943878174, + "logits/chosen": -1.3135536909103394, + "logits/rejected": -1.1998984813690186, + "logps/chosen": -542.7693481445312, + "logps/rejected": -606.01123046875, + "loss": 0.5182, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.874311685562134, + "rewards/margins": 0.9611810445785522, + "rewards/rejected": -3.8354930877685547, "step": 2320 }, { "epoch": 0.61, - "grad_norm": 12.1875, + "grad_norm": 9.625, "learning_rate": 1.9829025345760127e-06, - "logits/chosen": -1.405669093132019, - "logits/rejected": -1.3700844049453735, - "logps/chosen": -559.1365966796875, - "logps/rejected": -644.623779296875, - "loss": 0.5386, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.8013358116149902, - "rewards/margins": 0.9156203269958496, - "rewards/rejected": -3.7169559001922607, + "logits/chosen": -1.3124678134918213, + "logits/rejected": -1.2832801342010498, + "logps/chosen": -549.1907958984375, + "logps/rejected": -632.5858764648438, + "loss": 0.5333, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7018771171569824, + "rewards/margins": 0.8947007060050964, + "rewards/rejected": -3.5965774059295654, "step": 2330 }, { "epoch": 0.61, - "grad_norm": 9.125, + "grad_norm": 7.65625, "learning_rate": 1.9605738257169115e-06, - "logits/chosen": -1.3817434310913086, - "logits/rejected": -1.2056105136871338, - "logps/chosen": -506.63201904296875, - "logps/rejected": -614.6943969726562, - "loss": 0.4856, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.7365362644195557, - "rewards/margins": 1.1604435443878174, - "rewards/rejected": -3.896979808807373, + "logits/chosen": -1.2838572263717651, + "logits/rejected": -1.117290735244751, + "logps/chosen": -497.5326232910156, + "logps/rejected": -604.8740234375, + "loss": 0.4837, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6455423831939697, + "rewards/margins": 1.1532337665557861, + "rewards/rejected": -3.798776149749756, "step": 2340 }, { "epoch": 0.62, - "grad_norm": 14.5, + "grad_norm": 9.9375, "learning_rate": 1.9382901587881275e-06, - "logits/chosen": -1.4400955438613892, - "logits/rejected": -1.3102456331253052, - "logps/chosen": -521.36767578125, - "logps/rejected": -608.1643676757812, - "loss": 0.4185, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.6261146068573, - "rewards/margins": 1.1909650564193726, - "rewards/rejected": -3.817080020904541, + "logits/chosen": -1.3377434015274048, + "logits/rejected": -1.2184029817581177, + "logps/chosen": -514.0582275390625, + "logps/rejected": -602.654052734375, + "loss": 0.4292, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.5530195236206055, + "rewards/margins": 1.208957552909851, + "rewards/rejected": -3.761976957321167, "step": 2350 }, { "epoch": 0.62, - "grad_norm": 11.0625, + "grad_norm": 10.5625, "learning_rate": 1.916053394469437e-06, - "logits/chosen": -1.4624320268630981, - "logits/rejected": -1.2485642433166504, - "logps/chosen": -547.3516845703125, - "logps/rejected": -640.5094604492188, - "loss": 0.5228, + "logits/chosen": -1.3620846271514893, + "logits/rejected": -1.1589324474334717, + "logps/chosen": -535.8505859375, + "logps/rejected": -625.4491577148438, + "loss": 0.5293, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.8405938148498535, - "rewards/margins": 1.0770297050476074, - "rewards/rejected": -3.917623996734619, + "rewards/chosen": -2.725583791732788, + "rewards/margins": 1.0414365530014038, + "rewards/rejected": -3.7670199871063232, "step": 2360 }, { "epoch": 0.62, - "grad_norm": 9.75, + "grad_norm": 11.3125, "learning_rate": 1.8938653895242604e-06, - "logits/chosen": -1.4186649322509766, - "logits/rejected": -1.2279117107391357, - "logps/chosen": -544.6094970703125, - "logps/rejected": -642.7184448242188, - "loss": 0.4335, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -2.803039073944092, - "rewards/margins": 1.269425630569458, - "rewards/rejected": -4.072464942932129, + "logits/chosen": -1.3228267431259155, + "logits/rejected": -1.1428587436676025, + "logps/chosen": -536.3853759765625, + "logps/rejected": -627.5452880859375, + "loss": 0.441, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.720797300338745, + "rewards/margins": 1.1999356746673584, + "rewards/rejected": -3.9207332134246826, "step": 2370 }, { "epoch": 0.62, - "grad_norm": 11.8125, + "grad_norm": 10.125, "learning_rate": 1.8717279966446267e-06, - "logits/chosen": -1.2762877941131592, - "logits/rejected": -1.1935598850250244, - "logps/chosen": -549.9871826171875, - "logps/rejected": -655.0281982421875, - "loss": 0.4571, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.014120101928711, - "rewards/margins": 1.1458585262298584, - "rewards/rejected": -4.15997838973999, + "logits/chosen": -1.1800651550292969, + "logits/rejected": -1.102126955986023, + "logps/chosen": -539.4421997070312, + "logps/rejected": -641.0511474609375, + "loss": 0.4566, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.90867018699646, + "rewards/margins": 1.1115381717681885, + "rewards/rejected": -4.020208358764648, "step": 2380 }, { "epoch": 0.63, - "grad_norm": 7.8125, + "grad_norm": 10.125, "learning_rate": 1.8496430642964698e-06, - "logits/chosen": -1.351825475692749, - "logits/rejected": -1.2405725717544556, - "logps/chosen": -560.7874145507812, - "logps/rejected": -648.8487548828125, - "loss": 0.4959, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.8837532997131348, - "rewards/margins": 1.1171314716339111, - "rewards/rejected": -4.000885009765625, + "logits/chosen": -1.258175015449524, + "logits/rejected": -1.1534559726715088, + "logps/chosen": -557.5374755859375, + "logps/rejected": -637.3475341796875, + "loss": 0.51, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8512537479400635, + "rewards/margins": 1.0346183776855469, + "rewards/rejected": -3.8858723640441895, "step": 2390 }, { "epoch": 0.63, - "grad_norm": 7.40625, + "grad_norm": 8.75, "learning_rate": 1.827612436565286e-06, - "logits/chosen": -1.359593391418457, - "logits/rejected": -1.2036197185516357, - "logps/chosen": -551.798828125, - "logps/rejected": -647.3968505859375, - "loss": 0.47, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.88834285736084, - "rewards/margins": 1.1616257429122925, - "rewards/rejected": -4.049968242645264, + "logits/chosen": -1.2754342555999756, + "logits/rejected": -1.123130440711975, + "logps/chosen": -543.8443603515625, + "logps/rejected": -633.3651123046875, + "loss": 0.4782, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.808797836303711, + "rewards/margins": 1.1008532047271729, + "rewards/rejected": -3.909651279449463, "step": 2400 }, { "epoch": 0.63, - "eval_logits/chosen": -1.2773113250732422, - "eval_logits/rejected": -1.1517482995986938, - "eval_logps/chosen": -557.9854125976562, - "eval_logps/rejected": -651.6221923828125, - "eval_loss": 0.4908619225025177, - "eval_rewards/accuracies": 0.7409999966621399, - "eval_rewards/chosen": -2.9333345890045166, - "eval_rewards/margins": 1.1368014812469482, - "eval_rewards/rejected": -4.070136070251465, - "eval_runtime": 385.0042, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.1977647542953491, + "eval_logits/rejected": -1.0794349908828735, + "eval_logps/chosen": -549.680419921875, + "eval_logps/rejected": -637.0914306640625, + "eval_loss": 0.4923146665096283, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -2.8502840995788574, + "eval_rewards/margins": 1.0745435953140259, + "eval_rewards/rejected": -3.9248275756835938, + "eval_runtime": 385.0636, + "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 2400 }, { "epoch": 0.63, - "grad_norm": 16.5, + "grad_norm": 13.875, "learning_rate": 1.8056379530021492e-06, - "logits/chosen": -1.4074201583862305, - "logits/rejected": -1.3206876516342163, - "logps/chosen": -535.7772216796875, - "logps/rejected": -602.4798583984375, - "loss": 0.5365, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.915788173675537, - "rewards/margins": 0.915139377117157, - "rewards/rejected": -3.830927610397339, + "logits/chosen": -1.3143008947372437, + "logits/rejected": -1.2356500625610352, + "logps/chosen": -539.9703979492188, + "logps/rejected": -599.3643188476562, + "loss": 0.5312, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9577202796936035, + "rewards/margins": 0.8420518040657043, + "rewards/rejected": -3.799771785736084, "step": 2410 }, { "epoch": 0.63, - "grad_norm": 8.125, + "grad_norm": 11.0625, "learning_rate": 1.7837214484701154e-06, - "logits/chosen": -1.4410103559494019, - "logits/rejected": -1.3098431825637817, - "logps/chosen": -498.352294921875, - "logps/rejected": -587.9461669921875, - "loss": 0.4705, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.460021495819092, - "rewards/margins": 1.1296519041061401, - "rewards/rejected": -3.5896732807159424, + "logits/chosen": -1.3325443267822266, + "logits/rejected": -1.2115572690963745, + "logps/chosen": -515.3961181640625, + "logps/rejected": -601.1583862304688, + "loss": 0.4782, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.630460262298584, + "rewards/margins": 1.091335415840149, + "rewards/rejected": -3.7217955589294434, "step": 2420 }, { "epoch": 0.64, - "grad_norm": 15.25, + "grad_norm": 13.9375, "learning_rate": 1.7618647529910043e-06, - "logits/chosen": -1.4378907680511475, - "logits/rejected": -1.3039597272872925, - "logps/chosen": -500.2588806152344, - "logps/rejected": -604.9884033203125, - "loss": 0.4869, + "logits/chosen": -1.3422627449035645, + "logits/rejected": -1.2155346870422363, + "logps/chosen": -517.1422119140625, + "logps/rejected": -613.8555908203125, + "loss": 0.5001, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.434854030609131, - "rewards/margins": 1.166461706161499, - "rewards/rejected": -3.601315975189209, + "rewards/chosen": -2.6036880016326904, + "rewards/margins": 1.0863001346588135, + "rewards/rejected": -3.689988613128662, "step": 2430 }, { "epoch": 0.64, - "grad_norm": 9.6875, + "grad_norm": 9.25, "learning_rate": 1.7400696915925996e-06, - "logits/chosen": -1.442056655883789, - "logits/rejected": -1.2497450113296509, - "logps/chosen": -522.6090698242188, - "logps/rejected": -571.2970581054688, - "loss": 0.5167, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.519986391067505, - "rewards/margins": 1.0696182250976562, - "rewards/rejected": -3.589604139328003, + "logits/chosen": -1.3564714193344116, + "logits/rejected": -1.1683833599090576, + "logps/chosen": -539.3397216796875, + "logps/rejected": -584.2203979492188, + "loss": 0.5162, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.687293529510498, + "rewards/margins": 1.0315442085266113, + "rewards/rejected": -3.7188377380371094, "step": 2440 }, { "epoch": 0.64, - "grad_norm": 11.0, + "grad_norm": 11.125, "learning_rate": 1.718338084156254e-06, - "logits/chosen": -1.4057360887527466, - "logits/rejected": -1.2487547397613525, - "logps/chosen": -521.5921020507812, - "logps/rejected": -594.0106201171875, - "loss": 0.4502, + "logits/chosen": -1.3139379024505615, + "logits/rejected": -1.1639807224273682, + "logps/chosen": -541.3829956054688, + "logps/rejected": -613.3155517578125, + "loss": 0.4505, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.3480098247528076, - "rewards/margins": 1.1080354452133179, - "rewards/rejected": -3.456045627593994, + "rewards/chosen": -2.545919179916382, + "rewards/margins": 1.1031758785247803, + "rewards/rejected": -3.649095058441162, "step": 2450 }, { "epoch": 0.64, - "grad_norm": 11.3125, + "grad_norm": 10.8125, "learning_rate": 1.6966717452649372e-06, - "logits/chosen": -1.5112414360046387, - "logits/rejected": -1.3404825925827026, - "logps/chosen": -511.281005859375, - "logps/rejected": -574.0786743164062, - "loss": 0.4402, + "logits/chosen": -1.4163377285003662, + "logits/rejected": -1.2610633373260498, + "logps/chosen": -529.8837890625, + "logps/rejected": -588.6536254882812, + "loss": 0.4533, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.34338641166687, - "rewards/margins": 1.1465994119644165, - "rewards/rejected": -3.489985704421997, + "rewards/chosen": -2.5294137001037598, + "rewards/margins": 1.1063209772109985, + "rewards/rejected": -3.6357345581054688, "step": 2460 }, { "epoch": 0.65, - "grad_norm": 8.375, + "grad_norm": 7.78125, "learning_rate": 1.6750724840517103e-06, - "logits/chosen": -1.4542334079742432, - "logits/rejected": -1.3746201992034912, - "logps/chosen": -482.7185974121094, - "logps/rejected": -584.7723999023438, - "loss": 0.504, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.2991232872009277, - "rewards/margins": 0.9794435501098633, - "rewards/rejected": -3.278566837310791, + "logits/chosen": -1.3619472980499268, + "logits/rejected": -1.2863503694534302, + "logps/chosen": -506.430908203125, + "logps/rejected": -603.09228515625, + "loss": 0.5196, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5362462997436523, + "rewards/margins": 0.925518810749054, + "rewards/rejected": -3.4617652893066406, "step": 2470 }, { "epoch": 0.65, - "grad_norm": 12.0625, + "grad_norm": 14.375, "learning_rate": 1.6535421040486686e-06, - "logits/chosen": -1.2772949934005737, - "logits/rejected": -1.177643895149231, - "logps/chosen": -497.01861572265625, - "logps/rejected": -593.6337890625, - "loss": 0.424, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.4366674423217773, - "rewards/margins": 1.3086668252944946, - "rewards/rejected": -3.7453346252441406, + "logits/chosen": -1.175429105758667, + "logits/rejected": -1.0819575786590576, + "logps/chosen": -522.57373046875, + "logps/rejected": -610.5762939453125, + "loss": 0.4362, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.69221830368042, + "rewards/margins": 1.2225408554077148, + "rewards/rejected": -3.914759874343872, "step": 2480 }, { "epoch": 0.65, - "grad_norm": 15.5, + "grad_norm": 12.25, "learning_rate": 1.6320824030363458e-06, - "logits/chosen": -1.351653814315796, - "logits/rejected": -1.2928129434585571, - "logps/chosen": -490.3855895996094, - "logps/rejected": -596.65380859375, - "loss": 0.465, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.5555453300476074, - "rewards/margins": 1.217242956161499, - "rewards/rejected": -3.7727882862091064, + "logits/chosen": -1.2581863403320312, + "logits/rejected": -1.1994664669036865, + "logps/chosen": -505.96783447265625, + "logps/rejected": -609.1953735351562, + "loss": 0.4515, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7113680839538574, + "rewards/margins": 1.186835527420044, + "rewards/rejected": -3.8982033729553223, "step": 2490 }, { "epoch": 0.65, - "grad_norm": 14.5625, + "grad_norm": 13.75, "learning_rate": 1.6106951728936028e-06, - "logits/chosen": -1.4520965814590454, - "logits/rejected": -1.318485975265503, - "logps/chosen": -513.4244995117188, - "logps/rejected": -615.5191040039062, - "loss": 0.4963, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.5622081756591797, - "rewards/margins": 1.0996941328048706, - "rewards/rejected": -3.661902666091919, + "logits/chosen": -1.3734843730926514, + "logits/rejected": -1.2433956861495972, + "logps/chosen": -518.4763793945312, + "logps/rejected": -614.0827026367188, + "loss": 0.4983, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6127266883850098, + "rewards/margins": 1.034812569618225, + "rewards/rejected": -3.6475391387939453, "step": 2500 }, { "epoch": 0.65, - "eval_logits/chosen": -1.3239047527313232, - "eval_logits/rejected": -1.1944924592971802, - "eval_logps/chosen": -525.2288208007812, - "eval_logps/rejected": -621.9060668945312, - "eval_loss": 0.49328407645225525, - "eval_rewards/accuracies": 0.7390000224113464, - "eval_rewards/chosen": -2.6057679653167725, - "eval_rewards/margins": 1.1672067642211914, - "eval_rewards/rejected": -3.7729744911193848, - "eval_runtime": 384.9389, - "eval_samples_per_second": 5.196, - "eval_steps_per_second": 0.649, + "eval_logits/chosen": -1.2522040605545044, + "eval_logits/rejected": -1.1292414665222168, + "eval_logps/chosen": -521.7777709960938, + "eval_logps/rejected": -610.1890258789062, + "eval_loss": 0.49058130383491516, + "eval_rewards/accuracies": 0.7409999966621399, + "eval_rewards/chosen": -2.5712568759918213, + "eval_rewards/margins": 1.0845470428466797, + "eval_rewards/rejected": -3.655803918838501, + "eval_runtime": 384.7732, + "eval_samples_per_second": 5.198, + "eval_steps_per_second": 0.65, "step": 2500 }, { "epoch": 0.66, - "grad_norm": 10.6875, + "grad_norm": 8.4375, "learning_rate": 1.5893821994479996e-06, - "logits/chosen": -1.4442546367645264, - "logits/rejected": -1.3286397457122803, - "logps/chosen": -525.9605102539062, - "logps/rejected": -608.4495239257812, - "loss": 0.475, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.4855237007141113, - "rewards/margins": 1.2105185985565186, - "rewards/rejected": -3.696042537689209, + "logits/chosen": -1.372878909111023, + "logits/rejected": -1.2597870826721191, + "logps/chosen": -519.8887939453125, + "logps/rejected": -593.8539428710938, + "loss": 0.476, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.424806594848633, + "rewards/margins": 1.1252799034118652, + "rewards/rejected": -3.550086259841919, "step": 2510 }, { "epoch": 0.66, - "grad_norm": 7.90625, + "grad_norm": 9.0625, "learning_rate": 1.5681452623266868e-06, - "logits/chosen": -1.4192955493927002, - "logits/rejected": -1.174787998199463, - "logps/chosen": -559.93603515625, - "logps/rejected": -632.15234375, - "loss": 0.4838, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.7089059352874756, - "rewards/margins": 1.304880976676941, - "rewards/rejected": -4.013787269592285, + "logits/chosen": -1.347572684288025, + "logits/rejected": -1.115192174911499, + "logps/chosen": -546.6536254882812, + "logps/rejected": -608.205078125, + "loss": 0.478, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.5760815143585205, + "rewards/margins": 1.198232889175415, + "rewards/rejected": -3.7743141651153564, "step": 2520 }, { "epoch": 0.66, - "grad_norm": 5.9375, + "grad_norm": 7.9375, "learning_rate": 1.5469861348078014e-06, - "logits/chosen": -1.420716643333435, - "logits/rejected": -1.2741087675094604, - "logps/chosen": -522.9864501953125, - "logps/rejected": -641.4890747070312, - "loss": 0.4193, + "logits/chosen": -1.3562158346176147, + "logits/rejected": -1.2117723226547241, + "logps/chosen": -505.29254150390625, + "logps/rejected": -614.58251953125, + "loss": 0.4407, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.7621614933013916, - "rewards/margins": 1.2914037704467773, - "rewards/rejected": -4.05356502532959, + "rewards/chosen": -2.5852229595184326, + "rewards/margins": 1.1992766857147217, + "rewards/rejected": -3.784499406814575, "step": 2530 }, { "epoch": 0.66, - "grad_norm": 9.4375, + "grad_norm": 10.3125, "learning_rate": 1.5259065836724035e-06, - "logits/chosen": -1.2834079265594482, - "logits/rejected": -1.2232940196990967, - "logps/chosen": -522.1956176757812, - "logps/rejected": -655.6646728515625, - "loss": 0.4116, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.773864269256592, - "rewards/margins": 1.3960521221160889, - "rewards/rejected": -4.16991662979126, + "logits/chosen": -1.2109121084213257, + "logits/rejected": -1.152276635169983, + "logps/chosen": -509.5875549316406, + "logps/rejected": -634.964111328125, + "loss": 0.4268, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6477839946746826, + "rewards/margins": 1.3151264190673828, + "rewards/rejected": -3.9629104137420654, "step": 2540 }, { "epoch": 0.67, - "grad_norm": 19.5, + "grad_norm": 19.375, "learning_rate": 1.5049083690569456e-06, - "logits/chosen": -1.3350251913070679, - "logits/rejected": -1.2278568744659424, - "logps/chosen": -531.40380859375, - "logps/rejected": -651.0161743164062, - "loss": 0.5176, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.939335584640503, - "rewards/margins": 1.2402127981185913, - "rewards/rejected": -4.179548740386963, + "logits/chosen": -1.2700594663619995, + "logits/rejected": -1.166520118713379, + "logps/chosen": -509.182861328125, + "logps/rejected": -621.1192626953125, + "loss": 0.5163, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.71712589263916, + "rewards/margins": 1.1634531021118164, + "rewards/rejected": -3.8805785179138184, "step": 2550 }, { "epoch": 0.67, - "grad_norm": 14.0625, + "grad_norm": 16.5, "learning_rate": 1.4839932443063057e-06, - "logits/chosen": -1.339280366897583, - "logits/rejected": -1.1727148294448853, - "logps/chosen": -578.1463623046875, - "logps/rejected": -648.4544677734375, - "loss": 0.4648, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.86167573928833, - "rewards/margins": 1.2659341096878052, - "rewards/rejected": -4.127610206604004, + "logits/chosen": -1.275468349456787, + "logits/rejected": -1.1098088026046753, + "logps/chosen": -555.3331909179688, + "logps/rejected": -615.7780151367188, + "loss": 0.4743, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6335442066192627, + "rewards/margins": 1.167301058769226, + "rewards/rejected": -3.8008453845977783, "step": 2560 }, { "epoch": 0.67, - "grad_norm": 22.875, + "grad_norm": 18.0, "learning_rate": 1.4631629558273803e-06, - "logits/chosen": -1.3597378730773926, - "logits/rejected": -1.2507550716400146, - "logps/chosen": -532.3685302734375, - "logps/rejected": -612.3403930664062, - "loss": 0.6361, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.9064340591430664, - "rewards/margins": 0.9161791801452637, - "rewards/rejected": -3.822613477706909, + "logits/chosen": -1.2889525890350342, + "logits/rejected": -1.1872795820236206, + "logps/chosen": -510.55615234375, + "logps/rejected": -586.6162109375, + "loss": 0.6102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.688310146331787, + "rewards/margins": 0.8770621418952942, + "rewards/rejected": -3.5653719902038574, "step": 2570 }, { "epoch": 0.68, - "grad_norm": 5.09375, + "grad_norm": 6.71875, "learning_rate": 1.4424192429432657e-06, - "logits/chosen": -1.4379812479019165, - "logits/rejected": -1.3553606271743774, - "logps/chosen": -497.770263671875, - "logps/rejected": -619.8333129882812, - "loss": 0.4692, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.4189367294311523, - "rewards/margins": 1.1874587535858154, - "rewards/rejected": -3.606395721435547, + "logits/chosen": -1.359438419342041, + "logits/rejected": -1.2795076370239258, + "logps/chosen": -480.8011779785156, + "logps/rejected": -599.15966796875, + "loss": 0.4647, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.249246120452881, + "rewards/margins": 1.150412917137146, + "rewards/rejected": -3.3996593952178955, "step": 2580 }, { "epoch": 0.68, - "grad_norm": 14.9375, + "grad_norm": 13.1875, "learning_rate": 1.421763837748016e-06, - "logits/chosen": -1.4066466093063354, - "logits/rejected": -1.3101098537445068, - "logps/chosen": -501.1058654785156, - "logps/rejected": -613.1395263671875, - "loss": 0.4538, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.5047993659973145, - "rewards/margins": 1.2053807973861694, - "rewards/rejected": -3.7101802825927734, + "logits/chosen": -1.326791763305664, + "logits/rejected": -1.2331459522247314, + "logps/chosen": -485.2764587402344, + "logps/rejected": -594.4434814453125, + "loss": 0.4524, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.346505641937256, + "rewards/margins": 1.1767139434814453, + "rewards/rejected": -3.523219585418701, "step": 2590 }, { "epoch": 0.68, - "grad_norm": 10.125, + "grad_norm": 10.8125, "learning_rate": 1.401198464962021e-06, - "logits/chosen": -1.4462471008300781, - "logits/rejected": -1.2622243165969849, - "logps/chosen": -535.8546142578125, - "logps/rejected": -603.2532348632812, - "loss": 0.4663, + "logits/chosen": -1.3617570400238037, + "logits/rejected": -1.1875524520874023, + "logps/chosen": -524.5842895507812, + "logps/rejected": -588.7896728515625, + "loss": 0.4746, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.672712564468384, - "rewards/margins": 1.0602437257766724, - "rewards/rejected": -3.7329559326171875, + "rewards/chosen": -2.5600085258483887, + "rewards/margins": 1.0283123254776, + "rewards/rejected": -3.5883209705352783, "step": 2600 }, { "epoch": 0.68, - "eval_logits/chosen": -1.3264377117156982, - "eval_logits/rejected": -1.1991208791732788, - "eval_logps/chosen": -532.6129760742188, - "eval_logps/rejected": -628.5565795898438, - "eval_loss": 0.4950037896633148, - "eval_rewards/accuracies": 0.7450000047683716, - "eval_rewards/chosen": -2.67961049079895, - "eval_rewards/margins": 1.1598690748214722, - "eval_rewards/rejected": -3.839479446411133, - "eval_runtime": 384.8048, - "eval_samples_per_second": 5.197, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.2491270303726196, + "eval_logits/rejected": -1.1266547441482544, + "eval_logps/chosen": -523.2234497070312, + "eval_logps/rejected": -616.9339599609375, + "eval_loss": 0.4946673512458801, + "eval_rewards/accuracies": 0.7365000247955322, + "eval_rewards/chosen": -2.585714340209961, + "eval_rewards/margins": 1.1375384330749512, + "eval_rewards/rejected": -3.723253011703491, + "eval_runtime": 385.1919, + "eval_samples_per_second": 5.192, + "eval_steps_per_second": 0.649, "step": 2600 }, { "epoch": 0.68, - "grad_norm": 10.9375, + "grad_norm": 10.1875, "learning_rate": 1.3807248417879896e-06, - "logits/chosen": -1.4817050695419312, - "logits/rejected": -1.368260145187378, - "logps/chosen": -536.787109375, - "logps/rejected": -646.0281982421875, - "loss": 0.4362, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.6372110843658447, - "rewards/margins": 1.2936350107192993, - "rewards/rejected": -3.9308464527130127, + "logits/chosen": -1.3990509510040283, + "logits/rejected": -1.2910901308059692, + "logps/chosen": -524.749267578125, + "logps/rejected": -631.2271728515625, + "loss": 0.445, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.5168325901031494, + "rewards/margins": 1.2660022974014282, + "rewards/rejected": -3.782834529876709, "step": 2610 }, { "epoch": 0.69, - "grad_norm": 22.75, + "grad_norm": 25.875, "learning_rate": 1.3603446777675665e-06, - "logits/chosen": -1.3277854919433594, - "logits/rejected": -1.207275629043579, - "logps/chosen": -556.1530151367188, - "logps/rejected": -648.6631469726562, - "loss": 0.5338, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.9295554161071777, - "rewards/margins": 1.1459153890609741, - "rewards/rejected": -4.075470924377441, + "logits/chosen": -1.2434417009353638, + "logits/rejected": -1.1283738613128662, + "logps/chosen": -539.6519165039062, + "logps/rejected": -630.5535888671875, + "loss": 0.5282, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7645440101623535, + "rewards/margins": 1.129831314086914, + "rewards/rejected": -3.8943753242492676, "step": 2620 }, { "epoch": 0.69, - "grad_norm": 8.3125, + "grad_norm": 7.5625, "learning_rate": 1.3400596746385817e-06, - "logits/chosen": -1.4622247219085693, - "logits/rejected": -1.293874979019165, - "logps/chosen": -548.2469482421875, - "logps/rejected": -635.0680541992188, - "loss": 0.5024, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.7906343936920166, - "rewards/margins": 1.13905930519104, - "rewards/rejected": -3.9296936988830566, + "logits/chosen": -1.3770835399627686, + "logits/rejected": -1.216672658920288, + "logps/chosen": -541.1361694335938, + "logps/rejected": -622.9951171875, + "loss": 0.5016, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7195262908935547, + "rewards/margins": 1.0894376039505005, + "rewards/rejected": -3.8089637756347656, "step": 2630 }, { "epoch": 0.69, - "grad_norm": 9.125, + "grad_norm": 9.3125, "learning_rate": 1.3198715261929587e-06, - "logits/chosen": -1.4278955459594727, - "logits/rejected": -1.2781603336334229, - "logps/chosen": -530.7630615234375, - "logps/rejected": -637.7789306640625, - "loss": 0.4257, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.943067789077759, - "rewards/margins": 1.2065026760101318, - "rewards/rejected": -4.149571418762207, + "logits/chosen": -1.344639539718628, + "logits/rejected": -1.1973941326141357, + "logps/chosen": -521.10888671875, + "logps/rejected": -628.2103881835938, + "loss": 0.4222, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8465256690979004, + "rewards/margins": 1.2073593139648438, + "rewards/rejected": -4.053884983062744, "step": 2640 }, { "epoch": 0.69, - "grad_norm": 8.1875, + "grad_norm": 7.96875, "learning_rate": 1.2997819181352823e-06, - "logits/chosen": -1.4591927528381348, - "logits/rejected": -1.3012304306030273, - "logps/chosen": -570.8213500976562, - "logps/rejected": -686.0819091796875, - "loss": 0.432, + "logits/chosen": -1.3569964170455933, + "logits/rejected": -1.2025775909423828, + "logps/chosen": -566.9078369140625, + "logps/rejected": -691.9054565429688, + "loss": 0.4043, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.7398808002471924, - "rewards/margins": 1.3976715803146362, - "rewards/rejected": -4.137551784515381, + "rewards/chosen": -2.7007460594177246, + "rewards/margins": 1.4950422048568726, + "rewards/rejected": -4.195788383483887, "step": 2650 }, { "epoch": 0.7, - "grad_norm": 21.375, + "grad_norm": 16.375, "learning_rate": 1.2797925279420454e-06, - "logits/chosen": -1.4334865808486938, - "logits/rejected": -1.2914705276489258, - "logps/chosen": -569.4708862304688, - "logps/rejected": -681.2939453125, - "loss": 0.4801, + "logits/chosen": -1.3312625885009766, + "logits/rejected": -1.1907614469528198, + "logps/chosen": -577.4212646484375, + "logps/rejected": -690.229248046875, + "loss": 0.4911, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.9758732318878174, - "rewards/margins": 1.2384196519851685, - "rewards/rejected": -4.214293003082275, + "rewards/chosen": -3.055377244949341, + "rewards/margins": 1.2482696771621704, + "rewards/rejected": -4.303646564483643, "step": 2660 }, { "epoch": 0.7, - "grad_norm": 13.625, + "grad_norm": 16.875, "learning_rate": 1.2599050247215764e-06, - "logits/chosen": -1.3719279766082764, - "logits/rejected": -1.2636692523956299, - "logps/chosen": -553.9847412109375, - "logps/rejected": -650.590576171875, - "loss": 0.4906, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.9605300426483154, - "rewards/margins": 1.196462869644165, - "rewards/rejected": -4.1569929122924805, + "logits/chosen": -1.2753608226776123, + "logits/rejected": -1.1736326217651367, + "logps/chosen": -555.798828125, + "logps/rejected": -654.16357421875, + "loss": 0.4766, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.978670835494995, + "rewards/margins": 1.214051365852356, + "rewards/rejected": -4.192722320556641, "step": 2670 }, { "epoch": 0.7, - "grad_norm": 13.5625, + "grad_norm": 12.0, "learning_rate": 1.2401210690746705e-06, - "logits/chosen": -1.407362699508667, - "logits/rejected": -1.252087950706482, - "logps/chosen": -551.7230834960938, - "logps/rejected": -632.700439453125, - "loss": 0.4983, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.845663547515869, - "rewards/margins": 1.1418297290802002, - "rewards/rejected": -3.9874930381774902, + "logits/chosen": -1.3060388565063477, + "logits/rejected": -1.1588232517242432, + "logps/chosen": -556.8359985351562, + "logps/rejected": -636.2288818359375, + "loss": 0.5018, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8967931270599365, + "rewards/margins": 1.125984787940979, + "rewards/rejected": -4.022777557373047, "step": 2680 }, { "epoch": 0.7, - "grad_norm": 13.0, + "grad_norm": 10.625, "learning_rate": 1.2204423129559306e-06, - "logits/chosen": -1.4424464702606201, - "logits/rejected": -1.3769545555114746, - "logps/chosen": -529.952880859375, - "logps/rejected": -642.1946411132812, - "loss": 0.5035, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.6443989276885986, - "rewards/margins": 1.1635662317276, - "rewards/rejected": -3.807965040206909, + "logits/chosen": -1.3615459203720093, + "logits/rejected": -1.3014076948165894, + "logps/chosen": -538.6215209960938, + "logps/rejected": -644.1961669921875, + "loss": 0.5168, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.731085777282715, + "rewards/margins": 1.0968948602676392, + "rewards/rejected": -3.8279807567596436, "step": 2690 }, { "epoch": 0.71, - "grad_norm": 18.625, + "grad_norm": 15.1875, "learning_rate": 1.20087039953583e-06, - "logits/chosen": -1.4672467708587646, - "logits/rejected": -1.3367975950241089, - "logps/chosen": -524.6241455078125, - "logps/rejected": -619.1414794921875, - "loss": 0.5286, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.604055881500244, - "rewards/margins": 1.2088748216629028, - "rewards/rejected": -3.8129310607910156, + "logits/chosen": -1.375808596611023, + "logits/rejected": -1.252746820449829, + "logps/chosen": -531.059326171875, + "logps/rejected": -624.744140625, + "loss": 0.514, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.668408155441284, + "rewards/margins": 1.2005492448806763, + "rewards/rejected": -3.86895751953125, "step": 2700 }, { "epoch": 0.71, - "eval_logits/chosen": -1.330871820449829, - "eval_logits/rejected": -1.2033063173294067, - "eval_logps/chosen": -528.7828979492188, - "eval_logps/rejected": -622.6273193359375, - "eval_loss": 0.49607598781585693, - "eval_rewards/accuracies": 0.7379999756813049, - "eval_rewards/chosen": -2.6413092613220215, - "eval_rewards/margins": 1.1388777494430542, - "eval_rewards/rejected": -3.7801873683929443, - "eval_runtime": 384.8793, - "eval_samples_per_second": 5.196, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.2462804317474365, + "eval_logits/rejected": -1.1248236894607544, + "eval_logps/chosen": -534.3994140625, + "eval_logps/rejected": -625.0958251953125, + "eval_loss": 0.4923916161060333, + "eval_rewards/accuracies": 0.7354999780654907, + "eval_rewards/chosen": -2.6974740028381348, + "eval_rewards/margins": 1.1073981523513794, + "eval_rewards/rejected": -3.8048720359802246, + "eval_runtime": 385.0439, + "eval_samples_per_second": 5.194, + "eval_steps_per_second": 0.649, "step": 2700 }, { "epoch": 0.71, - "grad_norm": 12.1875, + "grad_norm": 13.125, "learning_rate": 1.181406963063507e-06, - "logits/chosen": -1.3730335235595703, - "logits/rejected": -1.3149739503860474, - "logps/chosen": -519.0841064453125, - "logps/rejected": -629.9993896484375, - "loss": 0.5014, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.532372236251831, - "rewards/margins": 1.1152244806289673, - "rewards/rejected": -3.647596836090088, + "logits/chosen": -1.2778210639953613, + "logits/rejected": -1.228360652923584, + "logps/chosen": -523.0855102539062, + "logps/rejected": -629.9219970703125, + "loss": 0.5097, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.572385787963867, + "rewards/margins": 1.0744374990463257, + "rewards/rejected": -3.6468231678009033, "step": 2710 }, { "epoch": 0.71, - "grad_norm": 5.875, + "grad_norm": 6.6875, "learning_rate": 1.1620536287303052e-06, - "logits/chosen": -1.4739983081817627, - "logits/rejected": -1.3339478969573975, - "logps/chosen": -543.446533203125, - "logps/rejected": -608.2566528320312, - "loss": 0.5447, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.522928476333618, - "rewards/margins": 0.9496552348136902, - "rewards/rejected": -3.472583770751953, + "logits/chosen": -1.3865063190460205, + "logits/rejected": -1.2557927370071411, + "logps/chosen": -545.7744750976562, + "logps/rejected": -609.2724609375, + "loss": 0.5395, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.5462071895599365, + "rewards/margins": 0.9365339279174805, + "rewards/rejected": -3.482741117477417, "step": 2720 }, { "epoch": 0.71, - "grad_norm": 10.5, + "grad_norm": 9.3125, "learning_rate": 1.1428120125340717e-06, - "logits/chosen": -1.4052727222442627, - "logits/rejected": -1.2576491832733154, - "logps/chosen": -496.2265625, - "logps/rejected": -606.283203125, - "loss": 0.4235, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.4208767414093018, - "rewards/margins": 1.5151195526123047, - "rewards/rejected": -3.9359962940216064, + "logits/chosen": -1.3251538276672363, + "logits/rejected": -1.1808980703353882, + "logps/chosen": -494.53924560546875, + "logps/rejected": -603.8756103515625, + "loss": 0.3923, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.404003143310547, + "rewards/margins": 1.5079169273376465, + "rewards/rejected": -3.9119198322296143, "step": 2730 }, { "epoch": 0.72, - "grad_norm": 10.4375, + "grad_norm": 10.125, "learning_rate": 1.123683721144223e-06, - "logits/chosen": -1.4060310125350952, - "logits/rejected": -1.3006138801574707, - "logps/chosen": -531.5321655273438, - "logps/rejected": -636.8844604492188, - "loss": 0.4255, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.4785878658294678, - "rewards/margins": 1.376922845840454, - "rewards/rejected": -3.85551118850708, + "logits/chosen": -1.319456696510315, + "logits/rejected": -1.213781714439392, + "logps/chosen": -539.8772583007812, + "logps/rejected": -638.1966552734375, + "loss": 0.44, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5620384216308594, + "rewards/margins": 1.3065942525863647, + "rewards/rejected": -3.8686325550079346, "step": 2740 }, { "epoch": 0.72, - "grad_norm": 7.375, + "grad_norm": 6.25, "learning_rate": 1.1046703517675848e-06, - "logits/chosen": -1.4380762577056885, - "logits/rejected": -1.3517663478851318, - "logps/chosen": -497.983154296875, - "logps/rejected": -609.3760986328125, - "loss": 0.5127, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.428668975830078, - "rewards/margins": 1.0526988506317139, - "rewards/rejected": -3.481367588043213, + "logits/chosen": -1.3422720432281494, + "logits/rejected": -1.2605860233306885, + "logps/chosen": -512.2991943359375, + "logps/rejected": -620.3077392578125, + "loss": 0.522, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.5718300342559814, + "rewards/margins": 1.018854022026062, + "rewards/rejected": -3.590684175491333, "step": 2750 }, { "epoch": 0.72, - "grad_norm": 11.3125, + "grad_norm": 10.75, "learning_rate": 1.085773492015028e-06, - "logits/chosen": -1.422131896018982, - "logits/rejected": -1.2450854778289795, - "logps/chosen": -488.3145446777344, - "logps/rejected": -581.1353149414062, - "loss": 0.4374, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.4193508625030518, - "rewards/margins": 1.271308183670044, - "rewards/rejected": -3.6906590461730957, + "logits/chosen": -1.3229783773422241, + "logits/rejected": -1.1519359350204468, + "logps/chosen": -497.25701904296875, + "logps/rejected": -590.8815307617188, + "loss": 0.4271, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.508775472640991, + "rewards/margins": 1.2793452739715576, + "rewards/rejected": -3.788120985031128, "step": 2760 }, { "epoch": 0.72, - "grad_norm": 11.1875, + "grad_norm": 32.0, "learning_rate": 1.0669947197689034e-06, - "logits/chosen": -1.3928449153900146, - "logits/rejected": -1.2680118083953857, - "logps/chosen": -527.7369995117188, - "logps/rejected": -608.68408203125, - "loss": 0.4956, + "logits/chosen": -1.2877874374389648, + "logits/rejected": -1.1616923809051514, + "logps/chosen": -543.9298095703125, + "logps/rejected": -625.6560668945312, + "loss": 0.487, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.5510122776031494, - "rewards/margins": 1.0964926481246948, - "rewards/rejected": -3.6475048065185547, + "rewards/chosen": -2.712940216064453, + "rewards/margins": 1.1042835712432861, + "rewards/rejected": -3.8172237873077393, "step": 2770 }, { "epoch": 0.73, - "grad_norm": 10.5, + "grad_norm": 9.4375, "learning_rate": 1.048335603051291e-06, - "logits/chosen": -1.3895783424377441, - "logits/rejected": -1.2499104738235474, - "logps/chosen": -566.0472412109375, - "logps/rejected": -672.4845581054688, - "loss": 0.4484, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.7824342250823975, - "rewards/margins": 1.3463845252990723, - "rewards/rejected": -4.128818511962891, + "logits/chosen": -1.282389521598816, + "logits/rejected": -1.1512477397918701, + "logps/chosen": -572.5489501953125, + "logps/rejected": -676.9873046875, + "loss": 0.4351, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8474509716033936, + "rewards/margins": 1.3263962268829346, + "rewards/rejected": -4.173847198486328, "step": 2780 }, { "epoch": 0.73, - "grad_norm": 19.25, + "grad_norm": 9.0, "learning_rate": 1.0297976998930665e-06, - "logits/chosen": -1.3868653774261475, - "logits/rejected": -1.2727091312408447, - "logps/chosen": -521.1253662109375, - "logps/rejected": -633.4533081054688, - "loss": 0.44, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.605109453201294, - "rewards/margins": 1.4010969400405884, - "rewards/rejected": -4.006206035614014, + "logits/chosen": -1.2781507968902588, + "logits/rejected": -1.1678388118743896, + "logps/chosen": -534.2879638671875, + "logps/rejected": -643.2774047851562, + "loss": 0.4393, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7367353439331055, + "rewards/margins": 1.3677116632461548, + "rewards/rejected": -4.104446887969971, "step": 2790 }, { "epoch": 0.73, - "grad_norm": 9.4375, + "grad_norm": 8.4375, "learning_rate": 1.0113825582038078e-06, - "logits/chosen": -1.4213166236877441, - "logits/rejected": -1.304223656654358, - "logps/chosen": -538.0697021484375, - "logps/rejected": -635.2910766601562, - "loss": 0.4564, + "logits/chosen": -1.3029879331588745, + "logits/rejected": -1.196803092956543, + "logps/chosen": -556.0444946289062, + "logps/rejected": -652.0103149414062, + "loss": 0.4662, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.727999210357666, - "rewards/margins": 1.1396801471710205, - "rewards/rejected": -3.8676788806915283, + "rewards/chosen": -2.9077467918395996, + "rewards/margins": 1.127124547958374, + "rewards/rejected": -4.0348711013793945, "step": 2800 }, { "epoch": 0.73, - "eval_logits/chosen": -1.3305258750915527, - "eval_logits/rejected": -1.2037560939788818, - "eval_logps/chosen": -532.7353515625, - "eval_logps/rejected": -627.1752319335938, - "eval_loss": 0.49248310923576355, - "eval_rewards/accuracies": 0.7404999732971191, - "eval_rewards/chosen": -2.680833578109741, - "eval_rewards/margins": 1.1448326110839844, - "eval_rewards/rejected": -3.8256664276123047, - "eval_runtime": 384.8109, - "eval_samples_per_second": 5.197, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.2345499992370605, + "eval_logits/rejected": -1.1134350299835205, + "eval_logps/chosen": -547.6557006835938, + "eval_logps/rejected": -641.2913208007812, + "eval_loss": 0.4899207055568695, + "eval_rewards/accuracies": 0.7379999756813049, + "eval_rewards/chosen": -2.830036163330078, + "eval_rewards/margins": 1.1367909908294678, + "eval_rewards/rejected": -3.966827154159546, + "eval_runtime": 384.9651, + "eval_samples_per_second": 5.195, + "eval_steps_per_second": 0.649, "step": 2800 }, { "epoch": 0.74, - "grad_norm": 10.5, + "grad_norm": 9.5625, "learning_rate": 9.930917156425477e-07, - "logits/chosen": -1.4014348983764648, - "logits/rejected": -1.2874269485473633, - "logps/chosen": -547.7164306640625, - "logps/rejected": -650.8362426757812, - "loss": 0.5363, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.8659284114837646, - "rewards/margins": 1.0600517988204956, - "rewards/rejected": -3.9259800910949707, + "logits/chosen": -1.2949634790420532, + "logits/rejected": -1.183593988418579, + "logps/chosen": -563.5440673828125, + "logps/rejected": -668.3243408203125, + "loss": 0.5295, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0242040157318115, + "rewards/margins": 1.0766557455062866, + "rewards/rejected": -4.100859642028809, "step": 2810 }, { "epoch": 0.74, - "grad_norm": 23.5, + "grad_norm": 19.0, "learning_rate": 9.749266994893756e-07, - "logits/chosen": -1.3308923244476318, - "logits/rejected": -1.20591139793396, - "logps/chosen": -509.127685546875, - "logps/rejected": -585.4031982421875, - "loss": 0.5619, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.684295177459717, - "rewards/margins": 0.8970636129379272, - "rewards/rejected": -3.5813584327697754, + "logits/chosen": -1.2192089557647705, + "logits/rejected": -1.0985405445098877, + "logps/chosen": -531.9083251953125, + "logps/rejected": -606.6322021484375, + "loss": 0.5603, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9121012687683105, + "rewards/margins": 0.8815471529960632, + "rewards/rejected": -3.7936484813690186, "step": 2820 }, { "epoch": 0.74, - "grad_norm": 12.0625, + "grad_norm": 12.125, "learning_rate": 9.56889026517913e-07, - "logits/chosen": -1.388494610786438, - "logits/rejected": -1.2768661975860596, - "logps/chosen": -543.159423828125, - "logps/rejected": -621.0648193359375, - "loss": 0.5044, + "logits/chosen": -1.2642897367477417, + "logits/rejected": -1.1569067239761353, + "logps/chosen": -561.394287109375, + "logps/rejected": -641.754638671875, + "loss": 0.5072, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.818211078643799, - "rewards/margins": 1.039044737815857, - "rewards/rejected": -3.857255458831787, + "rewards/chosen": -3.0005598068237305, + "rewards/margins": 1.063594102859497, + "rewards/rejected": -4.064153671264648, "step": 2830 }, { "epoch": 0.74, - "grad_norm": 6.90625, + "grad_norm": 7.40625, "learning_rate": 9.389802028686617e-07, - "logits/chosen": -1.4692569971084595, - "logits/rejected": -1.3720782995224, - "logps/chosen": -529.861328125, - "logps/rejected": -575.4440307617188, - "loss": 0.5904, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.690932035446167, - "rewards/margins": 0.7855546474456787, - "rewards/rejected": -3.4764866828918457, + "logits/chosen": -1.3579823970794678, + "logits/rejected": -1.2555077075958252, + "logps/chosen": -551.67626953125, + "logps/rejected": -596.185546875, + "loss": 0.5982, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.90908145904541, + "rewards/margins": 0.7748203277587891, + "rewards/rejected": -3.6839020252227783, "step": 2840 }, { "epoch": 0.75, - "grad_norm": 10.5, + "grad_norm": 9.75, "learning_rate": 9.212017239232427e-07, - "logits/chosen": -1.4099429845809937, - "logits/rejected": -1.2403053045272827, - "logps/chosen": -530.2276611328125, - "logps/rejected": -631.5889892578125, - "loss": 0.4731, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.5727314949035645, - "rewards/margins": 1.2429524660110474, - "rewards/rejected": -3.8156840801239014, + "logits/chosen": -1.2956401109695435, + "logits/rejected": -1.1352595090866089, + "logps/chosen": -550.6188354492188, + "logps/rejected": -647.8556518554688, + "loss": 0.4704, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7766430377960205, + "rewards/margins": 1.2017085552215576, + "rewards/rejected": -3.97835111618042, "step": 2850 }, { "epoch": 0.75, - "grad_norm": 9.625, + "grad_norm": 9.875, "learning_rate": 9.03555074179533e-07, - "logits/chosen": -1.3713786602020264, - "logits/rejected": -1.3497127294540405, - "logps/chosen": -513.812744140625, - "logps/rejected": -644.496826171875, - "loss": 0.4514, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.570328950881958, - "rewards/margins": 1.2485467195510864, - "rewards/rejected": -3.818875551223755, + "logits/chosen": -1.2600593566894531, + "logits/rejected": -1.2393784523010254, + "logps/chosen": -524.3843994140625, + "logps/rejected": -654.7698364257812, + "loss": 0.4337, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.6760458946228027, + "rewards/margins": 1.2455599308013916, + "rewards/rejected": -3.921605348587036, "step": 2860 }, { "epoch": 0.75, - "grad_norm": 20.0, + "grad_norm": 20.5, "learning_rate": 8.860417271277067e-07, - "logits/chosen": -1.4884029626846313, - "logits/rejected": -1.4525179862976074, - "logps/chosen": -530.30712890625, - "logps/rejected": -616.304931640625, - "loss": 0.4918, + "logits/chosen": -1.3854873180389404, + "logits/rejected": -1.3558924198150635, + "logps/chosen": -545.82568359375, + "logps/rejected": -628.0182495117188, + "loss": 0.4992, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.6107208728790283, - "rewards/margins": 0.9398597478866577, - "rewards/rejected": -3.5505805015563965, + "rewards/chosen": -2.765906810760498, + "rewards/margins": 0.901807963848114, + "rewards/rejected": -3.6677145957946777, "step": 2870 }, { "epoch": 0.75, - "grad_norm": 8.25, + "grad_norm": 10.125, "learning_rate": 8.686631451272029e-07, - "logits/chosen": -1.4650144577026367, - "logits/rejected": -1.3026695251464844, - "logps/chosen": -529.6368408203125, - "logps/rejected": -623.4691162109375, - "loss": 0.4882, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.762089252471924, - "rewards/margins": 1.1926974058151245, - "rewards/rejected": -3.954786777496338, + "logits/chosen": -1.3561471700668335, + "logits/rejected": -1.2010211944580078, + "logps/chosen": -551.3495483398438, + "logps/rejected": -639.118896484375, + "loss": 0.5022, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.979217052459717, + "rewards/margins": 1.1320674419403076, + "rewards/rejected": -4.1112847328186035, "step": 2880 }, { "epoch": 0.76, - "grad_norm": 7.03125, + "grad_norm": 9.3125, "learning_rate": 8.514207792846168e-07, - "logits/chosen": -1.4712326526641846, - "logits/rejected": -1.3452935218811035, - "logps/chosen": -522.875244140625, - "logps/rejected": -606.8411254882812, - "loss": 0.5049, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.720153570175171, - "rewards/margins": 1.1239979267120361, - "rewards/rejected": -3.844151735305786, + "logits/chosen": -1.3641732931137085, + "logits/rejected": -1.2438944578170776, + "logps/chosen": -541.0029296875, + "logps/rejected": -626.8678588867188, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.901430130004883, + "rewards/margins": 1.142988681793213, + "rewards/rejected": -4.044418811798096, "step": 2890 }, { "epoch": 0.76, - "grad_norm": 9.6875, + "grad_norm": 8.5, "learning_rate": 8.343160693325356e-07, - "logits/chosen": -1.3627361059188843, - "logits/rejected": -1.2458826303482056, - "logps/chosen": -535.0531005859375, - "logps/rejected": -643.9103393554688, - "loss": 0.5166, + "logits/chosen": -1.2573918104171753, + "logits/rejected": -1.1431939601898193, + "logps/chosen": -554.5100708007812, + "logps/rejected": -662.68212890625, + "loss": 0.5111, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.795009136199951, - "rewards/margins": 1.1317135095596313, - "rewards/rejected": -3.926722764968872, + "rewards/chosen": -2.98957896232605, + "rewards/margins": 1.1248613595962524, + "rewards/rejected": -4.11444091796875, "step": 2900 }, { "epoch": 0.76, - "eval_logits/chosen": -1.3309776782989502, - "eval_logits/rejected": -1.2046185731887817, - "eval_logps/chosen": -542.6776733398438, - "eval_logps/rejected": -634.599365234375, - "eval_loss": 0.4903542995452881, - "eval_rewards/accuracies": 0.7415000200271606, - "eval_rewards/chosen": -2.7802560329437256, - "eval_rewards/margins": 1.1196515560150146, - "eval_rewards/rejected": -3.8999080657958984, - "eval_runtime": 384.9371, - "eval_samples_per_second": 5.196, + "eval_logits/chosen": -1.2396172285079956, + "eval_logits/rejected": -1.1188315153121948, + "eval_logps/chosen": -558.570556640625, + "eval_logps/rejected": -650.9627075195312, + "eval_loss": 0.48732802271842957, + "eval_rewards/accuracies": 0.7404999732971191, + "eval_rewards/chosen": -2.9391860961914062, + "eval_rewards/margins": 1.1243551969528198, + "eval_rewards/rejected": -4.063540935516357, + "eval_runtime": 385.3295, + "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 2900 }, { "epoch": 0.76, - "grad_norm": 7.65625, + "grad_norm": 7.8125, "learning_rate": 8.173504435093174e-07, - "logits/chosen": -1.3742562532424927, - "logits/rejected": -1.193378210067749, - "logps/chosen": -514.3562622070312, - "logps/rejected": -601.44091796875, - "loss": 0.4823, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.727877378463745, - "rewards/margins": 1.1920711994171143, - "rewards/rejected": -3.9199485778808594, + "logits/chosen": -1.252179741859436, + "logits/rejected": -1.0778075456619263, + "logps/chosen": -531.073974609375, + "logps/rejected": -619.1007690429688, + "loss": 0.4851, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.895054817199707, + "rewards/margins": 1.2014925479888916, + "rewards/rejected": -4.096547603607178, "step": 2910 }, { "epoch": 0.76, - "grad_norm": 6.6875, + "grad_norm": 6.84375, "learning_rate": 8.00525318439836e-07, - "logits/chosen": -1.4089447259902954, - "logits/rejected": -1.2591418027877808, - "logps/chosen": -555.609375, - "logps/rejected": -642.9563598632812, - "loss": 0.5406, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.7878005504608154, - "rewards/margins": 0.9498197436332703, - "rewards/rejected": -3.7376205921173096, + "logits/chosen": -1.2942620515823364, + "logits/rejected": -1.1525405645370483, + "logps/chosen": -569.043701171875, + "logps/rejected": -657.7420043945312, + "loss": 0.5304, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.9221444129943848, + "rewards/margins": 0.9633318185806274, + "rewards/rejected": -3.8854763507843018, "step": 2920 }, { "epoch": 0.77, - "grad_norm": 8.625, + "grad_norm": 7.53125, "learning_rate": 7.838420990171927e-07, - "logits/chosen": -1.4908835887908936, - "logits/rejected": -1.3264166116714478, - "logps/chosen": -539.0242919921875, - "logps/rejected": -614.5499267578125, - "loss": 0.5213, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.6965396404266357, - "rewards/margins": 1.0111539363861084, - "rewards/rejected": -3.7076938152313232, + "logits/chosen": -1.3769783973693848, + "logits/rejected": -1.217556357383728, + "logps/chosen": -552.2919921875, + "logps/rejected": -631.7188720703125, + "loss": 0.5073, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8292160034179688, + "rewards/margins": 1.050167202949524, + "rewards/rejected": -3.879383087158203, "step": 2930 }, { "epoch": 0.77, - "grad_norm": 7.90625, + "grad_norm": 9.5, "learning_rate": 7.673021782854084e-07, - "logits/chosen": -1.3615717887878418, - "logits/rejected": -1.2126728296279907, - "logps/chosen": -533.7975463867188, - "logps/rejected": -619.9442138671875, - "loss": 0.4664, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.7310891151428223, - "rewards/margins": 1.2802739143371582, - "rewards/rejected": -4.0113630294799805, + "logits/chosen": -1.2488492727279663, + "logits/rejected": -1.1089154481887817, + "logps/chosen": -549.6131591796875, + "logps/rejected": -629.2005615234375, + "loss": 0.4792, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8892455101013184, + "rewards/margins": 1.214680790901184, + "rewards/rejected": -4.103926658630371, "step": 2940 }, { "epoch": 0.77, - "grad_norm": 8.0625, + "grad_norm": 11.75, "learning_rate": 7.509069373231039e-07, - "logits/chosen": -1.364745855331421, - "logits/rejected": -1.2478384971618652, - "logps/chosen": -535.0496215820312, - "logps/rejected": -594.5074462890625, - "loss": 0.5768, + "logits/chosen": -1.259916067123413, + "logits/rejected": -1.1467456817626953, + "logps/chosen": -547.0595092773438, + "logps/rejected": -607.7587280273438, + "loss": 0.5723, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.823659658432007, - "rewards/margins": 0.8424208760261536, - "rewards/rejected": -3.666080951690674, + "rewards/chosen": -2.9437592029571533, + "rewards/margins": 0.854836106300354, + "rewards/rejected": -3.798595428466797, "step": 2950 }, { "epoch": 0.77, - "grad_norm": 7.25, + "grad_norm": 7.34375, "learning_rate": 7.346577451281822e-07, - "logits/chosen": -1.3784279823303223, - "logits/rejected": -1.2886309623718262, - "logps/chosen": -533.7701416015625, - "logps/rejected": -641.3207397460938, - "loss": 0.4516, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.715505361557007, - "rewards/margins": 1.3165885210037231, - "rewards/rejected": -4.032094478607178, + "logits/chosen": -1.275743007659912, + "logits/rejected": -1.1921640634536743, + "logps/chosen": -545.425537109375, + "logps/rejected": -653.1339111328125, + "loss": 0.4519, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.832059383392334, + "rewards/margins": 1.3181660175323486, + "rewards/rejected": -4.150225639343262, "step": 2960 }, { "epoch": 0.78, - "grad_norm": 22.0, + "grad_norm": 12.25, "learning_rate": 7.185559585035138e-07, - "logits/chosen": -1.413137674331665, - "logits/rejected": -1.2511926889419556, - "logps/chosen": -575.2888793945312, - "logps/rejected": -671.4484252929688, - "loss": 0.4863, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.9251255989074707, - "rewards/margins": 1.120355248451233, - "rewards/rejected": -4.045480728149414, + "logits/chosen": -1.3098807334899902, + "logits/rejected": -1.1533119678497314, + "logps/chosen": -584.9642333984375, + "logps/rejected": -682.4730224609375, + "loss": 0.4797, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.021878957748413, + "rewards/margins": 1.133847951889038, + "rewards/rejected": -4.155727386474609, "step": 2970 }, { "epoch": 0.78, - "grad_norm": 9.25, + "grad_norm": 8.625, "learning_rate": 7.026029219436504e-07, - "logits/chosen": -1.4352123737335205, - "logits/rejected": -1.2644484043121338, - "logps/chosen": -532.7044677734375, - "logps/rejected": -639.6867065429688, - "loss": 0.479, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.804140090942383, - "rewards/margins": 1.197808861732483, - "rewards/rejected": -4.001949310302734, + "logits/chosen": -1.3365461826324463, + "logits/rejected": -1.1761207580566406, + "logps/chosen": -542.1203002929688, + "logps/rejected": -646.118896484375, + "loss": 0.4723, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8982977867126465, + "rewards/margins": 1.1679728031158447, + "rewards/rejected": -4.0662713050842285, "step": 2980 }, { "epoch": 0.78, - "grad_norm": 6.46875, + "grad_norm": 7.0, "learning_rate": 6.867999675225523e-07, - "logits/chosen": -1.4778783321380615, - "logits/rejected": -1.3396053314208984, - "logps/chosen": -498.43896484375, - "logps/rejected": -601.5620727539062, - "loss": 0.4659, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.675211191177368, - "rewards/margins": 1.1733391284942627, - "rewards/rejected": -3.8485500812530518, + "logits/chosen": -1.3771815299987793, + "logits/rejected": -1.2472676038742065, + "logps/chosen": -512.2825317382812, + "logps/rejected": -608.7750854492188, + "loss": 0.487, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8136465549468994, + "rewards/margins": 1.1070338487625122, + "rewards/rejected": -3.920680284500122, "step": 2990 }, { "epoch": 0.79, - "grad_norm": 9.75, + "grad_norm": 10.6875, "learning_rate": 6.711484147823663e-07, - "logits/chosen": -1.3854588270187378, - "logits/rejected": -1.3027629852294922, - "logps/chosen": -500.9271545410156, - "logps/rejected": -624.5755615234375, - "loss": 0.4653, + "logits/chosen": -1.2860959768295288, + "logits/rejected": -1.2111051082611084, + "logps/chosen": -506.64581298828125, + "logps/rejected": -628.4481811523438, + "loss": 0.4758, "rewards/accuracies": 0.75, - "rewards/chosen": -2.667020797729492, - "rewards/margins": 1.1879808902740479, - "rewards/rejected": -3.855001449584961, + "rewards/chosen": -2.7242074012756348, + "rewards/margins": 1.1695196628570557, + "rewards/rejected": -3.8937268257141113, "step": 3000 }, { "epoch": 0.79, - "eval_logits/chosen": -1.3332931995391846, - "eval_logits/rejected": -1.2066706418991089, - "eval_logps/chosen": -544.357421875, - "eval_logps/rejected": -633.0811157226562, - "eval_loss": 0.48963090777397156, - "eval_rewards/accuracies": 0.7425000071525574, - "eval_rewards/chosen": -2.7970540523529053, - "eval_rewards/margins": 1.0876713991165161, - "eval_rewards/rejected": -3.884725332260132, - "eval_runtime": 384.4736, - "eval_samples_per_second": 5.202, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.2526096105575562, + "eval_logits/rejected": -1.1318107843399048, + "eval_logps/chosen": -550.865478515625, + "eval_logps/rejected": -638.7723999023438, + "eval_loss": 0.4866448938846588, + "eval_rewards/accuracies": 0.7409999966621399, + "eval_rewards/chosen": -2.8621349334716797, + "eval_rewards/margins": 1.079501986503601, + "eval_rewards/rejected": -3.9416370391845703, + "eval_runtime": 385.0884, + "eval_samples_per_second": 5.194, + "eval_steps_per_second": 0.649, "step": 3000 }, { "epoch": 0.79, - "grad_norm": 12.6875, + "grad_norm": 10.625, "learning_rate": 6.556495706232413e-07, - "logits/chosen": -1.3909966945648193, - "logits/rejected": -1.2924126386642456, - "logps/chosen": -558.5319213867188, - "logps/rejected": -642.2532958984375, - "loss": 0.5411, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.8983654975891113, - "rewards/margins": 1.0214240550994873, - "rewards/rejected": -3.9197897911071777, + "logits/chosen": -1.2896664142608643, + "logits/rejected": -1.1979024410247803, + "logps/chosen": -560.0714721679688, + "logps/rejected": -646.5289916992188, + "loss": 0.5296, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9137609004974365, + "rewards/margins": 1.0487867593765259, + "rewards/rejected": -3.9625473022460938, "step": 3010 }, { "epoch": 0.79, - "grad_norm": 9.75, + "grad_norm": 9.6875, "learning_rate": 6.403047291942057e-07, - "logits/chosen": -1.316248893737793, - "logits/rejected": -1.1616556644439697, - "logps/chosen": -507.49603271484375, - "logps/rejected": -592.3514404296875, - "loss": 0.4966, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.8122828006744385, - "rewards/margins": 1.092459797859192, - "rewards/rejected": -3.904742479324341, + "logits/chosen": -1.2192307710647583, + "logits/rejected": -1.0712454319000244, + "logps/chosen": -515.818115234375, + "logps/rejected": -601.6507568359375, + "loss": 0.4944, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.895503520965576, + "rewards/margins": 1.1022310256958008, + "rewards/rejected": -3.997734785079956, "step": 3020 }, { "epoch": 0.79, - "grad_norm": 13.1875, + "grad_norm": 12.6875, "learning_rate": 6.251151717851023e-07, - "logits/chosen": -1.3809759616851807, - "logits/rejected": -1.300438642501831, - "logps/chosen": -501.9111328125, - "logps/rejected": -600.9503784179688, - "loss": 0.4907, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.685105323791504, - "rewards/margins": 1.123300313949585, - "rewards/rejected": -3.8084053993225098, + "logits/chosen": -1.2880637645721436, + "logits/rejected": -1.2091928720474243, + "logps/chosen": -509.5738220214844, + "logps/rejected": -608.5218505859375, + "loss": 0.4853, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7617316246032715, + "rewards/margins": 1.1223886013031006, + "rewards/rejected": -3.884120464324951, "step": 3030 }, { "epoch": 0.8, - "grad_norm": 5.96875, + "grad_norm": 6.40625, "learning_rate": 6.100821667196041e-07, - "logits/chosen": -1.5608792304992676, - "logits/rejected": -1.2841638326644897, - "logps/chosen": -542.8932495117188, - "logps/rejected": -584.8834228515625, - "loss": 0.4858, + "logits/chosen": -1.4694463014602661, + "logits/rejected": -1.2010104656219482, + "logps/chosen": -551.3878173828125, + "logps/rejected": -589.3790283203125, + "loss": 0.4979, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.672549247741699, - "rewards/margins": 1.0747697353363037, - "rewards/rejected": -3.747319459915161, + "rewards/chosen": -2.7574946880340576, + "rewards/margins": 1.0347812175750732, + "rewards/rejected": -3.792275905609131, "step": 3040 }, { "epoch": 0.8, - "grad_norm": 96.5, + "grad_norm": 29.5, "learning_rate": 5.952069692493062e-07, - "logits/chosen": -1.364072561264038, - "logits/rejected": -1.2446839809417725, - "logps/chosen": -493.5611267089844, - "logps/rejected": -623.0722045898438, - "loss": 0.4197, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.652458906173706, - "rewards/margins": 1.2607505321502686, - "rewards/rejected": -3.9132094383239746, + "logits/chosen": -1.2609448432922363, + "logits/rejected": -1.1505969762802124, + "logps/chosen": -498.6568908691406, + "logps/rejected": -627.9306640625, + "loss": 0.4171, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.703416347503662, + "rewards/margins": 1.2583777904510498, + "rewards/rejected": -3.961793899536133, "step": 3050 }, { "epoch": 0.8, - "grad_norm": 9.25, + "grad_norm": 10.625, "learning_rate": 5.80490821448918e-07, - "logits/chosen": -1.3073358535766602, - "logits/rejected": -1.3144903182983398, - "logps/chosen": -533.8553466796875, - "logps/rejected": -709.3834838867188, - "loss": 0.4358, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.7133326530456543, - "rewards/margins": 1.3306279182434082, - "rewards/rejected": -4.043961048126221, + "logits/chosen": -1.216658353805542, + "logits/rejected": -1.2167049646377563, + "logps/chosen": -540.7564086914062, + "logps/rejected": -711.563232421875, + "loss": 0.4298, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7823424339294434, + "rewards/margins": 1.2834153175354004, + "rewards/rejected": -4.065757751464844, "step": 3060 }, { "epoch": 0.8, - "grad_norm": 9.0625, + "grad_norm": 9.5625, "learning_rate": 5.659349521125459e-07, - "logits/chosen": -1.5068944692611694, - "logits/rejected": -1.4476134777069092, - "logps/chosen": -550.5355224609375, - "logps/rejected": -633.8616943359375, - "loss": 0.5084, + "logits/chosen": -1.4194704294204712, + "logits/rejected": -1.3601640462875366, + "logps/chosen": -555.782958984375, + "logps/rejected": -634.6406860351562, + "loss": 0.5047, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6911702156066895, - "rewards/margins": 1.0392811298370361, - "rewards/rejected": -3.7304511070251465, + "rewards/chosen": -2.743645191192627, + "rewards/margins": 0.9945963025093079, + "rewards/rejected": -3.7382407188415527, "step": 3070 }, { "epoch": 0.81, - "grad_norm": 8.0, + "grad_norm": 6.4375, "learning_rate": 5.5154057665109e-07, - "logits/chosen": -1.4646342992782593, - "logits/rejected": -1.306074857711792, - "logps/chosen": -547.6588745117188, - "logps/rejected": -648.3570556640625, - "loss": 0.4836, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.8676917552948, - "rewards/margins": 1.2712510824203491, - "rewards/rejected": -4.138943672180176, + "logits/chosen": -1.3637388944625854, + "logits/rejected": -1.216048240661621, + "logps/chosen": -546.4483642578125, + "logps/rejected": -646.1047973632812, + "loss": 0.4807, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.855586528778076, + "rewards/margins": 1.2608329057693481, + "rewards/rejected": -4.116419792175293, "step": 3080 }, { "epoch": 0.81, - "grad_norm": 8.3125, + "grad_norm": 11.8125, "learning_rate": 5.373088969907586e-07, - "logits/chosen": -1.4882913827896118, - "logits/rejected": -1.3172584772109985, - "logps/chosen": -559.1968994140625, - "logps/rejected": -619.5064697265625, - "loss": 0.4568, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.8238325119018555, - "rewards/margins": 1.1052324771881104, - "rewards/rejected": -3.929064989089966, + "logits/chosen": -1.3931351900100708, + "logits/rejected": -1.2272682189941406, + "logps/chosen": -558.13232421875, + "logps/rejected": -618.197265625, + "loss": 0.4482, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8131866455078125, + "rewards/margins": 1.1027860641479492, + "rewards/rejected": -3.915972948074341, "step": 3090 }, { "epoch": 0.81, - "grad_norm": 6.53125, + "grad_norm": 7.53125, "learning_rate": 5.23241101472709e-07, - "logits/chosen": -1.4091695547103882, - "logits/rejected": -1.2783794403076172, - "logps/chosen": -545.93359375, - "logps/rejected": -631.8021240234375, - "loss": 0.4808, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -2.696319580078125, - "rewards/margins": 1.0787622928619385, - "rewards/rejected": -3.7750816345214844, + "logits/chosen": -1.3162554502487183, + "logits/rejected": -1.1940876245498657, + "logps/chosen": -549.4010009765625, + "logps/rejected": -625.9002075195312, + "loss": 0.4908, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7309937477111816, + "rewards/margins": 0.9850690960884094, + "rewards/rejected": -3.7160630226135254, "step": 3100 }, { "epoch": 0.81, - "eval_logits/chosen": -1.32783842086792, - "eval_logits/rejected": -1.2009104490280151, - "eval_logps/chosen": -546.65625, - "eval_logps/rejected": -639.3413696289062, - "eval_loss": 0.4900914132595062, - "eval_rewards/accuracies": 0.7409999966621399, - "eval_rewards/chosen": -2.820042133331299, - "eval_rewards/margins": 1.1272854804992676, - "eval_rewards/rejected": -3.9473278522491455, - "eval_runtime": 384.996, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.2554689645767212, + "eval_logits/rejected": -1.1347445249557495, + "eval_logps/chosen": -549.6837158203125, + "eval_logps/rejected": -638.7192993164062, + "eval_loss": 0.4868563115596771, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -2.8503170013427734, + "eval_rewards/margins": 1.0907903909683228, + "eval_rewards/rejected": -3.9411072731018066, + "eval_runtime": 385.4515, + "eval_samples_per_second": 5.189, "eval_steps_per_second": 0.649, "step": 3100 }, { "epoch": 0.81, - "grad_norm": 8.625, + "grad_norm": 8.375, "learning_rate": 5.09338364753818e-07, - "logits/chosen": -1.4804376363754272, - "logits/rejected": -1.3111730813980103, - "logps/chosen": -562.015625, - "logps/rejected": -653.310546875, - "loss": 0.5286, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.7659342288970947, - "rewards/margins": 1.0708248615264893, - "rewards/rejected": -3.836758852005005, + "logits/chosen": -1.3838107585906982, + "logits/rejected": -1.2234851121902466, + "logps/chosen": -565.4810791015625, + "logps/rejected": -655.7274169921875, + "loss": 0.5191, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.800589084625244, + "rewards/margins": 1.0603386163711548, + "rewards/rejected": -3.8609280586242676, "step": 3110 }, { "epoch": 0.82, - "grad_norm": 8.8125, + "grad_norm": 11.0625, "learning_rate": 4.956018477086005e-07, - "logits/chosen": -1.444879174232483, - "logits/rejected": -1.2762501239776611, - "logps/chosen": -557.318115234375, - "logps/rejected": -641.5279541015625, - "loss": 0.5268, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.881852626800537, - "rewards/margins": 1.1156418323516846, - "rewards/rejected": -3.9974944591522217, + "logits/chosen": -1.3474712371826172, + "logits/rejected": -1.1852939128875732, + "logps/chosen": -559.21142578125, + "logps/rejected": -640.77685546875, + "loss": 0.5116, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9007859230041504, + "rewards/margins": 1.0891984701156616, + "rewards/rejected": -3.9899849891662598, "step": 3120 }, { "epoch": 0.82, - "grad_norm": 10.8125, + "grad_norm": 12.125, "learning_rate": 4.820326973322764e-07, - "logits/chosen": -1.3559176921844482, - "logits/rejected": -1.2783609628677368, - "logps/chosen": -545.87255859375, - "logps/rejected": -644.6527099609375, - "loss": 0.5488, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.9490904808044434, - "rewards/margins": 1.0479673147201538, - "rewards/rejected": -3.997058153152466, + "logits/chosen": -1.2560558319091797, + "logits/rejected": -1.1815481185913086, + "logps/chosen": -549.0807495117188, + "logps/rejected": -643.4081420898438, + "loss": 0.5513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9811716079711914, + "rewards/margins": 1.0034395456314087, + "rewards/rejected": -3.9846110343933105, "step": 3130 }, { "epoch": 0.82, - "grad_norm": 16.0, + "grad_norm": 10.5, "learning_rate": 4.686320466449981e-07, - "logits/chosen": -1.3531897068023682, - "logits/rejected": -1.1676143407821655, - "logps/chosen": -509.6865234375, - "logps/rejected": -646.2823486328125, - "loss": 0.4513, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.704374313354492, - "rewards/margins": 1.3875735998153687, - "rewards/rejected": -4.091948509216309, + "logits/chosen": -1.2670228481292725, + "logits/rejected": -1.0823358297348022, + "logps/chosen": -515.7471923828125, + "logps/rejected": -646.492919921875, + "loss": 0.454, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7649807929992676, + "rewards/margins": 1.3290727138519287, + "rewards/rejected": -4.094053745269775, "step": 3140 }, { "epoch": 0.82, - "grad_norm": 7.53125, + "grad_norm": 6.8125, "learning_rate": 4.554010145972418e-07, - "logits/chosen": -1.5110399723052979, - "logits/rejected": -1.3339247703552246, - "logps/chosen": -550.0721435546875, - "logps/rejected": -648.1046752929688, - "loss": 0.5631, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.862560749053955, - "rewards/margins": 1.0935637950897217, - "rewards/rejected": -3.956124782562256, + "logits/chosen": -1.4123005867004395, + "logits/rejected": -1.2410565614700317, + "logps/chosen": -551.8477783203125, + "logps/rejected": -645.891357421875, + "loss": 0.5464, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.880317449569702, + "rewards/margins": 1.0536738634109497, + "rewards/rejected": -3.9339919090270996, "step": 3150 }, { "epoch": 0.83, - "grad_norm": 10.0, + "grad_norm": 8.75, "learning_rate": 4.4234070597637455e-07, - "logits/chosen": -1.3611904382705688, - "logits/rejected": -1.270994782447815, - "logps/chosen": -550.0131225585938, - "logps/rejected": -645.00341796875, - "loss": 0.5164, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.744797468185425, - "rewards/margins": 1.0690962076187134, - "rewards/rejected": -3.8138937950134277, + "logits/chosen": -1.2695270776748657, + "logits/rejected": -1.1814700365066528, + "logps/chosen": -558.7033081054688, + "logps/rejected": -645.794189453125, + "loss": 0.5261, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.831700086593628, + "rewards/margins": 0.990101158618927, + "rewards/rejected": -3.8218014240264893, "step": 3160 }, { "epoch": 0.83, - "grad_norm": 8.6875, + "grad_norm": 6.75, "learning_rate": 4.2945221131440783e-07, - "logits/chosen": -1.3394265174865723, - "logits/rejected": -1.1380027532577515, - "logps/chosen": -532.6522216796875, - "logps/rejected": -629.9595336914062, - "loss": 0.4172, - "rewards/accuracies": 0.84375, - "rewards/chosen": -2.6694254875183105, - "rewards/margins": 1.289540410041809, - "rewards/rejected": -3.958966016769409, + "logits/chosen": -1.244091272354126, + "logits/rejected": -1.0454550981521606, + "logps/chosen": -539.8818359375, + "logps/rejected": -634.0319213867188, + "loss": 0.43, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.7417213916778564, + "rewards/margins": 1.257968544960022, + "rewards/rejected": -3.9996895790100098, "step": 3170 }, { "epoch": 0.83, - "grad_norm": 9.8125, + "grad_norm": 9.6875, "learning_rate": 4.167366067969381e-07, - "logits/chosen": -1.420111060142517, - "logits/rejected": -1.356261134147644, - "logps/chosen": -500.45135498046875, - "logps/rejected": -622.8685302734375, - "loss": 0.4999, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.775409698486328, - "rewards/margins": 0.9858112335205078, - "rewards/rejected": -3.7612204551696777, + "logits/chosen": -1.3269858360290527, + "logits/rejected": -1.2656229734420776, + "logps/chosen": -505.6949157714844, + "logps/rejected": -628.41015625, + "loss": 0.4885, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.827846050262451, + "rewards/margins": 0.9887911677360535, + "rewards/rejected": -3.816636562347412, "step": 3180 }, { "epoch": 0.83, - "grad_norm": 5.78125, + "grad_norm": 7.0, "learning_rate": 4.041949541732826e-07, - "logits/chosen": -1.4244636297225952, - "logits/rejected": -1.3622827529907227, - "logps/chosen": -547.3236694335938, - "logps/rejected": -641.5424194335938, - "loss": 0.5021, + "logits/chosen": -1.327467441558838, + "logits/rejected": -1.272200584411621, + "logps/chosen": -555.987060546875, + "logps/rejected": -642.5946655273438, + "loss": 0.5129, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.83439302444458, - "rewards/margins": 1.0849192142486572, - "rewards/rejected": -3.9193122386932373, + "rewards/chosen": -2.9210267066955566, + "rewards/margins": 1.0088088512420654, + "rewards/rejected": -3.929835557937622, "step": 3190 }, { "epoch": 0.84, - "grad_norm": 18.5, + "grad_norm": 12.1875, "learning_rate": 3.9182830066782614e-07, - "logits/chosen": -1.342223882675171, - "logits/rejected": -1.326421856880188, - "logps/chosen": -541.6400756835938, - "logps/rejected": -670.80126953125, - "loss": 0.4882, + "logits/chosen": -1.2530772686004639, + "logits/rejected": -1.2375959157943726, + "logps/chosen": -542.0306396484375, + "logps/rejected": -671.2916259765625, + "loss": 0.4641, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.8389620780944824, - "rewards/margins": 1.1673671007156372, - "rewards/rejected": -4.006329536437988, + "rewards/chosen": -2.8428683280944824, + "rewards/margins": 1.1683650016784668, + "rewards/rejected": -4.011233329772949, "step": 3200 }, { "epoch": 0.84, - "eval_logits/chosen": -1.3354921340942383, - "eval_logits/rejected": -1.2087724208831787, - "eval_logps/chosen": -541.2137451171875, - "eval_logps/rejected": -633.5067749023438, - "eval_loss": 0.48956310749053955, - "eval_rewards/accuracies": 0.7440000176429749, - "eval_rewards/chosen": -2.7656171321868896, - "eval_rewards/margins": 1.1233649253845215, - "eval_rewards/rejected": -3.8889822959899902, - "eval_runtime": 384.9676, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.2554447650909424, + "eval_logits/rejected": -1.1346678733825684, + "eval_logps/chosen": -545.7666015625, + "eval_logps/rejected": -634.5078735351562, + "eval_loss": 0.48661333322525024, + "eval_rewards/accuracies": 0.7404999732971191, + "eval_rewards/chosen": -2.8111462593078613, + "eval_rewards/margins": 1.0878463983535767, + "eval_rewards/rejected": -3.8989927768707275, + "eval_runtime": 385.3303, + "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.649, "step": 3200 }, { "epoch": 0.84, - "grad_norm": 7.09375, + "grad_norm": 8.375, "learning_rate": 3.796376788925771e-07, - "logits/chosen": -1.3472093343734741, - "logits/rejected": -1.27553129196167, - "logps/chosen": -528.8458251953125, - "logps/rejected": -600.9093627929688, - "loss": 0.5081, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.674180507659912, - "rewards/margins": 0.9610416293144226, - "rewards/rejected": -3.6352221965789795, + "logits/chosen": -1.264981985092163, + "logits/rejected": -1.1978137493133545, + "logps/chosen": -532.4588623046875, + "logps/rejected": -602.8772583007812, + "loss": 0.5036, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7103111743927, + "rewards/margins": 0.944588840007782, + "rewards/rejected": -3.654900074005127, "step": 3210 }, { "epoch": 0.84, - "grad_norm": 7.8125, + "grad_norm": 6.78125, "learning_rate": 3.676241067609465e-07, - "logits/chosen": -1.4311882257461548, - "logits/rejected": -1.3133292198181152, - "logps/chosen": -566.4583740234375, - "logps/rejected": -629.7789306640625, - "loss": 0.516, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.7275829315185547, - "rewards/margins": 1.0640867948532104, - "rewards/rejected": -3.791670322418213, + "logits/chosen": -1.3384299278259277, + "logits/rejected": -1.2301527261734009, + "logps/chosen": -568.9376220703125, + "logps/rejected": -628.9427490234375, + "loss": 0.5105, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7523765563964844, + "rewards/margins": 1.0309317111968994, + "rewards/rejected": -3.7833080291748047, "step": 3220 }, { "epoch": 0.85, - "grad_norm": 13.3125, + "grad_norm": 11.625, "learning_rate": 3.5578858740274976e-07, - "logits/chosen": -1.3556554317474365, - "logits/rejected": -1.2464677095413208, - "logps/chosen": -546.6044311523438, - "logps/rejected": -628.5745849609375, - "loss": 0.5223, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.848706007003784, - "rewards/margins": 0.9455618858337402, - "rewards/rejected": -3.7942676544189453, + "logits/chosen": -1.2620373964309692, + "logits/rejected": -1.1610171794891357, + "logps/chosen": -548.6265258789062, + "logps/rejected": -628.5254516601562, + "loss": 0.5183, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8689258098602295, + "rewards/margins": 0.9248504638671875, + "rewards/rejected": -3.793776273727417, "step": 3230 }, { "epoch": 0.85, - "grad_norm": 10.875, + "grad_norm": 12.0625, "learning_rate": 3.44132109080447e-07, - "logits/chosen": -1.5402499437332153, - "logits/rejected": -1.3627904653549194, - "logps/chosen": -532.7853393554688, - "logps/rejected": -620.8280029296875, - "loss": 0.4349, + "logits/chosen": -1.4505221843719482, + "logits/rejected": -1.2806892395019531, + "logps/chosen": -536.9176025390625, + "logps/rejected": -614.2163696289062, + "loss": 0.4513, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.6756184101104736, - "rewards/margins": 1.2552212476730347, - "rewards/rejected": -3.9308395385742188, + "rewards/chosen": -2.71694016456604, + "rewards/margins": 1.147782802581787, + "rewards/rejected": -3.864722490310669, "step": 3240 }, { "epoch": 0.85, - "grad_norm": 11.625, + "grad_norm": 10.375, "learning_rate": 3.3265564510662344e-07, - "logits/chosen": -1.4746288061141968, - "logits/rejected": -1.3366063833236694, - "logps/chosen": -556.5170288085938, - "logps/rejected": -653.8280029296875, - "loss": 0.4486, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.631749391555786, - "rewards/margins": 1.22456693649292, - "rewards/rejected": -3.856316328048706, + "logits/chosen": -1.377443790435791, + "logits/rejected": -1.2464927434921265, + "logps/chosen": -556.8729858398438, + "logps/rejected": -654.7142333984375, + "loss": 0.4331, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6353092193603516, + "rewards/margins": 1.2298697233200073, + "rewards/rejected": -3.8651790618896484, "step": 3250 }, { "epoch": 0.85, - "grad_norm": 13.4375, + "grad_norm": 16.0, "learning_rate": 3.213601537627195e-07, - "logits/chosen": -1.3857685327529907, - "logits/rejected": -1.2827407121658325, - "logps/chosen": -551.5367431640625, - "logps/rejected": -642.1528930664062, - "loss": 0.5368, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.944179058074951, - "rewards/margins": 1.0771982669830322, - "rewards/rejected": -4.021376609802246, + "logits/chosen": -1.2895920276641846, + "logits/rejected": -1.1866865158081055, + "logps/chosen": -556.0447998046875, + "logps/rejected": -639.7942504882812, + "loss": 0.5502, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.989259719848633, + "rewards/margins": 1.0085315704345703, + "rewards/rejected": -3.997791290283203, "step": 3260 }, { "epoch": 0.86, - "grad_norm": 13.1875, + "grad_norm": 12.3125, "learning_rate": 3.1024657821901063e-07, - "logits/chosen": -1.4493802785873413, - "logits/rejected": -1.3605637550354004, - "logps/chosen": -506.7554626464844, - "logps/rejected": -613.6812744140625, - "loss": 0.4775, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.590136766433716, - "rewards/margins": 1.2277183532714844, - "rewards/rejected": -3.8178551197052, + "logits/chosen": -1.3556302785873413, + "logits/rejected": -1.2743966579437256, + "logps/chosen": -520.0645141601562, + "logps/rejected": -612.8271484375, + "loss": 0.5058, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.72322678565979, + "rewards/margins": 1.0860865116119385, + "rewards/rejected": -3.8093135356903076, "step": 3270 }, { "epoch": 0.86, - "grad_norm": 14.1875, + "grad_norm": 13.625, "learning_rate": 2.9931584645585654e-07, - "logits/chosen": -1.3915107250213623, - "logits/rejected": -1.3630131483078003, - "logps/chosen": -540.7659912109375, - "logps/rejected": -647.734375, - "loss": 0.5041, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.6625149250030518, - "rewards/margins": 1.0405315160751343, - "rewards/rejected": -3.7030467987060547, + "logits/chosen": -1.3034099340438843, + "logits/rejected": -1.271439790725708, + "logps/chosen": -548.0056762695312, + "logps/rejected": -648.6056518554688, + "loss": 0.514, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.734912395477295, + "rewards/margins": 0.9768469929695129, + "rewards/rejected": -3.711759090423584, "step": 3280 }, { "epoch": 0.86, - "grad_norm": 5.5625, + "grad_norm": 5.9375, "learning_rate": 2.885688711862136e-07, - "logits/chosen": -1.4039568901062012, - "logits/rejected": -1.4016002416610718, - "logps/chosen": -542.4031372070312, - "logps/rejected": -669.4109497070312, - "loss": 0.4979, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.813516139984131, - "rewards/margins": 1.2897857427597046, - "rewards/rejected": -4.103302001953125, + "logits/chosen": -1.3113230466842651, + "logits/rejected": -1.3101108074188232, + "logps/chosen": -549.2462768554688, + "logps/rejected": -667.9041748046875, + "loss": 0.5153, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8819470405578613, + "rewards/margins": 1.2062867879867554, + "rewards/rejected": -4.0882344245910645, "step": 3290 }, { "epoch": 0.86, - "grad_norm": 13.9375, + "grad_norm": 12.1875, "learning_rate": 2.7800654977942486e-07, - "logits/chosen": -1.3982176780700684, - "logits/rejected": -1.2750059366226196, - "logps/chosen": -529.8203735351562, - "logps/rejected": -631.67431640625, - "loss": 0.5123, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.7246382236480713, - "rewards/margins": 1.056668996810913, - "rewards/rejected": -3.7813076972961426, + "logits/chosen": -1.3042861223220825, + "logits/rejected": -1.1825424432754517, + "logps/chosen": -531.413818359375, + "logps/rejected": -631.4432373046875, + "loss": 0.5096, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.740572452545166, + "rewards/margins": 1.0384232997894287, + "rewards/rejected": -3.778996229171753, "step": 3300 }, { "epoch": 0.86, - "eval_logits/chosen": -1.3351857662200928, - "eval_logits/rejected": -1.208348274230957, - "eval_logps/chosen": -542.1024780273438, - "eval_logps/rejected": -634.3662109375, - "eval_loss": 0.4894912838935852, - "eval_rewards/accuracies": 0.7434999942779541, - "eval_rewards/chosen": -2.7745048999786377, - "eval_rewards/margins": 1.123070478439331, - "eval_rewards/rejected": -3.8975753784179688, - "eval_runtime": 384.9043, - "eval_samples_per_second": 5.196, - "eval_steps_per_second": 0.65, + "eval_logits/chosen": -1.2585511207580566, + "eval_logits/rejected": -1.1378772258758545, + "eval_logps/chosen": -544.573974609375, + "eval_logps/rejected": -633.404052734375, + "eval_loss": 0.4864084720611572, + "eval_rewards/accuracies": 0.7394999861717224, + "eval_rewards/chosen": -2.7992191314697266, + "eval_rewards/margins": 1.0887356996536255, + "eval_rewards/rejected": -3.8879551887512207, + "eval_runtime": 385.2344, + "eval_samples_per_second": 5.192, + "eval_steps_per_second": 0.649, "step": 3300 }, { "epoch": 0.87, - "grad_norm": 14.3125, + "grad_norm": 15.8125, "learning_rate": 2.6762976418628797e-07, - "logits/chosen": -1.434815764427185, - "logits/rejected": -1.2825909852981567, - "logps/chosen": -497.67559814453125, - "logps/rejected": -557.65478515625, - "loss": 0.5421, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.67899751663208, - "rewards/margins": 1.0358701944351196, - "rewards/rejected": -3.714867353439331, + "logits/chosen": -1.345733880996704, + "logits/rejected": -1.2021456956863403, + "logps/chosen": -504.2335510253906, + "logps/rejected": -557.3888549804688, + "loss": 0.5433, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.744576930999756, + "rewards/margins": 0.9676315188407898, + "rewards/rejected": -3.7122085094451904, "step": 3310 }, { "epoch": 0.87, - "grad_norm": 17.0, + "grad_norm": 10.25, "learning_rate": 2.5743938086541354e-07, - "logits/chosen": -1.4120391607284546, - "logits/rejected": -1.282286524772644, - "logps/chosen": -537.319580078125, - "logps/rejected": -624.9915771484375, - "loss": 0.4846, + "logits/chosen": -1.3191635608673096, + "logits/rejected": -1.1912063360214233, + "logps/chosen": -537.1017456054688, + "logps/rejected": -623.8556518554688, + "loss": 0.4854, "rewards/accuracies": 0.75, - "rewards/chosen": -2.730372428894043, - "rewards/margins": 1.1250317096710205, - "rewards/rejected": -3.8554039001464844, + "rewards/chosen": -2.728193998336792, + "rewards/margins": 1.1158511638641357, + "rewards/rejected": -3.8440451622009277, "step": 3320 }, { "epoch": 0.87, - "grad_norm": 13.125, + "grad_norm": 12.5625, "learning_rate": 2.4743625071087574e-07, - "logits/chosen": -1.5484386682510376, - "logits/rejected": -1.370973825454712, - "logps/chosen": -544.7464599609375, - "logps/rejected": -646.3916625976562, - "loss": 0.479, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.6717612743377686, - "rewards/margins": 1.311167597770691, - "rewards/rejected": -3.982929229736328, + "logits/chosen": -1.4545891284942627, + "logits/rejected": -1.2835543155670166, + "logps/chosen": -547.1002807617188, + "logps/rejected": -641.8110961914062, + "loss": 0.481, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6952996253967285, + "rewards/margins": 1.241824746131897, + "rewards/rejected": -3.937124252319336, "step": 3330 }, { "epoch": 0.87, - "grad_norm": 10.5, + "grad_norm": 11.9375, "learning_rate": 2.3762120898116498e-07, - "logits/chosen": -1.428130865097046, - "logits/rejected": -1.3167780637741089, - "logps/chosen": -561.8053588867188, - "logps/rejected": -654.1575927734375, - "loss": 0.4989, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.930124282836914, - "rewards/margins": 1.0036227703094482, - "rewards/rejected": -3.9337470531463623, + "logits/chosen": -1.3351377248764038, + "logits/rejected": -1.2292808294296265, + "logps/chosen": -560.9166259765625, + "logps/rejected": -651.31103515625, + "loss": 0.4951, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9212374687194824, + "rewards/margins": 0.9840442538261414, + "rewards/rejected": -3.9052817821502686, "step": 3340 }, { "epoch": 0.88, - "grad_norm": 8.5625, + "grad_norm": 9.25, "learning_rate": 2.2799507522944048e-07, - "logits/chosen": -1.361971139907837, - "logits/rejected": -1.2745027542114258, - "logps/chosen": -532.9896240234375, - "logps/rejected": -649.6702270507812, - "loss": 0.4514, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.671921730041504, - "rewards/margins": 1.2491633892059326, - "rewards/rejected": -3.9210853576660156, + "logits/chosen": -1.2632300853729248, + "logits/rejected": -1.1798118352890015, + "logps/chosen": -535.8150634765625, + "logps/rejected": -648.6831665039062, + "loss": 0.4501, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7001757621765137, + "rewards/margins": 1.211038589477539, + "rewards/rejected": -3.9112143516540527, "step": 3350 }, { "epoch": 0.88, - "grad_norm": 11.375, + "grad_norm": 9.5625, "learning_rate": 2.1855865323510056e-07, - "logits/chosen": -1.424619436264038, - "logits/rejected": -1.2369648218154907, - "logps/chosen": -545.8442993164062, - "logps/rejected": -683.6163330078125, - "loss": 0.4292, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.731658458709717, - "rewards/margins": 1.4368107318878174, - "rewards/rejected": -4.168468952178955, + "logits/chosen": -1.3267244100570679, + "logits/rejected": -1.1418159008026123, + "logps/chosen": -551.0586547851562, + "logps/rejected": -682.3759765625, + "loss": 0.4332, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.783801317214966, + "rewards/margins": 1.3722645044326782, + "rewards/rejected": -4.156065940856934, "step": 3360 }, { "epoch": 0.88, - "grad_norm": 8.3125, + "grad_norm": 6.84375, "learning_rate": 2.0931273093666575e-07, - "logits/chosen": -1.3846908807754517, - "logits/rejected": -1.2318280935287476, - "logps/chosen": -527.0106201171875, - "logps/rejected": -627.6875, - "loss": 0.4511, - "rewards/accuracies": 0.84375, - "rewards/chosen": -2.868238925933838, - "rewards/margins": 1.2093108892440796, - "rewards/rejected": -4.077549934387207, + "logits/chosen": -1.2914237976074219, + "logits/rejected": -1.1433568000793457, + "logps/chosen": -527.1336669921875, + "logps/rejected": -625.6705322265625, + "loss": 0.4545, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.8694703578948975, + "rewards/margins": 1.1879098415374756, + "rewards/rejected": -4.057379722595215, "step": 3370 }, { "epoch": 0.88, - "grad_norm": 12.0, + "grad_norm": 12.1875, "learning_rate": 2.002580803659873e-07, - "logits/chosen": -1.3799206018447876, - "logits/rejected": -1.2638591527938843, - "logps/chosen": -536.2264404296875, - "logps/rejected": -633.6256713867188, - "loss": 0.4582, + "logits/chosen": -1.2892788648605347, + "logits/rejected": -1.1720714569091797, + "logps/chosen": -541.9439697265625, + "logps/rejected": -635.7294311523438, + "loss": 0.4668, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.816713333129883, - "rewards/margins": 1.155531406402588, - "rewards/rejected": -3.97224497795105, + "rewards/chosen": -2.873889446258545, + "rewards/margins": 1.1193937063217163, + "rewards/rejected": -3.9932830333709717, "step": 3380 }, { "epoch": 0.89, - "grad_norm": 6.125, + "grad_norm": 5.5625, "learning_rate": 1.913954575837826e-07, - "logits/chosen": -1.455733299255371, - "logits/rejected": -1.1953377723693848, - "logps/chosen": -554.5827026367188, - "logps/rejected": -613.561767578125, - "loss": 0.4729, + "logits/chosen": -1.3597743511199951, + "logits/rejected": -1.1029024124145508, + "logps/chosen": -555.9939575195312, + "logps/rejected": -613.6519775390625, + "loss": 0.4753, "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.8246824741363525, - "rewards/margins": 1.0867326259613037, - "rewards/rejected": -3.9114151000976562, + "rewards/chosen": -2.8387951850891113, + "rewards/margins": 1.0735225677490234, + "rewards/rejected": -3.9123177528381348, "step": 3390 }, { "epoch": 0.89, - "grad_norm": 9.1875, + "grad_norm": 8.875, "learning_rate": 1.827256026165028e-07, - "logits/chosen": -1.46336829662323, - "logits/rejected": -1.2622567415237427, - "logps/chosen": -575.8062744140625, - "logps/rejected": -644.166748046875, - "loss": 0.4526, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.6073572635650635, - "rewards/margins": 1.2542911767959595, - "rewards/rejected": -3.8616480827331543, + "logits/chosen": -1.37373685836792, + "logits/rejected": -1.178899884223938, + "logps/chosen": -578.7473754882812, + "logps/rejected": -641.29345703125, + "loss": 0.455, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.636767864227295, + "rewards/margins": 1.1961476802825928, + "rewards/rejected": -3.8329155445098877, "step": 3400 }, { "epoch": 0.89, - "eval_logits/chosen": -1.3318783044815063, - "eval_logits/rejected": -1.205055594444275, - "eval_logps/chosen": -543.2083129882812, - "eval_logps/rejected": -635.9655151367188, - "eval_loss": 0.4895820915699005, - "eval_rewards/accuracies": 0.7444999814033508, - "eval_rewards/chosen": -2.785562753677368, - "eval_rewards/margins": 1.1280065774917603, - "eval_rewards/rejected": -3.913569450378418, - "eval_runtime": 384.9489, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.2543540000915527, + "eval_logits/rejected": -1.1335822343826294, + "eval_logps/chosen": -545.915283203125, + "eval_logps/rejected": -635.4321899414062, + "eval_loss": 0.48658648133277893, + "eval_rewards/accuracies": 0.7394999861717224, + "eval_rewards/chosen": -2.8126325607299805, + "eval_rewards/margins": 1.0956026315689087, + "eval_rewards/rejected": -3.9082350730895996, + "eval_runtime": 385.1178, + "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 3400 }, { "epoch": 0.89, - "grad_norm": 11.4375, + "grad_norm": 14.25, "learning_rate": 1.7424923939454274e-07, - "logits/chosen": -1.4153268337249756, - "logits/rejected": -1.235887885093689, - "logps/chosen": -558.7437744140625, - "logps/rejected": -640.8536376953125, - "loss": 0.4144, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -2.77394437789917, - "rewards/margins": 1.254831075668335, - "rewards/rejected": -4.028775215148926, + "logits/chosen": -1.3175909519195557, + "logits/rejected": -1.1420743465423584, + "logps/chosen": -559.6712036132812, + "logps/rejected": -638.4949951171875, + "loss": 0.4204, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7832179069519043, + "rewards/margins": 1.2219712734222412, + "rewards/rejected": -4.005189418792725, "step": 3410 }, { "epoch": 0.9, - "grad_norm": 19.25, + "grad_norm": 18.25, "learning_rate": 1.6596707569179304e-07, - "logits/chosen": -1.4984407424926758, - "logits/rejected": -1.3432929515838623, - "logps/chosen": -559.1913452148438, - "logps/rejected": -636.6985473632812, - "loss": 0.491, + "logits/chosen": -1.4023295640945435, + "logits/rejected": -1.2579714059829712, + "logps/chosen": -565.6453857421875, + "logps/rejected": -633.6785278320312, + "loss": 0.5107, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.7800402641296387, - "rewards/margins": 1.1263272762298584, - "rewards/rejected": -3.906367540359497, + "rewards/chosen": -2.844580888748169, + "rewards/margins": 1.0315876007080078, + "rewards/rejected": -3.8761680126190186, "step": 3420 }, { "epoch": 0.9, - "grad_norm": 8.75, + "grad_norm": 9.3125, "learning_rate": 1.578798030665385e-07, - "logits/chosen": -1.4436315298080444, - "logits/rejected": -1.2577316761016846, - "logps/chosen": -546.3856811523438, - "logps/rejected": -664.1823120117188, - "loss": 0.4391, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.7168757915496826, - "rewards/margins": 1.348455786705017, - "rewards/rejected": -4.065331935882568, + "logits/chosen": -1.3531277179718018, + "logits/rejected": -1.1701006889343262, + "logps/chosen": -551.0891723632812, + "logps/rejected": -663.329345703125, + "loss": 0.4451, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7639107704162598, + "rewards/margins": 1.2928920984268188, + "rewards/rejected": -4.056802749633789, "step": 3430 }, { "epoch": 0.9, - "grad_norm": 8.875, + "grad_norm": 8.625, "learning_rate": 1.499880968037165e-07, - "logits/chosen": -1.4255344867706299, - "logits/rejected": -1.2893887758255005, - "logps/chosen": -532.6036376953125, - "logps/rejected": -603.4581298828125, - "loss": 0.5173, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.7699015140533447, - "rewards/margins": 1.071218729019165, - "rewards/rejected": -3.8411202430725098, + "logits/chosen": -1.3360685110092163, + "logits/rejected": -1.204347014427185, + "logps/chosen": -529.5865478515625, + "logps/rejected": -599.6653442382812, + "loss": 0.5141, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7397305965423584, + "rewards/margins": 1.0634615421295166, + "rewards/rejected": -3.803192138671875, "step": 3440 }, { "epoch": 0.9, - "grad_norm": 12.4375, + "grad_norm": 14.5, "learning_rate": 1.4229261585852805e-07, - "logits/chosen": -1.4486864805221558, - "logits/rejected": -1.366389513015747, - "logps/chosen": -536.9041748046875, - "logps/rejected": -628.5606689453125, - "loss": 0.4501, + "logits/chosen": -1.3513076305389404, + "logits/rejected": -1.2741743326187134, + "logps/chosen": -541.47216796875, + "logps/rejected": -626.5045776367188, + "loss": 0.457, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.6847262382507324, - "rewards/margins": 1.1747690439224243, - "rewards/rejected": -3.859494686126709, + "rewards/chosen": -2.730405807495117, + "rewards/margins": 1.1085281372070312, + "rewards/rejected": -3.8389339447021484, "step": 3450 }, { "epoch": 0.91, - "grad_norm": 11.5, + "grad_norm": 10.0625, "learning_rate": 1.3479400280141886e-07, - "logits/chosen": -1.3643203973770142, - "logits/rejected": -1.319645643234253, - "logps/chosen": -524.4163818359375, - "logps/rejected": -644.7371826171875, - "loss": 0.4688, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.781571865081787, - "rewards/margins": 1.223384976387024, - "rewards/rejected": -4.00495719909668, + "logits/chosen": -1.2767086029052734, + "logits/rejected": -1.2311673164367676, + "logps/chosen": -530.0853881835938, + "logps/rejected": -643.3887939453125, + "loss": 0.4882, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8382620811462402, + "rewards/margins": 1.153211236000061, + "rewards/rejected": -3.9914729595184326, "step": 3460 }, { "epoch": 0.91, - "grad_norm": 7.25, + "grad_norm": 9.375, "learning_rate": 1.2749288376442044e-07, - "logits/chosen": -1.4585063457489014, - "logits/rejected": -1.2669061422348022, - "logps/chosen": -567.2780151367188, - "logps/rejected": -623.2936401367188, - "loss": 0.4761, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.70743727684021, - "rewards/margins": 1.135317325592041, - "rewards/rejected": -3.842755079269409, + "logits/chosen": -1.3543965816497803, + "logits/rejected": -1.169668436050415, + "logps/chosen": -568.0045166015625, + "logps/rejected": -621.7847900390625, + "loss": 0.4688, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.714702606201172, + "rewards/margins": 1.1129640340805054, + "rewards/rejected": -3.827666759490967, "step": 3470 }, { "epoch": 0.91, - "grad_norm": 9.1875, + "grad_norm": 9.8125, "learning_rate": 1.203898683888713e-07, - "logits/chosen": -1.4662349224090576, - "logits/rejected": -1.3190191984176636, - "logps/chosen": -531.9495849609375, - "logps/rejected": -622.1035766601562, - "loss": 0.5705, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.879521369934082, - "rewards/margins": 0.9325839281082153, - "rewards/rejected": -3.812105178833008, + "logits/chosen": -1.3755584955215454, + "logits/rejected": -1.2311782836914062, + "logps/chosen": -532.9498901367188, + "logps/rejected": -625.6002197265625, + "loss": 0.5499, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.8895249366760254, + "rewards/margins": 0.9575474858283997, + "rewards/rejected": -3.8470726013183594, "step": 3480 }, { "epoch": 0.91, - "grad_norm": 8.875, + "grad_norm": 9.75, "learning_rate": 1.1348554977451132e-07, - "logits/chosen": -1.4893817901611328, - "logits/rejected": -1.3383376598358154, - "logps/chosen": -555.6965942382812, - "logps/rejected": -626.2882690429688, - "loss": 0.5165, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.737074613571167, - "rewards/margins": 1.034942865371704, - "rewards/rejected": -3.77201771736145, + "logits/chosen": -1.4002835750579834, + "logits/rejected": -1.255327820777893, + "logps/chosen": -557.9288330078125, + "logps/rejected": -629.8707275390625, + "loss": 0.5076, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.759397029876709, + "rewards/margins": 1.0484449863433838, + "rewards/rejected": -3.8078417778015137, "step": 3490 }, { "epoch": 0.92, - "grad_norm": 7.09375, + "grad_norm": 6.75, "learning_rate": 1.0678050442995802e-07, - "logits/chosen": -1.444490671157837, - "logits/rejected": -1.2492029666900635, - "logps/chosen": -563.559326171875, - "logps/rejected": -623.2242431640625, - "loss": 0.5432, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.8276607990264893, - "rewards/margins": 1.0474214553833008, - "rewards/rejected": -3.875082015991211, + "logits/chosen": -1.345915675163269, + "logits/rejected": -1.1581257581710815, + "logps/chosen": -561.4415283203125, + "logps/rejected": -623.0227661132812, + "loss": 0.5262, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8064818382263184, + "rewards/margins": 1.0665854215621948, + "rewards/rejected": -3.8730673789978027, "step": 3500 }, { "epoch": 0.92, - "eval_logits/chosen": -1.331380009651184, - "eval_logits/rejected": -1.2045425176620483, - "eval_logps/chosen": -543.0230712890625, - "eval_logps/rejected": -635.9038696289062, - "eval_loss": 0.4895781874656677, - "eval_rewards/accuracies": 0.7440000176429749, - "eval_rewards/chosen": -2.783710479736328, - "eval_rewards/margins": 1.1292426586151123, - "eval_rewards/rejected": -3.9129531383514404, - "eval_runtime": 385.1815, - "eval_samples_per_second": 5.192, + "eval_logits/chosen": -1.255007028579712, + "eval_logits/rejected": -1.1342185735702515, + "eval_logps/chosen": -545.7534790039062, + "eval_logps/rejected": -635.4207153320312, + "eval_loss": 0.4864389896392822, + "eval_rewards/accuracies": 0.7409999966621399, + "eval_rewards/chosen": -2.8110146522521973, + "eval_rewards/margins": 1.0971060991287231, + "eval_rewards/rejected": -3.908120632171631, + "eval_runtime": 385.1023, + "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 3500 }, { "epoch": 0.92, - "grad_norm": 8.0625, + "grad_norm": 9.0, "learning_rate": 1.0027529222456755e-07, - "logits/chosen": -1.423830509185791, - "logits/rejected": -1.2484782934188843, - "logps/chosen": -528.2448120117188, - "logps/rejected": -626.2152099609375, - "loss": 0.4541, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.7407870292663574, - "rewards/margins": 1.155261754989624, - "rewards/rejected": -3.8960487842559814, + "logits/chosen": -1.3322703838348389, + "logits/rejected": -1.164650321006775, + "logps/chosen": -529.0792236328125, + "logps/rejected": -624.087646484375, + "loss": 0.4498, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7491297721862793, + "rewards/margins": 1.125643014907837, + "rewards/rejected": -3.8747730255126953, "step": 3510 }, { "epoch": 0.92, - "grad_norm": 10.1875, + "grad_norm": 8.375, "learning_rate": 9.397045634168766e-08, - "logits/chosen": -1.4465529918670654, - "logits/rejected": -1.3796955347061157, - "logps/chosen": -536.9094848632812, - "logps/rejected": -664.8760375976562, - "loss": 0.4559, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.6832659244537354, - "rewards/margins": 1.277266502380371, - "rewards/rejected": -3.9605324268341064, + "logits/chosen": -1.36007821559906, + "logits/rejected": -1.2929136753082275, + "logps/chosen": -542.2086181640625, + "logps/rejected": -667.2059326171875, + "loss": 0.4547, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.736257553100586, + "rewards/margins": 1.2475742101669312, + "rewards/rejected": -3.9838318824768066, "step": 3520 }, { "epoch": 0.92, - "grad_norm": 10.125, + "grad_norm": 11.625, "learning_rate": 8.78665232332998e-08, - "logits/chosen": -1.3762016296386719, - "logits/rejected": -1.3040244579315186, - "logps/chosen": -512.94140625, - "logps/rejected": -617.2539672851562, - "loss": 0.4859, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.8223133087158203, - "rewards/margins": 1.0507687330245972, - "rewards/rejected": -3.873081922531128, + "logits/chosen": -1.2781856060028076, + "logits/rejected": -1.210409164428711, + "logps/chosen": -516.8308715820312, + "logps/rejected": -617.1106567382812, + "loss": 0.4795, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8612074851989746, + "rewards/margins": 1.0104413032531738, + "rewards/rejected": -3.8716487884521484, "step": 3530 }, { "epoch": 0.93, - "grad_norm": 9.5, + "grad_norm": 9.375, "learning_rate": 8.196400257606208e-08, - "logits/chosen": -1.4817397594451904, - "logits/rejected": -1.334993839263916, - "logps/chosen": -555.3631591796875, - "logps/rejected": -686.2461547851562, - "loss": 0.4321, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.72278094291687, - "rewards/margins": 1.3615996837615967, - "rewards/rejected": -4.084380626678467, + "logits/chosen": -1.385122537612915, + "logits/rejected": -1.24273681640625, + "logps/chosen": -560.0970458984375, + "logps/rejected": -682.8623046875, + "loss": 0.4425, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7701196670532227, + "rewards/margins": 1.2804229259490967, + "rewards/rejected": -4.05054235458374, "step": 3540 }, { "epoch": 0.93, - "grad_norm": 8.875, + "grad_norm": 9.9375, "learning_rate": 7.626338722875076e-08, - "logits/chosen": -1.423555612564087, - "logits/rejected": -1.3563083410263062, - "logps/chosen": -526.125244140625, - "logps/rejected": -638.4963989257812, - "loss": 0.4891, + "logits/chosen": -1.3233528137207031, + "logits/rejected": -1.269012212753296, + "logps/chosen": -527.5838623046875, + "logps/rejected": -637.5836791992188, + "loss": 0.4828, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.720315933227539, - "rewards/margins": 1.1140156984329224, - "rewards/rejected": -3.83433198928833, + "rewards/chosen": -2.7349019050598145, + "rewards/margins": 1.0903024673461914, + "rewards/rejected": -3.8252041339874268, "step": 3550 }, { "epoch": 0.93, - "grad_norm": 6.125, + "grad_norm": 7.03125, "learning_rate": 7.076515319110688e-08, - "logits/chosen": -1.42192804813385, - "logits/rejected": -1.3248012065887451, - "logps/chosen": -528.8367919921875, - "logps/rejected": -604.7586669921875, - "loss": 0.5312, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.725490093231201, - "rewards/margins": 1.1749989986419678, - "rewards/rejected": -3.900489091873169, + "logits/chosen": -1.3301162719726562, + "logits/rejected": -1.2381629943847656, + "logps/chosen": -530.2079467773438, + "logps/rejected": -606.7450561523438, + "loss": 0.5044, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.739201545715332, + "rewards/margins": 1.1811503171920776, + "rewards/rejected": -3.9203522205352783, "step": 3560 }, { "epoch": 0.93, - "grad_norm": 9.6875, + "grad_norm": 7.90625, "learning_rate": 6.54697595640899e-08, - "logits/chosen": -1.4399796724319458, - "logits/rejected": -1.3037444353103638, - "logps/chosen": -573.2326049804688, - "logps/rejected": -660.8978271484375, - "loss": 0.4778, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.8101038932800293, - "rewards/margins": 1.1283040046691895, - "rewards/rejected": -3.938408374786377, + "logits/chosen": -1.3412398099899292, + "logits/rejected": -1.2113425731658936, + "logps/chosen": -574.6760864257812, + "logps/rejected": -662.1055908203125, + "loss": 0.4814, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8245387077331543, + "rewards/margins": 1.1259464025497437, + "rewards/rejected": -3.9504852294921875, "step": 3570 }, { "epoch": 0.94, - "grad_norm": 8.625, + "grad_norm": 10.125, "learning_rate": 6.037764851154426e-08, - "logits/chosen": -1.425642490386963, - "logits/rejected": -1.3774255514144897, - "logps/chosen": -533.1048583984375, - "logps/rejected": -653.04736328125, - "loss": 0.4959, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.6904561519622803, - "rewards/margins": 1.1518871784210205, - "rewards/rejected": -3.842343807220459, + "logits/chosen": -1.3283928632736206, + "logits/rejected": -1.286163568496704, + "logps/chosen": -535.401611328125, + "logps/rejected": -654.7335205078125, + "loss": 0.4822, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7134242057800293, + "rewards/margins": 1.1457810401916504, + "rewards/rejected": -3.8592045307159424, "step": 3580 }, { "epoch": 0.94, - "grad_norm": 7.5625, + "grad_norm": 6.9375, "learning_rate": 5.548924522327748e-08, - "logits/chosen": -1.4221152067184448, - "logits/rejected": -1.2739677429199219, - "logps/chosen": -533.532958984375, - "logps/rejected": -629.9300537109375, - "loss": 0.4811, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.7177557945251465, - "rewards/margins": 1.110734224319458, - "rewards/rejected": -3.8284904956817627, + "logits/chosen": -1.3209599256515503, + "logits/rejected": -1.182340383529663, + "logps/chosen": -537.6961669921875, + "logps/rejected": -631.2122192382812, + "loss": 0.4825, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7593882083892822, + "rewards/margins": 1.081923484802246, + "rewards/rejected": -3.8413116931915283, "step": 3590 }, { "epoch": 0.94, - "grad_norm": 10.375, + "grad_norm": 10.9375, "learning_rate": 5.0804957879556915e-08, - "logits/chosen": -1.3408573865890503, - "logits/rejected": -1.2565336227416992, - "logps/chosen": -498.77581787109375, - "logps/rejected": -613.9055786132812, - "loss": 0.4617, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.732095241546631, - "rewards/margins": 1.105553150177002, - "rewards/rejected": -3.837648391723633, + "logits/chosen": -1.249495506286621, + "logits/rejected": -1.1639585494995117, + "logps/chosen": -500.98992919921875, + "logps/rejected": -613.1359252929688, + "loss": 0.466, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7542366981506348, + "rewards/margins": 1.0757157802581787, + "rewards/rejected": -3.8299522399902344, "step": 3600 }, { "epoch": 0.94, - "eval_logits/chosen": -1.337357521057129, - "eval_logits/rejected": -1.210402011871338, - "eval_logps/chosen": -543.2186279296875, - "eval_logps/rejected": -636.1134643554688, - "eval_loss": 0.4894636869430542, - "eval_rewards/accuracies": 0.7434999942779541, - "eval_rewards/chosen": -2.785665988922119, - "eval_rewards/margins": 1.1293821334838867, - "eval_rewards/rejected": -3.915048122406006, - "eval_runtime": 384.9877, - "eval_samples_per_second": 5.195, + "eval_logits/chosen": -1.2554669380187988, + "eval_logits/rejected": -1.1347417831420898, + "eval_logps/chosen": -545.9835815429688, + "eval_logps/rejected": -635.6727294921875, + "eval_loss": 0.48658978939056396, + "eval_rewards/accuracies": 0.7400000095367432, + "eval_rewards/chosen": -2.813314914703369, + "eval_rewards/margins": 1.0973262786865234, + "eval_rewards/rejected": -3.9106414318084717, + "eval_runtime": 385.0907, + "eval_samples_per_second": 5.194, "eval_steps_per_second": 0.649, "step": 3600 }, { "epoch": 0.94, - "grad_norm": 12.8125, + "grad_norm": 10.1875, "learning_rate": 4.632517761702815e-08, - "logits/chosen": -1.3654999732971191, - "logits/rejected": -1.2209546566009521, - "logps/chosen": -514.5064697265625, - "logps/rejected": -634.9259033203125, - "loss": 0.451, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.7906320095062256, - "rewards/margins": 1.32748281955719, - "rewards/rejected": -4.118114948272705, + "logits/chosen": -1.2666916847229004, + "logits/rejected": -1.127403974533081, + "logps/chosen": -518.2024536132812, + "logps/rejected": -636.0374145507812, + "loss": 0.4363, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.827592134475708, + "rewards/margins": 1.3016375303268433, + "rewards/rejected": -4.129229545593262, "step": 3610 }, { "epoch": 0.95, - "grad_norm": 11.9375, + "grad_norm": 12.5, "learning_rate": 4.205027849605359e-08, - "logits/chosen": -1.3972408771514893, - "logits/rejected": -1.2909516096115112, - "logps/chosen": -535.4818115234375, - "logps/rejected": -608.0510864257812, - "loss": 0.5396, + "logits/chosen": -1.2991037368774414, + "logits/rejected": -1.2022724151611328, + "logps/chosen": -558.1617431640625, + "logps/rejected": -612.0789794921875, + "loss": 0.68, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.875584363937378, - "rewards/margins": 1.0263030529022217, - "rewards/rejected": -3.9018874168395996, + "rewards/chosen": -3.1023831367492676, + "rewards/margins": 0.8397830128669739, + "rewards/rejected": -3.9421660900115967, "step": 3620 }, { "epoch": 0.95, - "grad_norm": 9.875, + "grad_norm": 9.375, "learning_rate": 3.798061746947995e-08, - "logits/chosen": -1.5177159309387207, - "logits/rejected": -1.354299783706665, - "logps/chosen": -539.2265014648438, - "logps/rejected": -615.7210693359375, - "loss": 0.4806, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.750765800476074, - "rewards/margins": 1.1568615436553955, - "rewards/rejected": -3.907627820968628, + "logits/chosen": -1.4298536777496338, + "logits/rejected": -1.2711670398712158, + "logps/chosen": -541.1964721679688, + "logps/rejected": -615.2937622070312, + "loss": 0.479, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.770465135574341, + "rewards/margins": 1.1328895092010498, + "rewards/rejected": -3.9033546447753906, "step": 3630 }, { "epoch": 0.95, - "grad_norm": 10.5, + "grad_norm": 10.9375, "learning_rate": 3.411653435283158e-08, - "logits/chosen": -1.4375641345977783, - "logits/rejected": -1.2251198291778564, - "logps/chosen": -544.5794677734375, - "logps/rejected": -597.5071411132812, - "loss": 0.4841, + "logits/chosen": -1.3373726606369019, + "logits/rejected": -1.1359134912490845, + "logps/chosen": -546.8034057617188, + "logps/rejected": -593.7394409179688, + "loss": 0.4962, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.713228464126587, - "rewards/margins": 1.0865360498428345, - "rewards/rejected": -3.799764633178711, + "rewards/chosen": -2.7354676723480225, + "rewards/margins": 1.0266190767288208, + "rewards/rejected": -3.762086868286133, "step": 3640 }, { "epoch": 0.96, - "grad_norm": 13.125, + "grad_norm": 7.03125, "learning_rate": 3.04583517959367e-08, - "logits/chosen": -1.4777367115020752, - "logits/rejected": -1.3228670358657837, - "logps/chosen": -516.0816650390625, - "logps/rejected": -592.5147094726562, - "loss": 0.4782, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.6396498680114746, - "rewards/margins": 1.0887558460235596, - "rewards/rejected": -3.728405714035034, + "logits/chosen": -1.3844215869903564, + "logits/rejected": -1.2332738637924194, + "logps/chosen": -517.3903198242188, + "logps/rejected": -597.7869262695312, + "loss": 0.4532, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6527369022369385, + "rewards/margins": 1.128391146659851, + "rewards/rejected": -3.7811279296875, "step": 3650 }, { "epoch": 0.96, - "grad_norm": 15.25, + "grad_norm": 9.0625, "learning_rate": 2.7006375255985984e-08, - "logits/chosen": -1.4119120836257935, - "logits/rejected": -1.3788807392120361, - "logps/chosen": -553.791015625, - "logps/rejected": -641.79443359375, - "loss": 0.5775, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.8898892402648926, - "rewards/margins": 0.8745861053466797, - "rewards/rejected": -3.7644753456115723, + "logits/chosen": -1.3098169565200806, + "logits/rejected": -1.2761331796646118, + "logps/chosen": -556.2361450195312, + "logps/rejected": -640.65283203125, + "loss": 0.5747, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.914341449737549, + "rewards/margins": 0.838718056678772, + "rewards/rejected": -3.753058910369873, "step": 3660 }, { "epoch": 0.96, - "grad_norm": 10.0, + "grad_norm": 10.1875, "learning_rate": 2.3760892972027328e-08, - "logits/chosen": -1.5255684852600098, - "logits/rejected": -1.3464335203170776, - "logps/chosen": -559.984375, - "logps/rejected": -636.6268310546875, - "loss": 0.5395, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.874812602996826, - "rewards/margins": 1.1065670251846313, - "rewards/rejected": -3.981379270553589, + "logits/chosen": -1.4296592473983765, + "logits/rejected": -1.253159761428833, + "logps/chosen": -565.5921020507812, + "logps/rejected": -640.4884643554688, + "loss": 0.5445, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.93088960647583, + "rewards/margins": 1.0891053676605225, + "rewards/rejected": -4.01999568939209, "step": 3670 }, { "epoch": 0.96, - "grad_norm": 12.8125, + "grad_norm": 11.875, "learning_rate": 2.072217594089765e-08, - "logits/chosen": -1.3876298666000366, - "logits/rejected": -1.365395188331604, - "logps/chosen": -542.5714721679688, - "logps/rejected": -655.452880859375, - "loss": 0.4209, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.8103041648864746, - "rewards/margins": 1.2513482570648193, - "rewards/rejected": -4.061652660369873, + "logits/chosen": -1.2928217649459839, + "logits/rejected": -1.2739886045455933, + "logps/chosen": -544.60205078125, + "logps/rejected": -657.6286010742188, + "loss": 0.4198, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.8306102752685547, + "rewards/margins": 1.2527996301651, + "rewards/rejected": -4.083409786224365, "step": 3680 }, { "epoch": 0.97, - "grad_norm": 8.375, + "grad_norm": 8.8125, "learning_rate": 1.789047789459375e-08, - "logits/chosen": -1.488208532333374, - "logits/rejected": -1.2888844013214111, - "logps/chosen": -593.5070190429688, - "logps/rejected": -661.3151245117188, - "loss": 0.5217, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.765303373336792, - "rewards/margins": 1.168646216392517, - "rewards/rejected": -3.9339499473571777, + "logits/chosen": -1.3845082521438599, + "logits/rejected": -1.1975808143615723, + "logps/chosen": -600.7689819335938, + "logps/rejected": -661.3809204101562, + "loss": 0.5254, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.837923049926758, + "rewards/margins": 1.0966850519180298, + "rewards/rejected": -3.934607744216919, "step": 3690 }, { "epoch": 0.97, - "grad_norm": 7.65625, + "grad_norm": 6.875, "learning_rate": 1.5266035279088708e-08, - "logits/chosen": -1.322689414024353, - "logits/rejected": -1.1827501058578491, - "logps/chosen": -589.371826171875, - "logps/rejected": -676.3383178710938, - "loss": 0.4797, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.927542209625244, - "rewards/margins": 1.131638765335083, - "rewards/rejected": -4.059180736541748, + "logits/chosen": -1.2211766242980957, + "logits/rejected": -1.0900758504867554, + "logps/chosen": -589.4644775390625, + "logps/rejected": -672.6224975585938, + "loss": 0.4945, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.928469181060791, + "rewards/margins": 1.0935529470443726, + "rewards/rejected": -4.022022247314453, "step": 3700 }, { "epoch": 0.97, - "eval_logits/chosen": -1.3342877626419067, - "eval_logits/rejected": -1.2075273990631104, - "eval_logps/chosen": -543.0763549804688, - "eval_logps/rejected": -635.919189453125, - "eval_loss": 0.4895748794078827, - "eval_rewards/accuracies": 0.7434999942779541, - "eval_rewards/chosen": -2.784243583679199, - "eval_rewards/margins": 1.1288617849349976, - "eval_rewards/rejected": -3.9131054878234863, - "eval_runtime": 385.4925, - "eval_samples_per_second": 5.188, + "eval_logits/chosen": -1.252778172492981, + "eval_logits/rejected": -1.1321126222610474, + "eval_logps/chosen": -545.6665649414062, + "eval_logps/rejected": -635.412353515625, + "eval_loss": 0.486397385597229, + "eval_rewards/accuracies": 0.7400000095367432, + "eval_rewards/chosen": -2.8101449012756348, + "eval_rewards/margins": 1.0978920459747314, + "eval_rewards/rejected": -3.9080374240875244, + "eval_runtime": 385.1334, + "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.649, "step": 3700 }, { "epoch": 0.97, - "grad_norm": 16.0, + "grad_norm": 12.1875, "learning_rate": 1.2849067234584623e-08, - "logits/chosen": -1.3140819072723389, - "logits/rejected": -1.2478830814361572, - "logps/chosen": -514.704833984375, - "logps/rejected": -628.93017578125, - "loss": 0.4821, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.7834410667419434, - "rewards/margins": 1.2183729410171509, - "rewards/rejected": -4.001813888549805, + "logits/chosen": -1.2232288122177124, + "logits/rejected": -1.1615407466888428, + "logps/chosen": -513.3945922851562, + "logps/rejected": -625.0435791015625, + "loss": 0.4715, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7703394889831543, + "rewards/margins": 1.192608118057251, + "rewards/rejected": -3.9629478454589844, "step": 3710 }, { "epoch": 0.97, - "grad_norm": 15.1875, + "grad_norm": 11.1875, "learning_rate": 1.0639775577218625e-08, - "logits/chosen": -1.3101227283477783, - "logits/rejected": -1.1396509408950806, - "logps/chosen": -528.97265625, - "logps/rejected": -609.4863891601562, + "logits/chosen": -1.2154030799865723, + "logits/rejected": -1.0470209121704102, + "logps/chosen": -534.564453125, + "logps/rejected": -611.8311767578125, "loss": 0.5179, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.7978572845458984, - "rewards/margins": 1.1644933223724365, - "rewards/rejected": -3.962350368499756, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8537747859954834, + "rewards/margins": 1.1320233345031738, + "rewards/rejected": -3.985797882080078, "step": 3720 }, { "epoch": 0.98, - "grad_norm": 11.5625, + "grad_norm": 10.3125, "learning_rate": 8.638344782207486e-09, - "logits/chosen": -1.3434375524520874, - "logits/rejected": -1.2269071340560913, - "logps/chosen": -513.5137939453125, - "logps/rejected": -601.6809692382812, - "loss": 0.4808, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.6822597980499268, - "rewards/margins": 1.1096186637878418, - "rewards/rejected": -3.7918784618377686, + "logits/chosen": -1.2473368644714355, + "logits/rejected": -1.1350939273834229, + "logps/chosen": -516.2252197265625, + "logps/rejected": -601.020263671875, + "loss": 0.4856, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.709373950958252, + "rewards/margins": 1.07589852809906, + "rewards/rejected": -3.7852725982666016, "step": 3730 }, { "epoch": 0.98, - "grad_norm": 9.875, + "grad_norm": 9.3125, "learning_rate": 6.84494196844715e-09, - "logits/chosen": -1.3948211669921875, - "logits/rejected": -1.2849534749984741, - "logps/chosen": -548.9085083007812, - "logps/rejected": -666.4553833007812, - "loss": 0.4667, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.7680201530456543, - "rewards/margins": 1.2913219928741455, - "rewards/rejected": -4.059341907501221, + "logits/chosen": -1.2988349199295044, + "logits/rejected": -1.1963183879852295, + "logps/chosen": -549.7848510742188, + "logps/rejected": -670.5635986328125, + "loss": 0.4567, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7767837047576904, + "rewards/margins": 1.3236409425735474, + "rewards/rejected": -4.100424766540527, "step": 3740 }, { "epoch": 0.98, - "grad_norm": 8.875, + "grad_norm": 7.96875, "learning_rate": 5.259716884556121e-09, - "logits/chosen": -1.4579029083251953, - "logits/rejected": -1.3110452890396118, - "logps/chosen": -538.3919067382812, - "logps/rejected": -639.5615234375, - "loss": 0.4595, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.739250659942627, - "rewards/margins": 1.1599111557006836, - "rewards/rejected": -3.8991622924804688, + "logits/chosen": -1.3606340885162354, + "logits/rejected": -1.224469780921936, + "logps/chosen": -543.1236572265625, + "logps/rejected": -640.4927978515625, + "loss": 0.4694, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7865681648254395, + "rewards/margins": 1.1219072341918945, + "rewards/rejected": -3.908475399017334, "step": 3750 }, { "epoch": 0.98, - "grad_norm": 8.8125, + "grad_norm": 9.0, "learning_rate": 3.882801896372967e-09, - "logits/chosen": -1.4429559707641602, - "logits/rejected": -1.3740085363388062, - "logps/chosen": -534.4335327148438, - "logps/rejected": -618.3828735351562, - "loss": 0.4868, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.6907172203063965, - "rewards/margins": 1.1578295230865479, - "rewards/rejected": -3.8485465049743652, + "logits/chosen": -1.3460079431533813, + "logits/rejected": -1.2832306623458862, + "logps/chosen": -539.73583984375, + "logps/rejected": -619.1431884765625, + "loss": 0.4913, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7437405586242676, + "rewards/margins": 1.1124091148376465, + "rewards/rejected": -3.8561501502990723, "step": 3760 }, { "epoch": 0.99, - "grad_norm": 8.625, + "grad_norm": 12.8125, "learning_rate": 2.7143119759026614e-09, - "logits/chosen": -1.4640624523162842, - "logits/rejected": -1.2803726196289062, - "logps/chosen": -557.8724975585938, - "logps/rejected": -646.456298828125, - "loss": 0.4267, + "logits/chosen": -1.3690940141677856, + "logits/rejected": -1.1916528940200806, + "logps/chosen": -560.9449462890625, + "logps/rejected": -645.1325073242188, + "loss": 0.4329, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.7360005378723145, - "rewards/margins": 1.1408417224884033, - "rewards/rejected": -3.8768420219421387, + "rewards/chosen": -2.7667250633239746, + "rewards/margins": 1.096879482269287, + "rewards/rejected": -3.8636043071746826, "step": 3770 }, { "epoch": 0.99, - "grad_norm": 11.125, + "grad_norm": 7.65625, "learning_rate": 1.754344691717591e-09, - "logits/chosen": -1.3610948324203491, - "logits/rejected": -1.3128563165664673, - "logps/chosen": -533.3128662109375, - "logps/rejected": -645.0467529296875, - "loss": 0.5359, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.835211992263794, - "rewards/margins": 0.896623432636261, - "rewards/rejected": -3.731835126876831, + "logits/chosen": -1.2690956592559814, + "logits/rejected": -1.2165257930755615, + "logps/chosen": -535.2860107421875, + "logps/rejected": -644.5172119140625, + "loss": 0.5302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8549444675445557, + "rewards/margins": 0.8715957403182983, + "rewards/rejected": -3.7265400886535645, "step": 3780 }, { "epoch": 0.99, - "grad_norm": 9.9375, + "grad_norm": 10.9375, "learning_rate": 1.0029802008096335e-09, - "logits/chosen": -1.3813669681549072, - "logits/rejected": -1.2357242107391357, - "logps/chosen": -551.9505615234375, - "logps/rejected": -645.6619262695312, - "loss": 0.4777, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.7765860557556152, - "rewards/margins": 1.1649653911590576, - "rewards/rejected": -3.941551685333252, + "logits/chosen": -1.2903029918670654, + "logits/rejected": -1.1491575241088867, + "logps/chosen": -556.7200927734375, + "logps/rejected": -649.4456176757812, + "loss": 0.4709, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.824282169342041, + "rewards/margins": 1.1551063060760498, + "rewards/rejected": -3.979388475418091, "step": 3790 }, { "epoch": 0.99, - "grad_norm": 11.0, + "grad_norm": 9.625, "learning_rate": 4.602812418974534e-10, - "logits/chosen": -1.48513925075531, - "logits/rejected": -1.3569138050079346, - "logps/chosen": -561.6650390625, - "logps/rejected": -650.4069213867188, - "loss": 0.5092, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.7982420921325684, - "rewards/margins": 1.1328747272491455, - "rewards/rejected": -3.931116819381714, + "logits/chosen": -1.3866922855377197, + "logits/rejected": -1.2608470916748047, + "logps/chosen": -567.4412231445312, + "logps/rejected": -653.213623046875, + "loss": 0.5013, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.856003999710083, + "rewards/margins": 1.1031793355941772, + "rewards/rejected": -3.9591832160949707, "step": 3800 }, { "epoch": 0.99, - "eval_logits/chosen": -1.33402681350708, - "eval_logits/rejected": -1.207356572151184, - "eval_logps/chosen": -543.0327758789062, - "eval_logps/rejected": -635.90625, - "eval_loss": 0.48944273591041565, - "eval_rewards/accuracies": 0.7444999814033508, - "eval_rewards/chosen": -2.7838070392608643, - "eval_rewards/margins": 1.1291695833206177, - "eval_rewards/rejected": -3.9129767417907715, - "eval_runtime": 385.4448, - "eval_samples_per_second": 5.189, + "eval_logits/chosen": -1.2524324655532837, + "eval_logits/rejected": -1.1317205429077148, + "eval_logps/chosen": -545.9131469726562, + "eval_logps/rejected": -635.618408203125, + "eval_loss": 0.48637571930885315, + "eval_rewards/accuracies": 0.7394999861717224, + "eval_rewards/chosen": -2.8126115798950195, + "eval_rewards/margins": 1.0974864959716797, + "eval_rewards/rejected": -3.910098075866699, + "eval_runtime": 385.0016, + "eval_samples_per_second": 5.195, "eval_steps_per_second": 0.649, "step": 3800 }, { "epoch": 1.0, - "grad_norm": 11.25, + "grad_norm": 10.4375, "learning_rate": 1.2629313018819312e-10, - "logits/chosen": -1.4069092273712158, - "logits/rejected": -1.28346848487854, - "logps/chosen": -526.5909423828125, - "logps/rejected": -611.5968017578125, - "loss": 0.5062, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.7411856651306152, - "rewards/margins": 1.006593942642212, - "rewards/rejected": -3.7477798461914062, + "logits/chosen": -1.312417984008789, + "logits/rejected": -1.191304326057434, + "logps/chosen": -529.3787231445312, + "logps/rejected": -611.1173095703125, + "loss": 0.5046, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7690625190734863, + "rewards/margins": 0.9739207029342651, + "rewards/rejected": -3.742983341217041, "step": 3810 }, { "epoch": 1.0, - "grad_norm": 20.25, + "grad_norm": 21.25, "learning_rate": 1.0437535929996855e-12, - "logits/chosen": -1.399791955947876, - "logits/rejected": -1.2320655584335327, - "logps/chosen": -567.6913452148438, - "logps/rejected": -660.8294067382812, - "loss": 0.4642, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.8092477321624756, - "rewards/margins": 1.3779563903808594, - "rewards/rejected": -4.187203884124756, + "logits/chosen": -1.306217908859253, + "logits/rejected": -1.1442514657974243, + "logps/chosen": -570.4493408203125, + "logps/rejected": -660.94970703125, + "loss": 0.4569, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.836827516555786, + "rewards/margins": 1.3515799045562744, + "rewards/rejected": -4.188406944274902, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, - "train_loss": 0.521260229900413, - "train_runtime": 42446.6624, - "train_samples_per_second": 1.44, - "train_steps_per_second": 0.09 + "train_loss": 0.5238101308459981, + "train_runtime": 42749.2467, + "train_samples_per_second": 1.43, + "train_steps_per_second": 0.089 } ], "logging_steps": 10,