diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -27,663 +27,663 @@ "epoch": 0.0, "grad_norm": 0.271484375, "learning_rate": 8.605851979345955e-08, - "logits/chosen": -3.500865936279297, - "logits/rejected": -3.4951529502868652, - "logps/chosen": -61.67301559448242, - "logps/rejected": -57.93098449707031, + "logits/chosen": -3.501009464263916, + "logits/rejected": -3.495290756225586, + "logps/chosen": -61.66798400878906, + "logps/rejected": -57.93266296386719, "loss": 0.693, "rewards/accuracies": 0.4861111044883728, - "rewards/chosen": 0.00022746472677681595, - "rewards/margins": 0.0002776283654384315, - "rewards/rejected": -5.016366776544601e-05, + "rewards/chosen": 0.00027778727235272527, + "rewards/margins": 0.0003447102790232748, + "rewards/rejected": -6.692303577437997e-05, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.2490234375, "learning_rate": 1.721170395869191e-07, - "logits/chosen": -3.529344081878662, - "logits/rejected": -3.5275402069091797, - "logps/chosen": -63.89023971557617, - "logps/rejected": -61.82512664794922, - "loss": 0.6931, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 8.464425627607852e-05, - "rewards/margins": 8.538198744645342e-05, - "rewards/rejected": -7.377296924460097e-07, + "logits/chosen": -3.5296554565429688, + "logits/rejected": -3.5278468132019043, + "logps/chosen": -63.88775634765625, + "logps/rejected": -61.84159469604492, + "loss": 0.693, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.00010944288078462705, + "rewards/margins": 0.00027477304683998227, + "rewards/rejected": -0.0001653301587793976, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.302734375, "learning_rate": 2.5817555938037866e-07, - "logits/chosen": -3.519536256790161, - "logits/rejected": -3.5142159461975098, - "logps/chosen": -67.57096099853516, - "logps/rejected": -66.90937042236328, - "loss": 0.6933, + "logits/chosen": -3.5192017555236816, + "logits/rejected": -3.5138943195343018, + "logps/chosen": -67.57090759277344, + "logps/rejected": -66.92351531982422, + "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": 4.063212691107765e-05, - "rewards/margins": -0.0002516761014703661, - "rewards/rejected": 0.0002923081920016557, + "rewards/chosen": 4.100705336895771e-05, + "rewards/margins": -0.0001098484281101264, + "rewards/rejected": 0.00015085548511706293, "step": 30 }, { "epoch": 0.01, - "grad_norm": 0.23828125, + "grad_norm": 0.2392578125, "learning_rate": 3.442340791738382e-07, - "logits/chosen": -3.505084276199341, - "logits/rejected": -3.4988949298858643, - "logps/chosen": -66.155517578125, - "logps/rejected": -58.66449737548828, + "logits/chosen": -3.5046753883361816, + "logits/rejected": -3.498481273651123, + "logps/chosen": -66.16209411621094, + "logps/rejected": -58.65666580200195, "loss": 0.6932, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.0003996006562374532, - "rewards/margins": -2.4692481019883417e-05, - "rewards/rejected": 0.0004242931609041989, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00033376910141669214, + "rewards/margins": -0.00016887595120351762, + "rewards/rejected": 0.0005026450380682945, "step": 40 }, { "epoch": 0.01, - "grad_norm": 0.251953125, + "grad_norm": 0.25390625, "learning_rate": 4.302925989672978e-07, - "logits/chosen": -3.510054111480713, - "logits/rejected": -3.505955457687378, - "logps/chosen": -62.5504150390625, - "logps/rejected": -58.739418029785156, - "loss": 0.6931, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": 0.0005882784607820213, - "rewards/margins": 7.251681381603703e-05, - "rewards/rejected": 0.0005157616687938571, + "logits/chosen": -3.509906053543091, + "logits/rejected": -3.5057880878448486, + "logps/chosen": -62.557472229003906, + "logps/rejected": -58.73368453979492, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.0005178075516596437, + "rewards/margins": -5.5213680752785876e-05, + "rewards/rejected": 0.000573021243326366, "step": 50 }, { "epoch": 0.01, - "grad_norm": 0.3046875, + "grad_norm": 0.302734375, "learning_rate": 5.163511187607573e-07, - "logits/chosen": -3.532723903656006, - "logits/rejected": -3.5259487628936768, - "logps/chosen": -67.88531494140625, - "logps/rejected": -62.6588134765625, - "loss": 0.6931, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": 0.0007649865001440048, - "rewards/margins": 1.9654471543617547e-06, - "rewards/rejected": 0.0007630210602656007, + "logits/chosen": -3.5339508056640625, + "logits/rejected": -3.5271873474121094, + "logps/chosen": -67.87001037597656, + "logps/rejected": -62.6706428527832, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0009180423803627491, + "rewards/margins": 0.0002732599969021976, + "rewards/rejected": 0.0006447824416682124, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.2578125, "learning_rate": 6.024096385542169e-07, - "logits/chosen": -3.5082130432128906, - "logits/rejected": -3.5031533241271973, - "logps/chosen": -64.43404388427734, + "logits/chosen": -3.5073513984680176, + "logits/rejected": -3.502321243286133, + "logps/chosen": -64.41812133789062, "logps/rejected": -61.06683349609375, - "loss": 0.6931, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0014903316041454673, - "rewards/margins": 9.859764395514503e-05, - "rewards/rejected": 0.0013917339965701103, + "loss": 0.693, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0016494952142238617, + "rewards/margins": 0.0002577772247605026, + "rewards/rejected": 0.0013917179312556982, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.2734375, "learning_rate": 6.884681583476764e-07, - "logits/chosen": -3.543382167816162, - "logits/rejected": -3.536128282546997, - "logps/chosen": -69.94771575927734, - "logps/rejected": -62.211883544921875, - "loss": 0.6929, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0019744255114346743, - "rewards/margins": 0.0004983833059668541, - "rewards/rejected": 0.0014760419726371765, + "logits/chosen": -3.543259859085083, + "logits/rejected": -3.535987138748169, + "logps/chosen": -69.9263687133789, + "logps/rejected": -62.202117919921875, + "loss": 0.6928, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.002187983598560095, + "rewards/margins": 0.0006142753991298378, + "rewards/rejected": 0.001573708257637918, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.26171875, "learning_rate": 7.745266781411361e-07, - "logits/chosen": -3.510272264480591, - "logits/rejected": -3.5065231323242188, - "logps/chosen": -64.821044921875, - "logps/rejected": -60.5690803527832, + "logits/chosen": -3.5097403526306152, + "logits/rejected": -3.505981922149658, + "logps/chosen": -64.8249282836914, + "logps/rejected": -60.57047653198242, "loss": 0.693, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.002713029272854328, - "rewards/margins": 0.00037466129288077354, - "rewards/rejected": 0.0023383679799735546, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.002674184273928404, + "rewards/margins": 0.00034979888005182147, + "rewards/rejected": 0.002324385568499565, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.263671875, "learning_rate": 8.605851979345956e-07, - "logits/chosen": -3.534849166870117, - "logits/rejected": -3.5216782093048096, - "logps/chosen": -66.38619232177734, - "logps/rejected": -57.2902717590332, - "loss": 0.6926, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.003791350871324539, - "rewards/margins": 0.0010863130446523428, - "rewards/rejected": 0.0027050375938415527, + "logits/chosen": -3.5349838733673096, + "logits/rejected": -3.5218091011047363, + "logps/chosen": -66.39881134033203, + "logps/rejected": -57.2912712097168, + "loss": 0.6927, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.0036652986891567707, + "rewards/margins": 0.0009702268871478736, + "rewards/rejected": 0.0026950715109705925, "step": 100 }, { "epoch": 0.02, - "eval_logits/chosen": -3.4979639053344727, - "eval_logits/rejected": -3.4962334632873535, - "eval_logps/chosen": -70.9185791015625, - "eval_logps/rejected": -74.63919830322266, - "eval_loss": 0.6930303573608398, - "eval_rewards/accuracies": 0.5192843675613403, - "eval_rewards/chosen": 0.004914432298392057, - "eval_rewards/margins": 0.00023599098494742066, - "eval_rewards/rejected": 0.00467844121158123, - "eval_runtime": 486.9073, - "eval_samples_per_second": 8.839, - "eval_steps_per_second": 1.105, + "eval_logits/chosen": -3.4983110427856445, + "eval_logits/rejected": -3.496581554412842, + "eval_logps/chosen": -70.91754913330078, + "eval_logps/rejected": -74.6415786743164, + "eval_loss": 0.6930133700370789, + "eval_rewards/accuracies": 0.5204461216926575, + "eval_rewards/chosen": 0.004924696870148182, + "eval_rewards/margins": 0.0002700270852074027, + "eval_rewards/rejected": 0.004654669668525457, + "eval_runtime": 486.5221, + "eval_samples_per_second": 8.846, + "eval_steps_per_second": 1.106, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.291015625, "learning_rate": 9.466437177280551e-07, - "logits/chosen": -3.5418262481689453, - "logits/rejected": -3.537301540374756, - "logps/chosen": -66.12630462646484, - "logps/rejected": -60.84473419189453, - "loss": 0.6928, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.0047551849856972694, - "rewards/margins": 0.0006528754602186382, - "rewards/rejected": 0.004102309234440327, + "logits/chosen": -3.5417017936706543, + "logits/rejected": -3.537182331085205, + "logps/chosen": -66.14574432373047, + "logps/rejected": -60.83588790893555, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.004560687579214573, + "rewards/margins": 0.0003698725195135921, + "rewards/rejected": 0.004190815147012472, "step": 110 }, { "epoch": 0.02, - "grad_norm": 0.2890625, + "grad_norm": 0.287109375, "learning_rate": 1.0327022375215146e-06, - "logits/chosen": -3.5191383361816406, - "logits/rejected": -3.515841007232666, - "logps/chosen": -62.40716552734375, - "logps/rejected": -63.18220138549805, + "logits/chosen": -3.5192337036132812, + "logits/rejected": -3.5158679485321045, + "logps/chosen": -62.41202926635742, + "logps/rejected": -63.18855667114258, "loss": 0.6929, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.004575005732476711, - "rewards/margins": 0.0005694165593013167, - "rewards/rejected": 0.004005589056760073, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.004526413977146149, + "rewards/margins": 0.0005844075931236148, + "rewards/rejected": 0.003942006267607212, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.251953125, "learning_rate": 1.1187607573149743e-06, - "logits/chosen": -3.5376014709472656, - "logits/rejected": -3.5348377227783203, - "logps/chosen": -66.56596374511719, - "logps/rejected": -61.85821533203125, - "loss": 0.6928, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.005736568011343479, - "rewards/margins": 0.0006158509640954435, - "rewards/rejected": 0.0051207165233790874, + "logits/chosen": -3.538072109222412, + "logits/rejected": -3.535295009613037, + "logps/chosen": -66.57141876220703, + "logps/rejected": -61.85688018798828, + "loss": 0.6929, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0056819794699549675, + "rewards/margins": 0.0005479130195453763, + "rewards/rejected": 0.005134066101163626, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.27734375, "learning_rate": 1.2048192771084338e-06, - "logits/chosen": -3.5332303047180176, - "logits/rejected": -3.528048038482666, - "logps/chosen": -63.472206115722656, - "logps/rejected": -60.507774353027344, - "loss": 0.6928, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.005915635731071234, - "rewards/margins": 0.0006704704137519002, - "rewards/rejected": 0.005245164968073368, + "logits/chosen": -3.5338687896728516, + "logits/rejected": -3.5286478996276855, + "logps/chosen": -63.46132278442383, + "logps/rejected": -60.513450622558594, + "loss": 0.6927, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.006024450063705444, + "rewards/margins": 0.0008360937936231494, + "rewards/rejected": 0.005188356153666973, "step": 140 }, { "epoch": 0.03, - "grad_norm": 0.26953125, + "grad_norm": 0.271484375, "learning_rate": 1.2908777969018935e-06, - "logits/chosen": -3.506129026412964, - "logits/rejected": -3.502753496170044, - "logps/chosen": -62.16828155517578, - "logps/rejected": -59.287452697753906, - "loss": 0.6925, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.00653434032574296, - "rewards/margins": 0.0012093739351257682, - "rewards/rejected": 0.00532496627420187, + "logits/chosen": -3.507450819015503, + "logits/rejected": -3.5040555000305176, + "logps/chosen": -62.18046951293945, + "logps/rejected": -59.2562141418457, + "loss": 0.6928, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.006412480026483536, + "rewards/margins": 0.0007751357043161988, + "rewards/rejected": 0.005637344904243946, "step": 150 }, { "epoch": 0.03, "grad_norm": 0.2431640625, "learning_rate": 1.3769363166953528e-06, - "logits/chosen": -3.504394054412842, - "logits/rejected": -3.5007827281951904, - "logps/chosen": -63.689369201660156, - "logps/rejected": -62.33386993408203, + "logits/chosen": -3.505376100540161, + "logits/rejected": -3.5017974376678467, + "logps/chosen": -63.6754150390625, + "logps/rejected": -62.3272590637207, "loss": 0.6927, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.007361248135566711, - "rewards/margins": 0.0008054021745920181, - "rewards/rejected": 0.006555846426635981, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.007500818930566311, + "rewards/margins": 0.000878830614965409, + "rewards/rejected": 0.006621988955885172, "step": 160 }, { "epoch": 0.03, "grad_norm": 0.2734375, "learning_rate": 1.4629948364888125e-06, - "logits/chosen": -3.5105483531951904, - "logits/rejected": -3.506767749786377, - "logps/chosen": -64.63548278808594, - "logps/rejected": -58.397705078125, - "loss": 0.6925, + "logits/chosen": -3.510266065597534, + "logits/rejected": -3.506471633911133, + "logps/chosen": -64.6450424194336, + "logps/rejected": -58.39984130859375, + "loss": 0.6926, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.008624577894806862, - "rewards/margins": 0.00120867311488837, - "rewards/rejected": 0.007415904197841883, + "rewards/chosen": 0.008529067039489746, + "rewards/margins": 0.0011345654493197799, + "rewards/rejected": 0.007394501473754644, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.29296875, "learning_rate": 1.5490533562822722e-06, - "logits/chosen": -3.5382277965545654, - "logits/rejected": -3.5333220958709717, - "logps/chosen": -64.91260528564453, - "logps/rejected": -59.960670471191406, + "logits/chosen": -3.538785219192505, + "logits/rejected": -3.533876895904541, + "logps/chosen": -64.92842864990234, + "logps/rejected": -59.96929168701172, "loss": 0.6927, "rewards/accuracies": 0.5625, - "rewards/chosen": 0.00788290984928608, - "rewards/margins": 0.0009173898724839091, - "rewards/rejected": 0.006965520326048136, + "rewards/chosen": 0.00772461574524641, + "rewards/margins": 0.0008454096387140453, + "rewards/rejected": 0.0068792058154940605, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.306640625, "learning_rate": 1.6351118760757316e-06, - "logits/chosen": -3.517756700515747, - "logits/rejected": -3.5076632499694824, - "logps/chosen": -68.32835388183594, - "logps/rejected": -59.00849151611328, - "loss": 0.6916, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.010297342203557491, - "rewards/margins": 0.003029712475836277, - "rewards/rejected": 0.007267629262059927, + "logits/chosen": -3.518183946609497, + "logits/rejected": -3.5080676078796387, + "logps/chosen": -68.33369445800781, + "logps/rejected": -59.007720947265625, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.010243936441838741, + "rewards/margins": 0.0029686293564736843, + "rewards/rejected": 0.007275307085365057, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.28125, "learning_rate": 1.7211703958691911e-06, - "logits/chosen": -3.505796432495117, - "logits/rejected": -3.5029215812683105, - "logps/chosen": -63.019500732421875, - "logps/rejected": -61.403404235839844, - "loss": 0.6919, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.010612553916871548, - "rewards/margins": 0.0024268892593681812, - "rewards/rejected": 0.008185665123164654, + "logits/chosen": -3.5061545372009277, + "logits/rejected": -3.5032687187194824, + "logps/chosen": -63.0229606628418, + "logps/rejected": -61.38782501220703, + "loss": 0.692, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.01057803351432085, + "rewards/margins": 0.00223658699542284, + "rewards/rejected": 0.008341444656252861, "step": 200 }, { "epoch": 0.03, - "eval_logits/chosen": -3.49253511428833, - "eval_logits/rejected": -3.4908037185668945, - "eval_logps/chosen": -69.95048522949219, - "eval_logps/rejected": -73.75398254394531, - "eval_loss": 0.6926223635673523, - "eval_rewards/accuracies": 0.5678438544273376, - "eval_rewards/chosen": 0.014595309272408485, - "eval_rewards/margins": 0.0010647153249010444, - "eval_rewards/rejected": 0.013530593365430832, - "eval_runtime": 483.8947, - "eval_samples_per_second": 8.894, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.494154214859009, + "eval_logits/rejected": -3.492429733276367, + "eval_logps/chosen": -69.94579315185547, + "eval_logps/rejected": -73.75847625732422, + "eval_loss": 0.6925765872001648, + "eval_rewards/accuracies": 0.5615706443786621, + "eval_rewards/chosen": 0.014642315916717052, + "eval_rewards/margins": 0.001156591926701367, + "eval_rewards/rejected": 0.013485724106431007, + "eval_runtime": 484.1845, + "eval_samples_per_second": 8.889, + "eval_steps_per_second": 1.111, "step": 200 }, { "epoch": 0.04, "grad_norm": 0.259765625, "learning_rate": 1.8072289156626508e-06, - "logits/chosen": -3.5087637901306152, - "logits/rejected": -3.5072245597839355, - "logps/chosen": -62.840614318847656, - "logps/rejected": -63.97832107543945, - "loss": 0.6918, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.011572018265724182, - "rewards/margins": 0.0027770684100687504, - "rewards/rejected": 0.008794950321316719, + "logits/chosen": -3.5086097717285156, + "logits/rejected": -3.5070700645446777, + "logps/chosen": -62.868133544921875, + "logps/rejected": -63.98527908325195, + "loss": 0.6919, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.011296862736344337, + "rewards/margins": 0.0025715038646012545, + "rewards/rejected": 0.008725358173251152, "step": 210 }, { "epoch": 0.04, "grad_norm": 0.2578125, "learning_rate": 1.8932874354561103e-06, - "logits/chosen": -3.502013683319092, - "logits/rejected": -3.494791030883789, - "logps/chosen": -62.28466796875, - "logps/rejected": -58.1067008972168, - "loss": 0.6912, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.014032213017344475, - "rewards/margins": 0.0038597029633820057, - "rewards/rejected": 0.010172510519623756, + "logits/chosen": -3.5019657611846924, + "logits/rejected": -3.4947357177734375, + "logps/chosen": -62.300025939941406, + "logps/rejected": -58.14155960083008, + "loss": 0.6911, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.013878649100661278, + "rewards/margins": 0.0040546623058617115, + "rewards/rejected": 0.009823987260460854, "step": 220 }, { "epoch": 0.04, - "grad_norm": 0.271484375, + "grad_norm": 0.26953125, "learning_rate": 1.9793459552495696e-06, - "logits/chosen": -3.515824794769287, - "logits/rejected": -3.512291431427002, - "logps/chosen": -57.69233322143555, - "logps/rejected": -56.74806594848633, - "loss": 0.6917, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.014439724385738373, - "rewards/margins": 0.0030108988285064697, - "rewards/rejected": 0.011428825557231903, + "logits/chosen": -3.5150229930877686, + "logits/rejected": -3.511476993560791, + "logps/chosen": -57.71666717529297, + "logps/rejected": -56.743553161621094, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014196398667991161, + "rewards/margins": 0.002722408389672637, + "rewards/rejected": 0.01147399004548788, "step": 230 }, { "epoch": 0.04, "grad_norm": 0.255859375, "learning_rate": 2.0654044750430293e-06, - "logits/chosen": -3.5206668376922607, - "logits/rejected": -3.5107674598693848, - "logps/chosen": -64.91423034667969, - "logps/rejected": -59.14391326904297, - "loss": 0.6907, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.016033673658967018, - "rewards/margins": 0.004976017866283655, - "rewards/rejected": 0.01105765625834465, + "logits/chosen": -3.5202598571777344, + "logits/rejected": -3.5103707313537598, + "logps/chosen": -64.93174743652344, + "logps/rejected": -59.1458854675293, + "loss": 0.6908, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.015858512371778488, + "rewards/margins": 0.004820539150387049, + "rewards/rejected": 0.011037970893085003, "step": 240 }, { "epoch": 0.04, - "grad_norm": 0.271484375, + "grad_norm": 0.26953125, "learning_rate": 2.151462994836489e-06, - "logits/chosen": -3.505598783493042, - "logits/rejected": -3.502315044403076, - "logps/chosen": -61.88445281982422, - "logps/rejected": -57.96600341796875, + "logits/chosen": -3.5057454109191895, + "logits/rejected": -3.502401828765869, + "logps/chosen": -61.890663146972656, + "logps/rejected": -57.9714469909668, "loss": 0.6914, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.018100310117006302, - "rewards/margins": 0.003605439094826579, - "rewards/rejected": 0.014494871720671654, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.01803828775882721, + "rewards/margins": 0.003597863484174013, + "rewards/rejected": 0.014440424740314484, "step": 250 }, { "epoch": 0.04, "grad_norm": 0.271484375, "learning_rate": 2.2375215146299486e-06, - "logits/chosen": -3.5090572834014893, - "logits/rejected": -3.5079474449157715, - "logps/chosen": -63.0159912109375, - "logps/rejected": -62.857337951660156, - "loss": 0.6894, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.018999049440026283, - "rewards/margins": 0.007648964412510395, - "rewards/rejected": 0.011350083164870739, + "logits/chosen": -3.509375810623169, + "logits/rejected": -3.5082786083221436, + "logps/chosen": -63.0054931640625, + "logps/rejected": -62.8714485168457, + "loss": 0.6892, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.019104022532701492, + "rewards/margins": 0.007894990965723991, + "rewards/rejected": 0.011209032498300076, "step": 260 }, { "epoch": 0.05, "grad_norm": 0.267578125, "learning_rate": 2.323580034423408e-06, - "logits/chosen": -3.501816987991333, - "logits/rejected": -3.497560501098633, - "logps/chosen": -60.06858444213867, - "logps/rejected": -59.70167922973633, - "loss": 0.69, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.02066418156027794, - "rewards/margins": 0.0063255527056753635, - "rewards/rejected": 0.014338627457618713, + "logits/chosen": -3.5024642944335938, + "logits/rejected": -3.49824595451355, + "logps/chosen": -60.08552169799805, + "logps/rejected": -59.7121467590332, + "loss": 0.6901, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.020494792610406876, + "rewards/margins": 0.00626087561249733, + "rewards/rejected": 0.014233916997909546, "step": 270 }, { "epoch": 0.05, - "grad_norm": 0.28125, + "grad_norm": 0.283203125, "learning_rate": 2.4096385542168676e-06, - "logits/chosen": -3.5297698974609375, - "logits/rejected": -3.52246356010437, - "logps/chosen": -66.10899353027344, - "logps/rejected": -59.8133430480957, - "loss": 0.6892, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.022841677069664, - "rewards/margins": 0.0080425925552845, - "rewards/rejected": 0.014799085445702076, + "logits/chosen": -3.5296006202697754, + "logits/rejected": -3.522291660308838, + "logps/chosen": -66.11088562011719, + "logps/rejected": -59.79486846923828, + "loss": 0.6893, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.022822776809334755, + "rewards/margins": 0.007838909514248371, + "rewards/rejected": 0.014983865432441235, "step": 280 }, { "epoch": 0.05, "grad_norm": 0.287109375, "learning_rate": 2.4956970740103273e-06, - "logits/chosen": -3.5089409351348877, - "logits/rejected": -3.506542682647705, - "logps/chosen": -64.64678192138672, - "logps/rejected": -60.837615966796875, + "logits/chosen": -3.5092430114746094, + "logits/rejected": -3.5068252086639404, + "logps/chosen": -64.64472198486328, + "logps/rejected": -60.8430290222168, "loss": 0.6903, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.020989563316106796, - "rewards/margins": 0.005816182587295771, - "rewards/rejected": 0.015173378400504589, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.021010177209973335, + "rewards/margins": 0.005890936590731144, + "rewards/rejected": 0.015119239687919617, "step": 290 }, { "epoch": 0.05, "grad_norm": 0.283203125, "learning_rate": 2.581755593803787e-06, - "logits/chosen": -3.509040355682373, - "logits/rejected": -3.5068492889404297, - "logps/chosen": -62.775413513183594, - "logps/rejected": -60.65922927856445, - "loss": 0.6888, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.023325210437178612, - "rewards/margins": 0.008827079087495804, - "rewards/rejected": 0.014498132281005383, + "logits/chosen": -3.508896589279175, + "logits/rejected": -3.5066845417022705, + "logps/chosen": -62.75114059448242, + "logps/rejected": -60.66050338745117, + "loss": 0.6887, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.02356800064444542, + "rewards/margins": 0.009082594886422157, + "rewards/rejected": 0.014485405758023262, "step": 300 }, { "epoch": 0.05, - "eval_logits/chosen": -3.486070156097412, - "eval_logits/rejected": -3.4843297004699707, - "eval_logps/chosen": -67.8994369506836, - "eval_logps/rejected": -72.02379608154297, - "eval_loss": 0.6910805106163025, - "eval_rewards/accuracies": 0.5748141407966614, - "eval_rewards/chosen": 0.035105764865875244, - "eval_rewards/margins": 0.004273186903446913, - "eval_rewards/rejected": 0.030832577496767044, - "eval_runtime": 483.9038, - "eval_samples_per_second": 8.894, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.4875731468200684, + "eval_logits/rejected": -3.485839605331421, + "eval_logps/chosen": -67.90235900878906, + "eval_logps/rejected": -72.03018951416016, + "eval_loss": 0.6910621523857117, + "eval_rewards/accuracies": 0.5731877088546753, + "eval_rewards/chosen": 0.03507662191987038, + "eval_rewards/margins": 0.004308138974010944, + "eval_rewards/rejected": 0.030768483877182007, + "eval_runtime": 484.3093, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.111, "step": 300 }, { "epoch": 0.05, - "grad_norm": 0.294921875, + "grad_norm": 0.29296875, "learning_rate": 2.6678141135972463e-06, - "logits/chosen": -3.5071263313293457, - "logits/rejected": -3.506284236907959, - "logps/chosen": -62.853904724121094, - "logps/rejected": -59.47548294067383, + "logits/chosen": -3.5070648193359375, + "logits/rejected": -3.506229877471924, + "logps/chosen": -62.83290481567383, + "logps/rejected": -59.43952178955078, "loss": 0.69, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.02449950948357582, - "rewards/margins": 0.006591873709112406, - "rewards/rejected": 0.01790763810276985, + "rewards/chosen": 0.024709565564990044, + "rewards/margins": 0.006442278623580933, + "rewards/rejected": 0.01826728694140911, "step": 310 }, { "epoch": 0.06, "grad_norm": 0.279296875, "learning_rate": 2.7538726333907055e-06, - "logits/chosen": -3.4970526695251465, - "logits/rejected": -3.4916062355041504, - "logps/chosen": -64.11039733886719, - "logps/rejected": -56.41526412963867, - "loss": 0.6884, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.020039405673742294, - "rewards/margins": 0.009693250991404057, - "rewards/rejected": 0.010346156544983387, + "logits/chosen": -3.497121810913086, + "logits/rejected": -3.4916274547576904, + "logps/chosen": -64.09577178955078, + "logps/rejected": -56.424560546875, + "loss": 0.6883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02018563821911812, + "rewards/margins": 0.009932487271726131, + "rewards/rejected": 0.010253149084746838, "step": 320 }, { "epoch": 0.06, "grad_norm": 0.279296875, "learning_rate": 2.8399311531841657e-06, - "logits/chosen": -3.506737232208252, - "logits/rejected": -3.502964496612549, - "logps/chosen": -63.77069091796875, - "logps/rejected": -58.83369064331055, + "logits/chosen": -3.5062835216522217, + "logits/rejected": -3.502513885498047, + "logps/chosen": -63.776573181152344, + "logps/rejected": -58.834251403808594, "loss": 0.6875, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.019164523109793663, - "rewards/margins": 0.011498978361487389, - "rewards/rejected": 0.007665542419999838, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.01910565234720707, + "rewards/margins": 0.011445741169154644, + "rewards/rejected": 0.007659909315407276, "step": 330 }, { "epoch": 0.06, "grad_norm": 0.30859375, "learning_rate": 2.925989672977625e-06, - "logits/chosen": -3.5138423442840576, - "logits/rejected": -3.509005308151245, - "logps/chosen": -60.804222106933594, - "logps/rejected": -60.57802200317383, - "loss": 0.6877, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.017904791980981827, - "rewards/margins": 0.011298349127173424, - "rewards/rejected": 0.006606444716453552, + "logits/chosen": -3.513662815093994, + "logits/rejected": -3.508847713470459, + "logps/chosen": -60.776634216308594, + "logps/rejected": -60.560829162597656, + "loss": 0.6876, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.018180642277002335, + "rewards/margins": 0.011402291245758533, + "rewards/rejected": 0.006778349168598652, "step": 340 }, { "epoch": 0.06, "grad_norm": 0.310546875, "learning_rate": 3.012048192771085e-06, - "logits/chosen": -3.4848923683166504, - "logits/rejected": -3.4764468669891357, - "logps/chosen": -65.43782806396484, - "logps/rejected": -62.749595642089844, + "logits/chosen": -3.485520124435425, + "logits/rejected": -3.477074384689331, + "logps/chosen": -65.42162322998047, + "logps/rejected": -62.7513427734375, "loss": 0.6885, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.014699401333928108, - "rewards/margins": 0.009541595354676247, - "rewards/rejected": 0.005157806910574436, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014861424453556538, + "rewards/margins": 0.009721105918288231, + "rewards/rejected": 0.005140319466590881, "step": 350 }, { "epoch": 0.06, "grad_norm": 0.310546875, "learning_rate": 3.0981067125645443e-06, - "logits/chosen": -3.5276389122009277, - "logits/rejected": -3.523733139038086, - "logps/chosen": -64.9666519165039, - "logps/rejected": -58.31694412231445, + "logits/chosen": -3.527310609817505, + "logits/rejected": -3.523397445678711, + "logps/chosen": -64.94847106933594, + "logps/rejected": -58.3104133605957, "loss": 0.6882, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.006131452973932028, - "rewards/margins": 0.010225333273410797, - "rewards/rejected": -0.004093879368156195, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.006313213612884283, + "rewards/margins": 0.010341762565076351, + "rewards/rejected": -0.004028548486530781, "step": 360 }, { "epoch": 0.06, "grad_norm": 0.314453125, "learning_rate": 3.1841652323580036e-06, - "logits/chosen": -3.5156378746032715, - "logits/rejected": -3.5106139183044434, - "logps/chosen": -64.2197494506836, - "logps/rejected": -61.08530807495117, + "logits/chosen": -3.5157413482666016, + "logits/rejected": -3.510713577270508, + "logps/chosen": -64.20075988769531, + "logps/rejected": -61.081268310546875, "loss": 0.6867, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.002987655345350504, - "rewards/margins": 0.013294967822730541, - "rewards/rejected": -0.010307312943041325, + "rewards/chosen": 0.0031775743700563908, + "rewards/margins": 0.013444487936794758, + "rewards/rejected": -0.010266912169754505, "step": 370 }, { "epoch": 0.07, "grad_norm": 0.298828125, "learning_rate": 3.2702237521514633e-06, - "logits/chosen": -3.523965358734131, - "logits/rejected": -3.5195980072021484, - "logps/chosen": -64.2259292602539, - "logps/rejected": -60.46531295776367, + "logits/chosen": -3.523954391479492, + "logits/rejected": -3.519592761993408, + "logps/chosen": -64.20637512207031, + "logps/rejected": -60.44745635986328, "loss": 0.6863, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.003907281905412674, - "rewards/margins": 0.01409735344350338, - "rewards/rejected": -0.010190071538090706, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.004102800972759724, + "rewards/margins": 0.014114337041974068, + "rewards/rejected": -0.010011536069214344, "step": 380 }, { "epoch": 0.07, "grad_norm": 0.3359375, "learning_rate": 3.356282271944923e-06, - "logits/chosen": -3.508693218231201, - "logits/rejected": -3.5038185119628906, - "logps/chosen": -67.0108413696289, - "logps/rejected": -63.60310745239258, - "loss": 0.6856, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.0033676461316645145, - "rewards/margins": 0.015611497685313225, - "rewards/rejected": -0.012243852019309998, + "logits/chosen": -3.509253978729248, + "logits/rejected": -3.5043952465057373, + "logps/chosen": -66.97331237792969, + "logps/rejected": -63.58121871948242, + "loss": 0.6855, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.0037429791409522295, + "rewards/margins": 0.015767943114042282, + "rewards/rejected": -0.012024962343275547, "step": 390 }, { "epoch": 0.07, "grad_norm": 0.296875, "learning_rate": 3.4423407917383822e-06, - "logits/chosen": -3.5082168579101562, - "logits/rejected": -3.5035948753356934, - "logps/chosen": -64.50901794433594, - "logps/rejected": -61.93944549560547, - "loss": 0.6864, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.0032909319270402193, - "rewards/margins": 0.01393433939665556, - "rewards/rejected": -0.017225272953510284, + "logits/chosen": -3.508552074432373, + "logits/rejected": -3.5039405822753906, + "logps/chosen": -64.53360748291016, + "logps/rejected": -61.95587921142578, + "loss": 0.6865, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0035368031822144985, + "rewards/margins": 0.013852817006409168, + "rewards/rejected": -0.01738962158560753, "step": 400 }, { "epoch": 0.07, - "eval_logits/chosen": -3.4826815128326416, - "eval_logits/rejected": -3.4808835983276367, - "eval_logps/chosen": -69.75044250488281, - "eval_logps/rejected": -74.32178497314453, - "eval_loss": 0.6890121698379517, - "eval_rewards/accuracies": 0.5627323389053345, - "eval_rewards/chosen": 0.016595730558037758, - "eval_rewards/margins": 0.008743190206587315, - "eval_rewards/rejected": 0.00785253755748272, - "eval_runtime": 483.6906, - "eval_samples_per_second": 8.898, + "eval_logits/chosen": -3.4804558753967285, + "eval_logits/rejected": -3.4786477088928223, + "eval_logps/chosen": -69.7676773071289, + "eval_logps/rejected": -74.33695220947266, + "eval_loss": 0.6890220046043396, + "eval_rewards/accuracies": 0.5608736276626587, + "eval_rewards/chosen": 0.016423281282186508, + "eval_rewards/margins": 0.008722393773496151, + "eval_rewards/rejected": 0.0077008879743516445, + "eval_runtime": 483.8566, + "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 400 }, @@ -691,8974 +691,8974 @@ "epoch": 0.07, "grad_norm": 0.306640625, "learning_rate": 3.528399311531842e-06, - "logits/chosen": -3.5057005882263184, - "logits/rejected": -3.505286455154419, - "logps/chosen": -60.99641036987305, - "logps/rejected": -65.09056091308594, + "logits/chosen": -3.5053210258483887, + "logits/rejected": -3.5049126148223877, + "logps/chosen": -61.0108528137207, + "logps/rejected": -65.10823059082031, "loss": 0.6871, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.007737401872873306, - "rewards/margins": 0.012748445384204388, - "rewards/rejected": -0.02048584818840027, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.007881749421358109, + "rewards/margins": 0.012780706398189068, + "rewards/rejected": -0.0206624586135149, "step": 410 }, { "epoch": 0.07, - "grad_norm": 0.3671875, + "grad_norm": 0.369140625, "learning_rate": 3.6144578313253016e-06, - "logits/chosen": -3.4838051795959473, - "logits/rejected": -3.4827370643615723, - "logps/chosen": -66.43543243408203, - "logps/rejected": -64.77276611328125, + "logits/chosen": -3.4826858043670654, + "logits/rejected": -3.4816393852233887, + "logps/chosen": -66.45621490478516, + "logps/rejected": -64.80440521240234, "loss": 0.6903, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.015771260485053062, - "rewards/margins": 0.006309186574071646, - "rewards/rejected": -0.022080447524785995, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.015979185700416565, + "rewards/margins": 0.006417684257030487, + "rewards/rejected": -0.022396868094801903, "step": 420 }, { "epoch": 0.07, - "grad_norm": 0.333984375, + "grad_norm": 0.3359375, "learning_rate": 3.700516351118761e-06, - "logits/chosen": -3.513059139251709, - "logits/rejected": -3.5096065998077393, - "logps/chosen": -65.94549560546875, - "logps/rejected": -64.00807189941406, + "logits/chosen": -3.514251232147217, + "logits/rejected": -3.5107944011688232, + "logps/chosen": -65.9752197265625, + "logps/rejected": -64.03770446777344, "loss": 0.6865, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.006407345645129681, - "rewards/margins": 0.013842855580151081, - "rewards/rejected": -0.020250199362635612, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.006704515311866999, + "rewards/margins": 0.013841964304447174, + "rewards/rejected": -0.020546479150652885, "step": 430 }, { "epoch": 0.08, - "grad_norm": 0.369140625, + "grad_norm": 0.37109375, "learning_rate": 3.7865748709122206e-06, - "logits/chosen": -3.5059781074523926, - "logits/rejected": -3.498321056365967, - "logps/chosen": -65.25879669189453, - "logps/rejected": -62.97725296020508, + "logits/chosen": -3.5058677196502686, + "logits/rejected": -3.4982612133026123, + "logps/chosen": -65.30903625488281, + "logps/rejected": -63.014625549316406, "loss": 0.6843, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.00661065150052309, - "rewards/margins": 0.018645433709025383, - "rewards/rejected": -0.02525608241558075, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007113118655979633, + "rewards/margins": 0.01851661317050457, + "rewards/rejected": -0.025629732757806778, "step": 440 }, { "epoch": 0.08, - "grad_norm": 0.322265625, + "grad_norm": 0.32421875, "learning_rate": 3.87263339070568e-06, - "logits/chosen": -3.4879813194274902, - "logits/rejected": -3.4803619384765625, - "logps/chosen": -66.70152282714844, - "logps/rejected": -66.2663803100586, + "logits/chosen": -3.4888668060302734, + "logits/rejected": -3.481332302093506, + "logps/chosen": -66.73682403564453, + "logps/rejected": -66.31524658203125, "loss": 0.6814, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.0003645656688604504, - "rewards/margins": 0.0243084616959095, - "rewards/rejected": -0.02394389547407627, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.1589378118515015e-05, + "rewards/margins": 0.02444417215883732, + "rewards/rejected": -0.024432582780718803, "step": 450 }, { "epoch": 0.08, "grad_norm": 0.34375, "learning_rate": 3.958691910499139e-06, - "logits/chosen": -3.480090618133545, - "logits/rejected": -3.4755382537841797, - "logps/chosen": -68.15080261230469, - "logps/rejected": -63.669464111328125, + "logits/chosen": -3.480597734451294, + "logits/rejected": -3.476097822189331, + "logps/chosen": -68.15984344482422, + "logps/rejected": -63.6702995300293, "loss": 0.6876, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.01759023405611515, - "rewards/margins": 0.012100599706172943, - "rewards/rejected": -0.029690831899642944, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.017680566757917404, + "rewards/margins": 0.012018715031445026, + "rewards/rejected": -0.029699280858039856, "step": 460 }, { "epoch": 0.08, "grad_norm": 0.34765625, "learning_rate": 4.0447504302926e-06, - "logits/chosen": -3.4969534873962402, - "logits/rejected": -3.4901657104492188, - "logps/chosen": -65.4578857421875, - "logps/rejected": -62.07141876220703, + "logits/chosen": -3.4964325428009033, + "logits/rejected": -3.489607334136963, + "logps/chosen": -65.46156311035156, + "logps/rejected": -62.06793212890625, "loss": 0.6826, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.01021095085889101, - "rewards/margins": 0.022059690207242966, - "rewards/rejected": -0.0322706401348114, + "rewards/chosen": -0.01024774368852377, + "rewards/margins": 0.021988026797771454, + "rewards/rejected": -0.0322357676923275, "step": 470 }, { "epoch": 0.08, "grad_norm": 0.384765625, "learning_rate": 4.1308089500860585e-06, - "logits/chosen": -3.4784607887268066, - "logits/rejected": -3.478670120239258, - "logps/chosen": -65.68806457519531, - "logps/rejected": -70.04661560058594, - "loss": 0.6836, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.015385270118713379, - "rewards/margins": 0.02035362645983696, - "rewards/rejected": -0.03573889657855034, + "logits/chosen": -3.4784443378448486, + "logits/rejected": -3.478658676147461, + "logps/chosen": -65.70050048828125, + "logps/rejected": -70.04431915283203, + "loss": 0.6837, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.015509704127907753, + "rewards/margins": 0.02020612731575966, + "rewards/rejected": -0.035715825855731964, "step": 480 }, { "epoch": 0.08, "grad_norm": 0.392578125, "learning_rate": 4.216867469879519e-06, - "logits/chosen": -3.473278760910034, - "logits/rejected": -3.4622421264648438, - "logps/chosen": -71.43934631347656, - "logps/rejected": -62.09912872314453, + "logits/chosen": -3.472611665725708, + "logits/rejected": -3.4615890979766846, + "logps/chosen": -71.42842864990234, + "logps/rejected": -62.09379959106445, "loss": 0.6781, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.00411622179672122, - "rewards/margins": 0.03165096789598465, - "rewards/rejected": -0.03576719015836716, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0040070959366858006, + "rewards/margins": 0.03170691058039665, + "rewards/rejected": -0.035714007914066315, "step": 490 }, { "epoch": 0.09, - "grad_norm": 0.390625, + "grad_norm": 0.392578125, "learning_rate": 4.302925989672978e-06, - "logits/chosen": -3.476792097091675, - "logits/rejected": -3.4702491760253906, - "logps/chosen": -67.82813262939453, - "logps/rejected": -63.65681076049805, + "logits/chosen": -3.4761765003204346, + "logits/rejected": -3.4696431159973145, + "logps/chosen": -67.80470275878906, + "logps/rejected": -63.649932861328125, "loss": 0.6864, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.01605495624244213, - "rewards/margins": 0.01460212655365467, - "rewards/rejected": -0.030657082796096802, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01582060381770134, + "rewards/margins": 0.014767659828066826, + "rewards/rejected": -0.030588263645768166, "step": 500 }, { "epoch": 0.09, - "eval_logits/chosen": -3.4687280654907227, - "eval_logits/rejected": -3.4669294357299805, - "eval_logps/chosen": -69.05587005615234, - "eval_logps/rejected": -74.20924377441406, - "eval_loss": 0.6864105463027954, - "eval_rewards/accuracies": 0.571561336517334, - "eval_rewards/chosen": 0.023541457951068878, - "eval_rewards/margins": 0.014563486911356449, - "eval_rewards/rejected": 0.00897796917706728, - "eval_runtime": 483.8192, - "eval_samples_per_second": 8.896, + "eval_logits/chosen": -3.4679765701293945, + "eval_logits/rejected": -3.4661855697631836, + "eval_logps/chosen": -69.05375671386719, + "eval_logps/rejected": -74.21285247802734, + "eval_loss": 0.6863834261894226, + "eval_rewards/accuracies": 0.5755111575126648, + "eval_rewards/chosen": 0.023562604561448097, + "eval_rewards/margins": 0.014620588161051273, + "eval_rewards/rejected": 0.00894201546907425, + "eval_runtime": 483.8442, + "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 500 }, { "epoch": 0.09, - "grad_norm": 0.423828125, + "grad_norm": 0.42578125, "learning_rate": 4.388984509466438e-06, - "logits/chosen": -3.469301700592041, - "logits/rejected": -3.4632949829101562, - "logps/chosen": -71.26062774658203, - "logps/rejected": -65.58277130126953, - "loss": 0.6858, + "logits/chosen": -3.4684576988220215, + "logits/rejected": -3.462451219558716, + "logps/chosen": -71.22663879394531, + "logps/rejected": -65.5745849609375, + "loss": 0.6856, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.026670118793845177, - "rewards/margins": 0.01618189923465252, - "rewards/rejected": -0.042852021753787994, + "rewards/chosen": -0.02633030153810978, + "rewards/margins": 0.01643994078040123, + "rewards/rejected": -0.04277024418115616, "step": 510 }, { "epoch": 0.09, "grad_norm": 0.59375, "learning_rate": 4.475043029259897e-06, - "logits/chosen": -3.4782233238220215, - "logits/rejected": -3.4720966815948486, - "logps/chosen": -72.25337982177734, - "logps/rejected": -67.89925384521484, - "loss": 0.6831, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.0589582622051239, - "rewards/margins": 0.021894726902246475, - "rewards/rejected": -0.08085299283266068, + "logits/chosen": -3.477713108062744, + "logits/rejected": -3.4715778827667236, + "logps/chosen": -72.2740707397461, + "logps/rejected": -67.93409729003906, + "loss": 0.683, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.05916525050997734, + "rewards/margins": 0.022036103531718254, + "rewards/rejected": -0.08120135962963104, "step": 520 }, { "epoch": 0.09, - "grad_norm": 0.404296875, + "grad_norm": 0.41015625, "learning_rate": 4.561101549053357e-06, - "logits/chosen": -3.468972682952881, - "logits/rejected": -3.4607043266296387, - "logps/chosen": -70.99177551269531, - "logps/rejected": -66.74734497070312, + "logits/chosen": -3.4684062004089355, + "logits/rejected": -3.4601237773895264, + "logps/chosen": -71.05815124511719, + "logps/rejected": -66.8134765625, "loss": 0.6744, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.044935859739780426, - "rewards/margins": 0.03998479247093201, - "rewards/rejected": -0.08492065221071243, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04559968411922455, + "rewards/margins": 0.03998229280114174, + "rewards/rejected": -0.08558198064565659, "step": 530 }, { "epoch": 0.09, "grad_norm": 0.435546875, "learning_rate": 4.647160068846816e-06, - "logits/chosen": -3.4679083824157715, - "logits/rejected": -3.4652633666992188, - "logps/chosen": -70.12334442138672, - "logps/rejected": -68.92913818359375, + "logits/chosen": -3.4680933952331543, + "logits/rejected": -3.4654452800750732, + "logps/chosen": -70.15602111816406, + "logps/rejected": -68.97514343261719, "loss": 0.6822, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.05896232649683952, - "rewards/margins": 0.024110907688736916, - "rewards/rejected": -0.08307323604822159, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.059289127588272095, + "rewards/margins": 0.024244192987680435, + "rewards/rejected": -0.08353332430124283, "step": 540 }, { "epoch": 0.09, "grad_norm": 0.46875, "learning_rate": 4.7332185886402755e-06, - "logits/chosen": -3.4796371459960938, - "logits/rejected": -3.476511001586914, - "logps/chosen": -68.4463119506836, - "logps/rejected": -67.37413024902344, - "loss": 0.676, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.04435231536626816, - "rewards/margins": 0.037112440913915634, - "rewards/rejected": -0.08146476745605469, + "logits/chosen": -3.4790825843811035, + "logits/rejected": -3.47590708732605, + "logps/chosen": -68.46205139160156, + "logps/rejected": -67.3900146484375, + "loss": 0.6761, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0445096381008625, + "rewards/margins": 0.037113942205905914, + "rewards/rejected": -0.08162357658147812, "step": 550 }, { "epoch": 0.1, - "grad_norm": 0.5546875, + "grad_norm": 0.55859375, "learning_rate": 4.819277108433735e-06, - "logits/chosen": -3.4870948791503906, - "logits/rejected": -3.486546754837036, - "logps/chosen": -67.91651916503906, - "logps/rejected": -71.3250961303711, - "loss": 0.6749, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.04496331140398979, - "rewards/margins": 0.03978149592876434, - "rewards/rejected": -0.08474480360746384, + "logits/chosen": -3.48732328414917, + "logits/rejected": -3.486783504486084, + "logps/chosen": -67.97535705566406, + "logps/rejected": -71.35958099365234, + "loss": 0.675, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.045551758259534836, + "rewards/margins": 0.03953787311911583, + "rewards/rejected": -0.08508963137865067, "step": 560 }, { "epoch": 0.1, - "grad_norm": 0.455078125, + "grad_norm": 0.462890625, "learning_rate": 4.905335628227195e-06, - "logits/chosen": -3.4745421409606934, - "logits/rejected": -3.4724769592285156, - "logps/chosen": -66.85447692871094, - "logps/rejected": -70.38420867919922, - "loss": 0.6793, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.04901731759309769, - "rewards/margins": 0.03062910959124565, - "rewards/rejected": -0.07964642345905304, + "logits/chosen": -3.473559617996216, + "logits/rejected": -3.471503734588623, + "logps/chosen": -67.1822738647461, + "logps/rejected": -70.67951965332031, + "loss": 0.6794, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.052295297384262085, + "rewards/margins": 0.030304264277219772, + "rewards/rejected": -0.08259955793619156, "step": 570 }, { "epoch": 0.1, - "grad_norm": 0.498046875, + "grad_norm": 0.5078125, "learning_rate": 4.991394148020655e-06, - "logits/chosen": -3.474553346633911, - "logits/rejected": -3.473367214202881, - "logps/chosen": -64.95735168457031, - "logps/rejected": -66.52529907226562, - "loss": 0.6809, + "logits/chosen": -3.4739394187927246, + "logits/rejected": -3.4727649688720703, + "logps/chosen": -65.24308776855469, + "logps/rejected": -66.8325424194336, + "loss": 0.6808, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.04008790850639343, - "rewards/margins": 0.026910793036222458, - "rewards/rejected": -0.06699870526790619, + "rewards/chosen": -0.04294530302286148, + "rewards/margins": 0.027125859633088112, + "rewards/rejected": -0.07007116079330444, "step": 580 }, { "epoch": 0.1, - "grad_norm": 0.62109375, + "grad_norm": 0.62890625, "learning_rate": 4.999963354556567e-06, - "logits/chosen": -3.468188524246216, - "logits/rejected": -3.4631600379943848, - "logps/chosen": -66.08407592773438, - "logps/rejected": -68.20244598388672, + "logits/chosen": -3.4689114093780518, + "logits/rejected": -3.463846206665039, + "logps/chosen": -65.95047760009766, + "logps/rejected": -68.07025146484375, "loss": 0.6766, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.012904520146548748, - "rewards/margins": 0.03578165918588638, - "rewards/rejected": -0.04868617653846741, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.011568538844585419, + "rewards/margins": 0.03579581156373024, + "rewards/rejected": -0.04736434668302536, "step": 590 }, { "epoch": 0.1, - "grad_norm": 0.5546875, + "grad_norm": 0.54296875, "learning_rate": 4.9998366803288885e-06, - "logits/chosen": -3.458890438079834, - "logits/rejected": -3.4556713104248047, - "logps/chosen": -68.03291320800781, - "logps/rejected": -70.33735656738281, - "loss": 0.6729, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.02153713069856167, - "rewards/margins": 0.043937359005212784, - "rewards/rejected": -0.0654744952917099, + "logits/chosen": -3.459731340408325, + "logits/rejected": -3.4565346240997314, + "logps/chosen": -67.85731506347656, + "logps/rejected": -70.11259460449219, + "loss": 0.6731, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.019781148061156273, + "rewards/margins": 0.04344576969742775, + "rewards/rejected": -0.06322692334651947, "step": 600 }, { "epoch": 0.1, - "eval_logits/chosen": -3.450634479522705, - "eval_logits/rejected": -3.448904514312744, - "eval_logps/chosen": -71.3561782836914, - "eval_logps/rejected": -77.16294860839844, - "eval_loss": 0.6836954951286316, - "eval_rewards/accuracies": 0.5868958830833435, - "eval_rewards/chosen": 0.0005383504321798682, - "eval_rewards/margins": 0.021097427234053612, - "eval_rewards/rejected": -0.020559076219797134, - "eval_runtime": 483.8167, + "eval_logits/chosen": -3.451451063156128, + "eval_logits/rejected": -3.44972562789917, + "eval_logps/chosen": -71.21887969970703, + "eval_logps/rejected": -77.00116729736328, + "eval_loss": 0.6837956309318542, + "eval_rewards/accuracies": 0.5871282815933228, + "eval_rewards/chosen": 0.001911371131427586, + "eval_rewards/margins": 0.020852578803896904, + "eval_rewards/rejected": -0.018941204994916916, + "eval_runtime": 483.7869, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 600 }, { "epoch": 0.11, - "grad_norm": 0.578125, + "grad_norm": 0.5859375, "learning_rate": 4.9996195294877135e-06, - "logits/chosen": -3.468038558959961, - "logits/rejected": -3.4673168659210205, - "logps/chosen": -72.26264953613281, - "logps/rejected": -73.00647735595703, - "loss": 0.6772, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.07366601377725601, - "rewards/margins": 0.03525187447667122, - "rewards/rejected": -0.10891789197921753, + "logits/chosen": -3.468444347381592, + "logits/rejected": -3.4676971435546875, + "logps/chosen": -72.33124542236328, + "logps/rejected": -73.11021423339844, + "loss": 0.677, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.07435192912817001, + "rewards/margins": 0.03560345247387886, + "rewards/rejected": -0.10995538532733917, "step": 610 }, { "epoch": 0.11, - "grad_norm": 0.703125, + "grad_norm": 0.68359375, "learning_rate": 4.999311909892384e-06, - "logits/chosen": -3.4714443683624268, - "logits/rejected": -3.4664993286132812, - "logps/chosen": -76.85301208496094, - "logps/rejected": -79.18445587158203, - "loss": 0.6598, + "logits/chosen": -3.4697864055633545, + "logits/rejected": -3.4647693634033203, + "logps/chosen": -77.31246948242188, + "logps/rejected": -79.62358856201172, + "loss": 0.6599, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.10608841478824615, - "rewards/margins": 0.07308591902256012, - "rewards/rejected": -0.17917433381080627, + "rewards/chosen": -0.11068302392959595, + "rewards/margins": 0.07288263738155365, + "rewards/rejected": -0.1835656464099884, "step": 620 }, { "epoch": 0.11, - "grad_norm": 0.625, + "grad_norm": 0.6171875, "learning_rate": 4.998913832676579e-06, - "logits/chosen": -3.4586329460144043, - "logits/rejected": -3.454993724822998, - "logps/chosen": -78.71052551269531, - "logps/rejected": -76.41368103027344, + "logits/chosen": -3.458186626434326, + "logits/rejected": -3.454568862915039, + "logps/chosen": -78.59623718261719, + "logps/rejected": -76.2980728149414, "loss": 0.6742, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.13212424516677856, - "rewards/margins": 0.043377432972192764, - "rewards/rejected": -0.17550167441368103, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13098138570785522, + "rewards/margins": 0.04336429387331009, + "rewards/rejected": -0.1743456870317459, "step": 630 }, { "epoch": 0.11, - "grad_norm": 0.99609375, + "grad_norm": 1.0390625, "learning_rate": 4.998425312247913e-06, - "logits/chosen": -3.470881700515747, - "logits/rejected": -3.467952013015747, - "logps/chosen": -77.85877990722656, - "logps/rejected": -81.37117767333984, - "loss": 0.6766, + "logits/chosen": -3.4712257385253906, + "logits/rejected": -3.468402862548828, + "logps/chosen": -77.81741333007812, + "logps/rejected": -81.34028625488281, + "loss": 0.6765, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.15040381252765656, - "rewards/margins": 0.04006998986005783, - "rewards/rejected": -0.19047380983829498, + "rewards/chosen": -0.14999021589756012, + "rewards/margins": 0.04017460346221924, + "rewards/rejected": -0.19016483426094055, "step": 640 }, { "epoch": 0.11, - "grad_norm": 0.59375, + "grad_norm": 0.58203125, "learning_rate": 4.997846366287408e-06, - "logits/chosen": -3.490457057952881, - "logits/rejected": -3.4913439750671387, - "logps/chosen": -80.30027770996094, - "logps/rejected": -80.63001251220703, - "loss": 0.6962, + "logits/chosen": -3.491013288497925, + "logits/rejected": -3.4919509887695312, + "logps/chosen": -79.38850402832031, + "logps/rejected": -79.69107818603516, + "loss": 0.6961, "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -0.17833223938941956, - "rewards/margins": -0.0010680959094315767, - "rewards/rejected": -0.17726415395736694, + "rewards/chosen": -0.16921451687812805, + "rewards/margins": -0.0013396050781011581, + "rewards/rejected": -0.16787490248680115, "step": 650 }, { "epoch": 0.11, - "grad_norm": 0.578125, + "grad_norm": 0.58203125, "learning_rate": 4.997177015748862e-06, - "logits/chosen": -3.451584577560425, - "logits/rejected": -3.4501731395721436, - "logps/chosen": -75.44596862792969, - "logps/rejected": -75.20318603515625, - "loss": 0.673, + "logits/chosen": -3.45072603225708, + "logits/rejected": -3.449314594268799, + "logps/chosen": -75.71781158447266, + "logps/rejected": -75.43241119384766, + "loss": 0.6732, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.09660879522562027, - "rewards/margins": 0.04417235776782036, - "rewards/rejected": -0.14078114926815033, + "rewards/chosen": -0.0993271917104721, + "rewards/margins": 0.04374626651406288, + "rewards/rejected": -0.14307346940040588, "step": 660 }, { "epoch": 0.12, - "grad_norm": 0.5234375, + "grad_norm": 0.51953125, "learning_rate": 4.996417284858085e-06, - "logits/chosen": -3.46248197555542, - "logits/rejected": -3.461444854736328, - "logps/chosen": -71.69635009765625, - "logps/rejected": -77.65562438964844, + "logits/chosen": -3.4625446796417236, + "logits/rejected": -3.4615135192871094, + "logps/chosen": -71.90211486816406, + "logps/rejected": -77.84752655029297, "loss": 0.6755, "rewards/accuracies": 0.625, - "rewards/chosen": -0.07967537641525269, - "rewards/margins": 0.04023570939898491, - "rewards/rejected": -0.1199110895395279, + "rewards/chosen": -0.0817331001162529, + "rewards/margins": 0.04009716957807541, + "rewards/rejected": -0.12183026969432831, "step": 670 }, { "epoch": 0.12, - "grad_norm": 0.53515625, + "grad_norm": 0.53125, "learning_rate": 4.995567201112025e-06, - "logits/chosen": -3.4511241912841797, - "logits/rejected": -3.447023868560791, - "logps/chosen": -73.80950927734375, - "logps/rejected": -71.80841827392578, + "logits/chosen": -3.4504151344299316, + "logits/rejected": -3.446338653564453, + "logps/chosen": -73.89292907714844, + "logps/rejected": -71.89225769042969, "loss": 0.6741, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.0922587662935257, - "rewards/margins": 0.041829634457826614, - "rewards/rejected": -0.134088397026062, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09309293329715729, + "rewards/margins": 0.041833892464637756, + "rewards/rejected": -0.13492682576179504, "step": 680 }, { "epoch": 0.12, - "grad_norm": 0.68359375, + "grad_norm": 0.671875, "learning_rate": 4.994626795277772e-06, - "logits/chosen": -3.475782871246338, - "logits/rejected": -3.4695351123809814, - "logps/chosen": -81.76004791259766, - "logps/rejected": -76.49635314941406, - "loss": 0.6668, + "logits/chosen": -3.4752120971679688, + "logits/rejected": -3.4689478874206543, + "logps/chosen": -81.83485412597656, + "logps/rejected": -76.503662109375, + "loss": 0.6672, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.11172202974557877, - "rewards/margins": 0.058455634862184525, - "rewards/rejected": -0.1701776683330536, + "rewards/chosen": -0.11247005313634872, + "rewards/margins": 0.057780712842941284, + "rewards/rejected": -0.1702507734298706, "step": 690 }, { "epoch": 0.12, "grad_norm": 0.61328125, "learning_rate": 4.993596101391443e-06, - "logits/chosen": -3.4717812538146973, - "logits/rejected": -3.4656291007995605, - "logps/chosen": -82.42352294921875, - "logps/rejected": -81.38658905029297, - "loss": 0.6745, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.1619960516691208, - "rewards/margins": 0.04331028833985329, - "rewards/rejected": -0.20530633628368378, + "logits/chosen": -3.471196413040161, + "logits/rejected": -3.4650673866271973, + "logps/chosen": -82.47694396972656, + "logps/rejected": -81.34616088867188, + "loss": 0.6749, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.16253015398979187, + "rewards/margins": 0.04237184301018715, + "rewards/rejected": -0.2049020230770111, "step": 700 }, { "epoch": 0.12, - "eval_logits/chosen": -3.4486892223358154, - "eval_logits/rejected": -3.446747303009033, - "eval_logps/chosen": -78.93724060058594, - "eval_logps/rejected": -85.99559020996094, - "eval_loss": 0.6785964965820312, - "eval_rewards/accuracies": 0.5954925417900085, - "eval_rewards/chosen": -0.07527217268943787, - "eval_rewards/margins": 0.03361326456069946, - "eval_rewards/rejected": -0.10888542979955673, - "eval_runtime": 483.831, - "eval_samples_per_second": 8.896, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.448903799057007, + "eval_logits/rejected": -3.44696044921875, + "eval_logps/chosen": -78.99446868896484, + "eval_logps/rejected": -86.01776885986328, + "eval_loss": 0.6787543296813965, + "eval_rewards/accuracies": 0.5980483293533325, + "eval_rewards/chosen": -0.07584448158740997, + "eval_rewards/margins": 0.03326273709535599, + "eval_rewards/rejected": -0.10910722613334656, + "eval_runtime": 484.0821, + "eval_samples_per_second": 8.891, + "eval_steps_per_second": 1.111, "step": 700 }, { "epoch": 0.12, "grad_norm": 0.73828125, "learning_rate": 4.992475156756952e-06, - "logits/chosen": -3.457979679107666, - "logits/rejected": -3.452868938446045, - "logps/chosen": -78.66218566894531, - "logps/rejected": -82.35487365722656, - "loss": 0.6669, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.14062485098838806, - "rewards/margins": 0.05895204097032547, - "rewards/rejected": -0.19957688450813293, + "logits/chosen": -3.4566562175750732, + "logits/rejected": -3.45149302482605, + "logps/chosen": -78.71736907958984, + "logps/rejected": -82.33731842041016, + "loss": 0.6672, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1411767303943634, + "rewards/margins": 0.05822443962097168, + "rewards/rejected": -0.19940117001533508, "step": 710 }, { "epoch": 0.12, - "grad_norm": 0.703125, + "grad_norm": 0.6953125, "learning_rate": 4.991264001944659e-06, - "logits/chosen": -3.448965549468994, - "logits/rejected": -3.4486708641052246, - "logps/chosen": -79.89633178710938, - "logps/rejected": -84.61561584472656, - "loss": 0.6662, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.15700937807559967, - "rewards/margins": 0.06110849231481552, - "rewards/rejected": -0.2181178778409958, + "logits/chosen": -3.448073148727417, + "logits/rejected": -3.4477546215057373, + "logps/chosen": -79.56318664550781, + "logps/rejected": -84.22515869140625, + "loss": 0.6664, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.15367797017097473, + "rewards/margins": 0.06053520366549492, + "rewards/rejected": -0.21421320736408234, "step": 720 }, { "epoch": 0.13, - "grad_norm": 0.98828125, + "grad_norm": 0.9921875, "learning_rate": 4.989962680789901e-06, - "logits/chosen": -3.4754321575164795, - "logits/rejected": -3.47139048576355, - "logps/chosen": -91.14657592773438, - "logps/rejected": -93.26203918457031, - "loss": 0.6583, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.24363143742084503, - "rewards/margins": 0.07978501170873642, - "rewards/rejected": -0.32341647148132324, + "logits/chosen": -3.4756839275360107, + "logits/rejected": -3.471583843231201, + "logps/chosen": -90.78223419189453, + "logps/rejected": -92.80326843261719, + "loss": 0.6587, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.23998799920082092, + "rewards/margins": 0.0788407102227211, + "rewards/rejected": -0.3188287019729614, "step": 730 }, { "epoch": 0.13, "grad_norm": 0.90234375, "learning_rate": 4.9885712403914095e-06, - "logits/chosen": -3.4460208415985107, - "logits/rejected": -3.4406116008758545, - "logps/chosen": -96.26573181152344, - "logps/rejected": -100.246826171875, - "loss": 0.6611, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.3055499196052551, - "rewards/margins": 0.07419048994779587, - "rewards/rejected": -0.3797404170036316, + "logits/chosen": -3.446706771850586, + "logits/rejected": -3.4412574768066406, + "logps/chosen": -96.09211730957031, + "logps/rejected": -100.0416030883789, + "loss": 0.6612, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.30381378531455994, + "rewards/margins": 0.073874332010746, + "rewards/rejected": -0.3776881694793701, "step": 740 }, { "epoch": 0.13, - "grad_norm": 0.81640625, + "grad_norm": 0.796875, "learning_rate": 4.9870897311096e-06, - "logits/chosen": -3.4695637226104736, - "logits/rejected": -3.4636406898498535, - "logps/chosen": -96.90727233886719, - "logps/rejected": -99.74751281738281, - "loss": 0.6584, + "logits/chosen": -3.4710693359375, + "logits/rejected": -3.4651427268981934, + "logps/chosen": -96.75785827636719, + "logps/rejected": -99.54371643066406, + "loss": 0.6585, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.2984744906425476, - "rewards/margins": 0.08067616075277328, - "rewards/rejected": -0.3791506886482239, + "rewards/chosen": -0.29698044061660767, + "rewards/margins": 0.08013220131397247, + "rewards/rejected": -0.37711262702941895, "step": 750 }, { "epoch": 0.13, - "grad_norm": 0.7578125, + "grad_norm": 0.74609375, "learning_rate": 4.985518206564751e-06, - "logits/chosen": -3.433748722076416, - "logits/rejected": -3.428129196166992, - "logps/chosen": -82.85520935058594, - "logps/rejected": -78.89864349365234, - "loss": 0.6736, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.17378118634223938, - "rewards/margins": 0.04856756702065468, - "rewards/rejected": -0.22234873473644257, + "logits/chosen": -3.434743881225586, + "logits/rejected": -3.4291248321533203, + "logps/chosen": -82.78807067871094, + "logps/rejected": -78.91937255859375, + "loss": 0.6731, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17310987412929535, + "rewards/margins": 0.04944610223174095, + "rewards/rejected": -0.2225559651851654, "step": 760 }, { "epoch": 0.13, - "grad_norm": 0.71484375, + "grad_norm": 0.69140625, "learning_rate": 4.983856723635067e-06, - "logits/chosen": -3.450777530670166, - "logits/rejected": -3.446965456008911, - "logps/chosen": -74.33808898925781, - "logps/rejected": -76.29521179199219, - "loss": 0.6671, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.10210736095905304, - "rewards/margins": 0.05967769771814346, - "rewards/rejected": -0.1617850363254547, + "logits/chosen": -3.4525153636932373, + "logits/rejected": -3.448737621307373, + "logps/chosen": -74.29286193847656, + "logps/rejected": -76.18905639648438, + "loss": 0.6674, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10165517032146454, + "rewards/margins": 0.05906829237937927, + "rewards/rejected": -0.1607234627008438, "step": 770 }, { "epoch": 0.13, "grad_norm": 0.7109375, "learning_rate": 4.982105342454616e-06, - "logits/chosen": -3.4446187019348145, - "logits/rejected": -3.438462734222412, - "logps/chosen": -78.2662353515625, - "logps/rejected": -80.68006896972656, - "loss": 0.6607, + "logits/chosen": -3.4456591606140137, + "logits/rejected": -3.439436674118042, + "logps/chosen": -78.19596862792969, + "logps/rejected": -80.53340148925781, + "loss": 0.6611, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.13819073140621185, - "rewards/margins": 0.07206545025110245, - "rewards/rejected": -0.2102561742067337, + "rewards/chosen": -0.13748806715011597, + "rewards/margins": 0.07130144536495209, + "rewards/rejected": -0.20878951251506805, "step": 780 }, { "epoch": 0.14, - "grad_norm": 0.70703125, + "grad_norm": 0.74609375, "learning_rate": 4.980264126411153e-06, - "logits/chosen": -3.4261698722839355, - "logits/rejected": -3.421785354614258, - "logps/chosen": -79.30335998535156, - "logps/rejected": -82.90193176269531, + "logits/chosen": -3.4281296730041504, + "logits/rejected": -3.4237301349639893, + "logps/chosen": -79.74320983886719, + "logps/rejected": -83.33930969238281, "loss": 0.6664, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.14551669359207153, - "rewards/margins": 0.062169916927814484, - "rewards/rejected": -0.2076866179704666, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.149915412068367, + "rewards/margins": 0.06214505434036255, + "rewards/rejected": -0.21206045150756836, "step": 790 }, { "epoch": 0.14, - "grad_norm": 0.97265625, + "grad_norm": 0.92578125, "learning_rate": 4.97833314214383e-06, - "logits/chosen": -3.4357306957244873, - "logits/rejected": -3.4315574169158936, - "logps/chosen": -87.78187561035156, - "logps/rejected": -92.76189422607422, - "loss": 0.6681, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.23072782158851624, - "rewards/margins": 0.06284385174512863, - "rewards/rejected": -0.29357171058654785, + "logits/chosen": -3.4373092651367188, + "logits/rejected": -3.433082103729248, + "logps/chosen": -87.97721862792969, + "logps/rejected": -93.013916015625, + "loss": 0.6678, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2326813042163849, + "rewards/margins": 0.06341059505939484, + "rewards/rejected": -0.29609188437461853, "step": 800 }, { "epoch": 0.14, - "eval_logits/chosen": -3.416940212249756, - "eval_logits/rejected": -3.415050983428955, - "eval_logps/chosen": -90.19148254394531, - "eval_logps/rejected": -98.6569595336914, - "eval_loss": 0.6738145351409912, - "eval_rewards/accuracies": 0.5954925417900085, - "eval_rewards/chosen": -0.1878146529197693, - "eval_rewards/margins": 0.0476844422519207, - "eval_rewards/rejected": -0.23549909889698029, - "eval_runtime": 484.0365, - "eval_samples_per_second": 8.892, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.4187793731689453, + "eval_logits/rejected": -3.4168734550476074, + "eval_logps/chosen": -89.99909210205078, + "eval_logps/rejected": -98.40325927734375, + "eval_loss": 0.674051821231842, + "eval_rewards/accuracies": 0.5906133651733398, + "eval_rewards/chosen": -0.18589067459106445, + "eval_rewards/margins": 0.047071486711502075, + "eval_rewards/rejected": -0.23296219110488892, + "eval_runtime": 483.9258, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 800 }, { "epoch": 0.14, - "grad_norm": 0.87890625, + "grad_norm": 0.90625, "learning_rate": 4.9763124595407785e-06, - "logits/chosen": -3.4379210472106934, - "logits/rejected": -3.4322097301483154, - "logps/chosen": -96.42681884765625, - "logps/rejected": -98.7668228149414, - "loss": 0.6686, + "logits/chosen": -3.43872332572937, + "logits/rejected": -3.4330341815948486, + "logps/chosen": -96.375, + "logps/rejected": -98.60665130615234, + "loss": 0.669, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.29752808809280396, - "rewards/margins": 0.061955541372299194, - "rewards/rejected": -0.35948362946510315, + "rewards/chosen": -0.29701000452041626, + "rewards/margins": 0.06087196618318558, + "rewards/rejected": -0.35788196325302124, "step": 810 }, { "epoch": 0.14, - "grad_norm": 0.70703125, + "grad_norm": 0.7109375, "learning_rate": 4.974202151736584e-06, - "logits/chosen": -3.4384703636169434, - "logits/rejected": -3.4332988262176514, - "logps/chosen": -87.07771301269531, - "logps/rejected": -90.78187561035156, - "loss": 0.6614, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.24036762118339539, - "rewards/margins": 0.07609249651432037, - "rewards/rejected": -0.31646016240119934, + "logits/chosen": -3.4395699501037598, + "logits/rejected": -3.4343771934509277, + "logps/chosen": -87.37809753417969, + "logps/rejected": -90.93356323242188, + "loss": 0.6619, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24337145686149597, + "rewards/margins": 0.07460545003414154, + "rewards/rejected": -0.3179769217967987, "step": 820 }, { "epoch": 0.14, - "grad_norm": 0.94921875, + "grad_norm": 0.91796875, "learning_rate": 4.972002295109638e-06, - "logits/chosen": -3.4144272804260254, - "logits/rejected": -3.4126338958740234, - "logps/chosen": -84.09645080566406, - "logps/rejected": -88.27267456054688, - "loss": 0.6581, + "logits/chosen": -3.4160866737365723, + "logits/rejected": -3.4143104553222656, + "logps/chosen": -84.53849029541016, + "logps/rejected": -88.76664733886719, + "loss": 0.6578, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.18734727799892426, - "rewards/margins": 0.08099165558815002, - "rewards/rejected": -0.2683389186859131, + "rewards/chosen": -0.19176757335662842, + "rewards/margins": 0.08151111006736755, + "rewards/rejected": -0.2732786536216736, "step": 830 }, { "epoch": 0.14, - "grad_norm": 0.93359375, + "grad_norm": 0.91015625, "learning_rate": 4.969712969279372e-06, - "logits/chosen": -3.4207637310028076, - "logits/rejected": -3.41292142868042, - "logps/chosen": -85.66937255859375, - "logps/rejected": -87.91021728515625, - "loss": 0.6529, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.203573539853096, - "rewards/margins": 0.09274569153785706, - "rewards/rejected": -0.29631924629211426, + "logits/chosen": -3.4232819080352783, + "logits/rejected": -3.4152915477752686, + "logps/chosen": -84.25233459472656, + "logps/rejected": -86.18933868408203, + "loss": 0.654, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.18940319120883942, + "rewards/margins": 0.08970735222101212, + "rewards/rejected": -0.27911052107810974, "step": 840 }, { "epoch": 0.15, - "grad_norm": 0.90234375, + "grad_norm": 0.85546875, "learning_rate": 4.967334257103379e-06, - "logits/chosen": -3.4048008918762207, - "logits/rejected": -3.4047226905822754, - "logps/chosen": -88.8246841430664, - "logps/rejected": -97.03109741210938, - "loss": 0.6693, + "logits/chosen": -3.4104301929473877, + "logits/rejected": -3.410254716873169, + "logps/chosen": -87.65679168701172, + "logps/rejected": -96.04679107666016, + "loss": 0.668, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.2746245265007019, - "rewards/margins": 0.06276088953018188, - "rewards/rejected": -0.3373854160308838, + "rewards/chosen": -0.2629455029964447, + "rewards/margins": 0.06459694355726242, + "rewards/rejected": -0.32754242420196533, "step": 850 }, { "epoch": 0.15, - "grad_norm": 1.5546875, + "grad_norm": 1.515625, "learning_rate": 4.9648662446744115e-06, - "logits/chosen": -3.4170143604278564, - "logits/rejected": -3.4106361865997314, - "logps/chosen": -86.87603759765625, - "logps/rejected": -92.53409576416016, - "loss": 0.6534, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.23126204311847687, - "rewards/margins": 0.09931255877017975, - "rewards/rejected": -0.3305746018886566, + "logits/chosen": -3.4208884239196777, + "logits/rejected": -3.414494276046753, + "logps/chosen": -86.74958801269531, + "logps/rejected": -92.23248291015625, + "loss": 0.6539, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.22999759018421173, + "rewards/margins": 0.0975608378648758, + "rewards/rejected": -0.3275584280490875, "step": 860 }, { "epoch": 0.15, - "grad_norm": 1.015625, + "grad_norm": 1.0078125, "learning_rate": 4.962309021317268e-06, - "logits/chosen": -3.4022164344787598, - "logits/rejected": -3.4014739990234375, - "logps/chosen": -83.60894775390625, - "logps/rejected": -93.01090240478516, - "loss": 0.6531, + "logits/chosen": -3.4057350158691406, + "logits/rejected": -3.4048256874084473, + "logps/chosen": -82.92234802246094, + "logps/rejected": -92.20657348632812, + "loss": 0.6534, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.220720574259758, - "rewards/margins": 0.09608150273561478, - "rewards/rejected": -0.316802054643631, + "rewards/chosen": -0.21385452151298523, + "rewards/margins": 0.09490419924259186, + "rewards/rejected": -0.3087587058544159, "step": 870 }, { "epoch": 0.15, - "grad_norm": 0.99609375, + "grad_norm": 1.0078125, "learning_rate": 4.959662679585559e-06, - "logits/chosen": -3.402447462081909, - "logits/rejected": -3.3985283374786377, - "logps/chosen": -91.08452606201172, - "logps/rejected": -96.7422103881836, - "loss": 0.6504, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.2804550230503082, - "rewards/margins": 0.10522060096263885, - "rewards/rejected": -0.3856756091117859, + "logits/chosen": -3.40657114982605, + "logits/rejected": -3.402423143386841, + "logps/chosen": -90.1636734008789, + "logps/rejected": -95.72185516357422, + "loss": 0.6505, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.27124643325805664, + "rewards/margins": 0.10422557592391968, + "rewards/rejected": -0.3754720091819763, "step": 880 }, { "epoch": 0.15, - "grad_norm": 0.921875, + "grad_norm": 0.87890625, "learning_rate": 4.956927315258356e-06, - "logits/chosen": -3.385986804962158, - "logits/rejected": -3.3800766468048096, - "logps/chosen": -96.92965698242188, - "logps/rejected": -94.84703826904297, - "loss": 0.6758, + "logits/chosen": -3.389841079711914, + "logits/rejected": -3.3837730884552, + "logps/chosen": -96.669189453125, + "logps/rejected": -94.84233093261719, + "loss": 0.6747, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.271820604801178, - "rewards/margins": 0.051150452345609665, - "rewards/rejected": -0.32297104597091675, + "rewards/chosen": -0.2692159414291382, + "rewards/margins": 0.05370805412530899, + "rewards/rejected": -0.3229239881038666, "step": 890 }, { "epoch": 0.16, - "grad_norm": 1.09375, + "grad_norm": 1.0703125, "learning_rate": 4.9541030273367276e-06, - "logits/chosen": -3.386122226715088, - "logits/rejected": -3.385578155517578, - "logps/chosen": -91.31932067871094, - "logps/rejected": -93.32504272460938, - "loss": 0.6661, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.26877859234809875, - "rewards/margins": 0.06785809248685837, - "rewards/rejected": -0.3366366922855377, + "logits/chosen": -3.388092041015625, + "logits/rejected": -3.3876006603240967, + "logps/chosen": -92.34650421142578, + "logps/rejected": -94.53489685058594, + "loss": 0.6655, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2790505290031433, + "rewards/margins": 0.069684699177742, + "rewards/rejected": -0.3487352430820465, "step": 900 }, { "epoch": 0.16, - "eval_logits/chosen": -3.375498056411743, - "eval_logits/rejected": -3.374028444290161, - "eval_logps/chosen": -88.59937286376953, - "eval_logps/rejected": -97.6375961303711, - "eval_loss": 0.671495795249939, - "eval_rewards/accuracies": 0.5922397971153259, - "eval_rewards/chosen": -0.17189349234104156, - "eval_rewards/margins": 0.053412046283483505, - "eval_rewards/rejected": -0.22530555725097656, - "eval_runtime": 483.8194, - "eval_samples_per_second": 8.896, + "eval_logits/chosen": -3.3825888633728027, + "eval_logits/rejected": -3.381103038787842, + "eval_logps/chosen": -89.96687316894531, + "eval_logps/rejected": -99.21875762939453, + "eval_loss": 0.6709262728691101, + "eval_rewards/accuracies": 0.5927044749259949, + "eval_rewards/chosen": -0.18556852638721466, + "eval_rewards/margins": 0.05554860830307007, + "eval_rewards/rejected": -0.24111711978912354, + "eval_runtime": 483.9133, + "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 900 }, { "epoch": 0.16, - "grad_norm": 0.93359375, + "grad_norm": 0.921875, "learning_rate": 4.951189918040154e-06, - "logits/chosen": -3.3894095420837402, - "logits/rejected": -3.3857262134552, - "logps/chosen": -91.1655044555664, - "logps/rejected": -99.69454193115234, - "loss": 0.6582, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.29526156187057495, - "rewards/margins": 0.09328372031450272, - "rewards/rejected": -0.3885452151298523, + "logits/chosen": -3.394538164138794, + "logits/rejected": -3.390890598297119, + "logps/chosen": -91.48938751220703, + "logps/rejected": -99.90650939941406, + "loss": 0.6586, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.29850026965141296, + "rewards/margins": 0.09216472506523132, + "rewards/rejected": -0.3906649947166443, "step": 910 }, { "epoch": 0.16, - "grad_norm": 1.0625, + "grad_norm": 0.9765625, "learning_rate": 4.948188092802828e-06, - "logits/chosen": -3.379016160964966, - "logits/rejected": -3.3721976280212402, - "logps/chosen": -94.36839294433594, - "logps/rejected": -92.09681701660156, - "loss": 0.6691, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.2924317717552185, - "rewards/margins": 0.06343097984790802, - "rewards/rejected": -0.35586273670196533, + "logits/chosen": -3.384577512741089, + "logits/rejected": -3.377805709838867, + "logps/chosen": -93.5625991821289, + "logps/rejected": -91.18093872070312, + "loss": 0.6696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2843739688396454, + "rewards/margins": 0.06233012676239014, + "rewards/rejected": -0.3467040956020355, "step": 920 }, { "epoch": 0.16, - "grad_norm": 0.9609375, + "grad_norm": 1.0234375, "learning_rate": 4.94509766026984e-06, - "logits/chosen": -3.3837249279022217, - "logits/rejected": -3.3793246746063232, - "logps/chosen": -87.13783264160156, - "logps/rejected": -95.68738555908203, - "loss": 0.6522, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.24055281281471252, - "rewards/margins": 0.10128758102655411, - "rewards/rejected": -0.34184038639068604, + "logits/chosen": -3.388658046722412, + "logits/rejected": -3.384310245513916, + "logps/chosen": -85.4650650024414, + "logps/rejected": -93.18614196777344, + "loss": 0.6555, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.22382517158985138, + "rewards/margins": 0.09300287067890167, + "rewards/rejected": -0.31682807207107544, "step": 930 }, { "epoch": 0.16, - "grad_norm": 0.9296875, + "grad_norm": 0.96875, "learning_rate": 4.941918732293246e-06, - "logits/chosen": -3.3826732635498047, - "logits/rejected": -3.3760414123535156, - "logps/chosen": -101.3785171508789, - "logps/rejected": -102.48109436035156, - "loss": 0.6574, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.3002975881099701, - "rewards/margins": 0.0871744379401207, - "rewards/rejected": -0.3874720633029938, + "logits/chosen": -3.3858253955841064, + "logits/rejected": -3.3792338371276855, + "logps/chosen": -99.0291519165039, + "logps/rejected": -99.58036804199219, + "loss": 0.6596, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.27680402994155884, + "rewards/margins": 0.08166074752807617, + "rewards/rejected": -0.358464777469635, "step": 940 }, { "epoch": 0.16, - "grad_norm": 0.94921875, + "grad_norm": 0.95703125, "learning_rate": 4.9386514239280156e-06, - "logits/chosen": -3.3428120613098145, - "logits/rejected": -3.3408846855163574, - "logps/chosen": -104.9477767944336, - "logps/rejected": -110.55946350097656, - "loss": 0.6662, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.3990733325481415, - "rewards/margins": 0.07994810491800308, - "rewards/rejected": -0.47902145981788635, + "logits/chosen": -3.3473052978515625, + "logits/rejected": -3.345482587814331, + "logps/chosen": -103.10880279541016, + "logps/rejected": -108.80924224853516, + "loss": 0.6652, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.3806834816932678, + "rewards/margins": 0.08083571493625641, + "rewards/rejected": -0.46151915192604065, "step": 950 }, { "epoch": 0.17, - "grad_norm": 0.91015625, + "grad_norm": 0.87890625, "learning_rate": 4.935295853427875e-06, - "logits/chosen": -3.345571517944336, - "logits/rejected": -3.3475120067596436, - "logps/chosen": -93.88117218017578, - "logps/rejected": -102.92850494384766, - "loss": 0.6674, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.32920488715171814, - "rewards/margins": 0.06851590424776077, - "rewards/rejected": -0.3977208137512207, + "logits/chosen": -3.3498833179473877, + "logits/rejected": -3.351750612258911, + "logps/chosen": -92.35511016845703, + "logps/rejected": -101.44236755371094, + "loss": 0.6671, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.3139442503452301, + "rewards/margins": 0.06891517341136932, + "rewards/rejected": -0.3828594386577606, "step": 960 }, { "epoch": 0.17, - "grad_norm": 1.140625, + "grad_norm": 0.95703125, "learning_rate": 4.9318521422410186e-06, - "logits/chosen": -3.3579020500183105, - "logits/rejected": -3.353294849395752, - "logps/chosen": -101.0849838256836, - "logps/rejected": -95.56061553955078, - "loss": 0.6744, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.307528555393219, - "rewards/margins": 0.05522537976503372, - "rewards/rejected": -0.3627539277076721, + "logits/chosen": -3.361420154571533, + "logits/rejected": -3.3568592071533203, + "logps/chosen": -99.36900329589844, + "logps/rejected": -93.79570770263672, + "loss": 0.674, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2903686761856079, + "rewards/margins": 0.054736148566007614, + "rewards/rejected": -0.345104843378067, "step": 970 }, { "epoch": 0.17, - "grad_norm": 0.8828125, + "grad_norm": 0.89453125, "learning_rate": 4.928320415005718e-06, - "logits/chosen": -3.3926453590393066, - "logits/rejected": -3.3888983726501465, - "logps/chosen": -90.27783203125, - "logps/rejected": -94.86637878417969, - "loss": 0.6541, + "logits/chosen": -3.3936476707458496, + "logits/rejected": -3.3898472785949707, + "logps/chosen": -89.65267944335938, + "logps/rejected": -94.29356384277344, + "loss": 0.6536, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.22734880447387695, - "rewards/margins": 0.09869574755430222, - "rewards/rejected": -0.3260445296764374, + "rewards/chosen": -0.22109732031822205, + "rewards/margins": 0.09921900928020477, + "rewards/rejected": -0.320316344499588, "step": 980 }, { "epoch": 0.17, - "grad_norm": 1.0546875, + "grad_norm": 1.015625, "learning_rate": 4.924700799545815e-06, - "logits/chosen": -3.3724656105041504, - "logits/rejected": -3.3683433532714844, - "logps/chosen": -92.61779022216797, - "logps/rejected": -98.67154693603516, - "loss": 0.6466, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.28296729922294617, - "rewards/margins": 0.11286024749279022, - "rewards/rejected": -0.3958275616168976, + "logits/chosen": -3.3750221729278564, + "logits/rejected": -3.3710360527038574, + "logps/chosen": -92.3640365600586, + "logps/rejected": -98.47993469238281, + "loss": 0.6459, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.280429869890213, + "rewards/margins": 0.11348159611225128, + "rewards/rejected": -0.3939114511013031, "step": 990 }, { "epoch": 0.17, - "grad_norm": 1.3515625, + "grad_norm": 1.21875, "learning_rate": 4.920993426866085e-06, - "logits/chosen": -3.3497300148010254, - "logits/rejected": -3.3454887866973877, - "logps/chosen": -116.35418701171875, - "logps/rejected": -115.3893051147461, - "loss": 0.6686, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4851459562778473, - "rewards/margins": 0.07257985323667526, - "rewards/rejected": -0.5577257871627808, + "logits/chosen": -3.356358766555786, + "logits/rejected": -3.352238893508911, + "logps/chosen": -115.8489990234375, + "logps/rejected": -114.68751525878906, + "loss": 0.6695, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.4800940155982971, + "rewards/margins": 0.0706138014793396, + "rewards/rejected": -0.5507077574729919, "step": 1000 }, { "epoch": 0.17, - "eval_logits/chosen": -3.3483028411865234, - "eval_logits/rejected": -3.3466553688049316, - "eval_logps/chosen": -111.16059112548828, - "eval_logps/rejected": -121.91670227050781, - "eval_loss": 0.6680831909179688, - "eval_rewards/accuracies": 0.5936338305473328, - "eval_rewards/chosen": -0.39750567078590393, - "eval_rewards/margins": 0.07059081643819809, - "eval_rewards/rejected": -0.4680964946746826, - "eval_runtime": 483.8996, - "eval_samples_per_second": 8.894, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.3611390590667725, + "eval_logits/rejected": -3.3595175743103027, + "eval_logps/chosen": -110.34318542480469, + "eval_logps/rejected": -120.9452896118164, + "eval_loss": 0.6685853600502014, + "eval_rewards/accuracies": 0.5945631861686707, + "eval_rewards/chosen": -0.389331579208374, + "eval_rewards/margins": 0.06905096769332886, + "eval_rewards/rejected": -0.4583825469017029, + "eval_runtime": 484.0568, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.111, "step": 1000 }, { "epoch": 0.17, - "grad_norm": 1.609375, + "grad_norm": 1.46875, "learning_rate": 4.917198431147504e-06, - "logits/chosen": -3.3379311561584473, - "logits/rejected": -3.3369412422180176, - "logps/chosen": -116.28370666503906, - "logps/rejected": -136.38829040527344, - "loss": 0.6223, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5286880731582642, - "rewards/margins": 0.19197988510131836, - "rewards/rejected": -0.7206679582595825, + "logits/chosen": -3.3470757007598877, + "logits/rejected": -3.346139907836914, + "logps/chosen": -114.94053649902344, + "logps/rejected": -134.77088928222656, + "loss": 0.6232, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.515256404876709, + "rewards/margins": 0.18923744559288025, + "rewards/rejected": -0.7044938206672668, "step": 1010 }, { "epoch": 0.18, - "grad_norm": 1.3203125, + "grad_norm": 1.265625, "learning_rate": 4.91331594974239e-06, - "logits/chosen": -3.3675708770751953, - "logits/rejected": -3.3619797229766846, - "logps/chosen": -126.566650390625, - "logps/rejected": -133.0839385986328, - "loss": 0.6425, + "logits/chosen": -3.3773505687713623, + "logits/rejected": -3.371786594390869, + "logps/chosen": -123.88951110839844, + "logps/rejected": -130.7519989013672, + "loss": 0.6408, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5853036642074585, - "rewards/margins": 0.12873545289039612, - "rewards/rejected": -0.714039146900177, + "rewards/chosen": -0.558532178401947, + "rewards/margins": 0.13218751549720764, + "rewards/rejected": -0.6907196640968323, "step": 1020 }, { "epoch": 0.18, - "grad_norm": 1.1953125, + "grad_norm": 1.2109375, "learning_rate": 4.90934612316943e-06, - "logits/chosen": -3.3567848205566406, - "logits/rejected": -3.352221965789795, - "logps/chosen": -112.8319320678711, - "logps/rejected": -117.05516052246094, - "loss": 0.6595, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.47556638717651367, - "rewards/margins": 0.09899481385946274, - "rewards/rejected": -0.5745611190795898, + "logits/chosen": -3.36560320854187, + "logits/rejected": -3.3610668182373047, + "logps/chosen": -111.2602310180664, + "logps/rejected": -115.28700256347656, + "loss": 0.66, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4598492980003357, + "rewards/margins": 0.09703020006418228, + "rewards/rejected": -0.5568795204162598, "step": 1030 }, { "epoch": 0.18, - "grad_norm": 1.3984375, + "grad_norm": 1.3359375, "learning_rate": 4.905289095108597e-06, - "logits/chosen": -3.362544298171997, - "logits/rejected": -3.360288619995117, - "logps/chosen": -109.68827819824219, - "logps/rejected": -116.13682556152344, - "loss": 0.6821, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -0.4699419140815735, - "rewards/margins": 0.05295227840542793, - "rewards/rejected": -0.5228942632675171, + "logits/chosen": -3.3711113929748535, + "logits/rejected": -3.368939161300659, + "logps/chosen": -109.08182525634766, + "logps/rejected": -115.71439361572266, + "loss": 0.6811, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4638773500919342, + "rewards/margins": 0.054792650043964386, + "rewards/rejected": -0.5186699628829956, "step": 1040 }, { "epoch": 0.18, - "grad_norm": 1.3125, + "grad_norm": 1.3515625, "learning_rate": 4.901145012395945e-06, - "logits/chosen": -3.340716600418091, - "logits/rejected": -3.3355400562286377, - "logps/chosen": -110.9808578491211, - "logps/rejected": -114.1646728515625, - "loss": 0.6485, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.43013253808021545, - "rewards/margins": 0.12207241356372833, - "rewards/rejected": -0.552204966545105, + "logits/chosen": -3.3484110832214355, + "logits/rejected": -3.3432037830352783, + "logps/chosen": -109.2693862915039, + "logps/rejected": -112.49226379394531, + "loss": 0.6478, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4130178987979889, + "rewards/margins": 0.12246304750442505, + "rewards/rejected": -0.5354809165000916, "step": 1050 }, { "epoch": 0.18, - "grad_norm": 1.3828125, + "grad_norm": 1.4765625, "learning_rate": 4.8969140250183036e-06, - "logits/chosen": -3.3481738567352295, - "logits/rejected": -3.344827175140381, - "logps/chosen": -110.7804183959961, - "logps/rejected": -113.61982727050781, - "loss": 0.6655, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4308958649635315, - "rewards/margins": 0.07684727013111115, - "rewards/rejected": -0.5077431201934814, + "logits/chosen": -3.355018138885498, + "logits/rejected": -3.3515992164611816, + "logps/chosen": -109.40594482421875, + "logps/rejected": -111.96232604980469, + "loss": 0.6666, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.41715121269226074, + "rewards/margins": 0.07401702553033829, + "rewards/rejected": -0.49116820096969604, "step": 1060 }, { "epoch": 0.18, - "grad_norm": 1.09375, + "grad_norm": 1.125, "learning_rate": 4.892596286107838e-06, - "logits/chosen": -3.381743907928467, - "logits/rejected": -3.3770880699157715, - "logps/chosen": -109.32756042480469, - "logps/rejected": -108.6933822631836, - "loss": 0.6764, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.41801881790161133, - "rewards/margins": 0.06484386324882507, - "rewards/rejected": -0.4828626215457916, + "logits/chosen": -3.389537811279297, + "logits/rejected": -3.3849761486053467, + "logps/chosen": -107.3383560180664, + "logps/rejected": -107.23735046386719, + "loss": 0.6731, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3981268107891083, + "rewards/margins": 0.07017555832862854, + "rewards/rejected": -0.4683023989200592, "step": 1070 }, { "epoch": 0.19, - "grad_norm": 0.859375, + "grad_norm": 0.8203125, "learning_rate": 4.888191951936516e-06, - "logits/chosen": -3.3660483360290527, - "logits/rejected": -3.3634610176086426, - "logps/chosen": -104.37919616699219, - "logps/rejected": -105.4195785522461, - "loss": 0.6605, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.3800170421600342, - "rewards/margins": 0.0873311385512352, - "rewards/rejected": -0.46734818816185, + "logits/chosen": -3.3715012073516846, + "logits/rejected": -3.3688080310821533, + "logps/chosen": -103.20147705078125, + "logps/rejected": -103.82188415527344, + "loss": 0.6623, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.36823979020118713, + "rewards/margins": 0.08313159644603729, + "rewards/rejected": -0.45137137174606323, "step": 1080 }, { "epoch": 0.19, - "grad_norm": 1.0078125, + "grad_norm": 1.296875, "learning_rate": 4.883701181910447e-06, - "logits/chosen": -3.3472914695739746, - "logits/rejected": -3.345745801925659, - "logps/chosen": -100.11200714111328, - "logps/rejected": -107.61392974853516, - "loss": 0.6578, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.3755348324775696, - "rewards/margins": 0.09375335276126862, - "rewards/rejected": -0.469288170337677, + "logits/chosen": -3.3489112854003906, + "logits/rejected": -3.3472843170166016, + "logps/chosen": -99.51116943359375, + "logps/rejected": -107.47447204589844, + "loss": 0.6554, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3695264458656311, + "rewards/margins": 0.09836713969707489, + "rewards/rejected": -0.4678936004638672, "step": 1090 }, { "epoch": 0.19, - "grad_norm": 1.1328125, + "grad_norm": 1.078125, "learning_rate": 4.879124138564116e-06, - "logits/chosen": -3.357037305831909, - "logits/rejected": -3.356238842010498, - "logps/chosen": -97.80937194824219, - "logps/rejected": -106.3392333984375, - "loss": 0.665, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.35469111800193787, - "rewards/margins": 0.08345221728086472, - "rewards/rejected": -0.4381433427333832, + "logits/chosen": -3.355736494064331, + "logits/rejected": -3.3550803661346436, + "logps/chosen": -97.91027069091797, + "logps/rejected": -106.47267150878906, + "loss": 0.6648, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.35570019483566284, + "rewards/margins": 0.08377765119075775, + "rewards/rejected": -0.4394778311252594, "step": 1100 }, { "epoch": 0.19, - "eval_logits/chosen": -3.3477072715759277, - "eval_logits/rejected": -3.3463261127471924, - "eval_logps/chosen": -92.17500305175781, - "eval_logps/rejected": -101.6746597290039, - "eval_loss": 0.6707614064216614, - "eval_rewards/accuracies": 0.5950278639793396, - "eval_rewards/chosen": -0.20764988660812378, - "eval_rewards/margins": 0.058026209473609924, - "eval_rewards/rejected": -0.2656761109828949, - "eval_runtime": 483.5387, - "eval_samples_per_second": 8.901, - "eval_steps_per_second": 1.113, + "eval_logits/chosen": -3.3453195095062256, + "eval_logits/rejected": -3.343949556350708, + "eval_logps/chosen": -92.19027709960938, + "eval_logps/rejected": -101.81735229492188, + "eval_loss": 0.670196533203125, + "eval_rewards/accuracies": 0.5975836515426636, + "eval_rewards/chosen": -0.2078026980161667, + "eval_rewards/margins": 0.05930037051439285, + "eval_rewards/rejected": -0.26710304617881775, + "eval_runtime": 483.818, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 1.112, "step": 1100 }, { "epoch": 0.19, - "grad_norm": 1.2578125, + "grad_norm": 1.234375, "learning_rate": 4.874460987554495e-06, - "logits/chosen": -3.361199140548706, - "logits/rejected": -3.363349437713623, - "logps/chosen": -93.87645721435547, - "logps/rejected": -103.97225189208984, - "loss": 0.6512, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.30472373962402344, - "rewards/margins": 0.10587634146213531, - "rewards/rejected": -0.41060003638267517, + "logits/chosen": -3.35750150680542, + "logits/rejected": -3.3596253395080566, + "logps/chosen": -93.9940185546875, + "logps/rejected": -104.469482421875, + "loss": 0.6495, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.30589941143989563, + "rewards/margins": 0.10967297852039337, + "rewards/rejected": -0.4155723452568054, "step": 1110 }, { "epoch": 0.19, - "grad_norm": 1.0, + "grad_norm": 1.0703125, "learning_rate": 4.869711897655058e-06, - "logits/chosen": -3.3703665733337402, - "logits/rejected": -3.3666203022003174, - "logps/chosen": -95.50496673583984, - "logps/rejected": -99.84251403808594, - "loss": 0.6539, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.287777841091156, - "rewards/margins": 0.09858438372612, - "rewards/rejected": -0.386362224817276, + "logits/chosen": -3.3663763999938965, + "logits/rejected": -3.3627076148986816, + "logps/chosen": -95.74103546142578, + "logps/rejected": -99.71363830566406, + "loss": 0.6558, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2901385426521301, + "rewards/margins": 0.09493489563465118, + "rewards/rejected": -0.3850734233856201, "step": 1120 }, { "epoch": 0.19, - "grad_norm": 1.1015625, + "grad_norm": 1.078125, "learning_rate": 4.864877040749659e-06, - "logits/chosen": -3.348633289337158, - "logits/rejected": -3.344589948654175, - "logps/chosen": -96.18132019042969, - "logps/rejected": -108.21700286865234, - "loss": 0.6387, + "logits/chosen": -3.3478622436523438, + "logits/rejected": -3.3438973426818848, + "logps/chosen": -94.52452087402344, + "logps/rejected": -106.47977447509766, + "loss": 0.6393, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.3397810161113739, - "rewards/margins": 0.13241739571094513, - "rewards/rejected": -0.4721984267234802, + "rewards/chosen": -0.3232131004333496, + "rewards/margins": 0.13161292672157288, + "rewards/rejected": -0.4548260271549225, "step": 1130 }, { "epoch": 0.2, - "grad_norm": 1.3203125, + "grad_norm": 1.21875, "learning_rate": 4.859956591826323e-06, - "logits/chosen": -3.3522610664367676, - "logits/rejected": -3.347743272781372, - "logps/chosen": -113.341064453125, - "logps/rejected": -117.01106262207031, - "loss": 0.6554, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4580538868904114, - "rewards/margins": 0.10810957849025726, - "rewards/rejected": -0.5661634802818298, + "logits/chosen": -3.353027820587158, + "logits/rejected": -3.3485264778137207, + "logps/chosen": -110.5881118774414, + "logps/rejected": -114.19659423828125, + "loss": 0.6553, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.43052440881729126, + "rewards/margins": 0.10749445110559464, + "rewards/rejected": -0.5380188226699829, "step": 1140 }, { "epoch": 0.2, - "grad_norm": 1.3515625, + "grad_norm": 1.3125, "learning_rate": 4.854950728970905e-06, - "logits/chosen": -3.325697660446167, - "logits/rejected": -3.3218960762023926, - "logps/chosen": -110.71246337890625, - "logps/rejected": -122.715576171875, - "loss": 0.6389, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.46634015440940857, - "rewards/margins": 0.1538677215576172, - "rewards/rejected": -0.6202079057693481, + "logits/chosen": -3.328634738922119, + "logits/rejected": -3.324906826019287, + "logps/chosen": -109.4224853515625, + "logps/rejected": -121.63139343261719, + "loss": 0.6376, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4534403681755066, + "rewards/margins": 0.15592564642429352, + "rewards/rejected": -0.6093659400939941, "step": 1150 }, { "epoch": 0.2, "grad_norm": 1.46875, "learning_rate": 4.849859633360649e-06, - "logits/chosen": -3.3398876190185547, - "logits/rejected": -3.3386433124542236, - "logps/chosen": -107.8994140625, - "logps/rejected": -120.6123275756836, - "loss": 0.6295, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.41763836145401, - "rewards/margins": 0.16287484765052795, - "rewards/rejected": -0.5805131793022156, + "logits/chosen": -3.3405518531799316, + "logits/rejected": -3.3392815589904785, + "logps/chosen": -109.72029876708984, + "logps/rejected": -123.24666595458984, + "loss": 0.626, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4358472228050232, + "rewards/margins": 0.17100933194160461, + "rewards/rejected": -0.6068565249443054, "step": 1160 }, { "epoch": 0.2, - "grad_norm": 1.5234375, + "grad_norm": 1.484375, "learning_rate": 4.84468348925763e-06, - "logits/chosen": -3.3097126483917236, - "logits/rejected": -3.3056182861328125, - "logps/chosen": -118.73319244384766, - "logps/rejected": -131.2072296142578, - "loss": 0.6517, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.5627143383026123, - "rewards/margins": 0.12967202067375183, - "rewards/rejected": -0.6923863291740417, + "logits/chosen": -3.3110427856445312, + "logits/rejected": -3.307129383087158, + "logps/chosen": -120.5337905883789, + "logps/rejected": -133.09884643554688, + "loss": 0.6506, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5807204246520996, + "rewards/margins": 0.13058218359947205, + "rewards/rejected": -0.7113025188446045, "step": 1170 }, { "epoch": 0.2, - "grad_norm": 2.0, + "grad_norm": 1.5625, "learning_rate": 4.83942248400208e-06, - "logits/chosen": -3.298457384109497, - "logits/rejected": -3.293428421020508, - "logps/chosen": -129.34249877929688, - "logps/rejected": -134.14891052246094, - "loss": 0.6615, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.6600443124771118, - "rewards/margins": 0.1017390713095665, - "rewards/rejected": -0.7617834210395813, + "logits/chosen": -3.299647808074951, + "logits/rejected": -3.294851303100586, + "logps/chosen": -131.7820281982422, + "logps/rejected": -137.20223999023438, + "loss": 0.6587, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6844396591186523, + "rewards/margins": 0.10787700116634369, + "rewards/rejected": -0.792316734790802, "step": 1180 }, { "epoch": 0.21, - "grad_norm": 1.6953125, + "grad_norm": 1.4765625, "learning_rate": 4.834076808005615e-06, - "logits/chosen": -3.3326334953308105, - "logits/rejected": -3.327303409576416, - "logps/chosen": -130.8115692138672, - "logps/rejected": -134.27078247070312, - "loss": 0.645, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.625819206237793, - "rewards/margins": 0.14104318618774414, - "rewards/rejected": -0.7668623924255371, + "logits/chosen": -3.3311972618103027, + "logits/rejected": -3.3262181282043457, + "logps/chosen": -132.84390258789062, + "logps/rejected": -136.3151092529297, + "loss": 0.6451, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6461424231529236, + "rewards/margins": 0.1411634385585785, + "rewards/rejected": -0.7873059511184692, "step": 1190 }, { "epoch": 0.21, - "grad_norm": 1.2578125, + "grad_norm": 1.2421875, "learning_rate": 4.828646654744338e-06, - "logits/chosen": -3.317185640335083, - "logits/rejected": -3.3153128623962402, - "logps/chosen": -117.51283264160156, - "logps/rejected": -124.31459045410156, - "loss": 0.6549, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.5366128087043762, - "rewards/margins": 0.11689277738332748, - "rewards/rejected": -0.6535056233406067, + "logits/chosen": -3.315372943878174, + "logits/rejected": -3.3137035369873047, + "logps/chosen": -116.4529037475586, + "logps/rejected": -123.21891784667969, + "loss": 0.6543, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5260134935379028, + "rewards/margins": 0.11653532832860947, + "rewards/rejected": -0.6425488591194153, "step": 1200 }, { "epoch": 0.21, - "eval_logits/chosen": -3.317342758178711, - "eval_logits/rejected": -3.3158934116363525, - "eval_logps/chosen": -107.33206176757812, - "eval_logps/rejected": -119.39058685302734, - "eval_loss": 0.6630815267562866, - "eval_rewards/accuracies": 0.5973513126373291, - "eval_rewards/chosen": -0.3592205047607422, - "eval_rewards/margins": 0.08361494541168213, - "eval_rewards/rejected": -0.4428354501724243, - "eval_runtime": 483.8876, - "eval_samples_per_second": 8.895, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.310985565185547, + "eval_logits/rejected": -3.309568405151367, + "eval_logps/chosen": -106.52164459228516, + "eval_logps/rejected": -118.23544311523438, + "eval_loss": 0.6642152070999146, + "eval_rewards/accuracies": 0.6010687947273254, + "eval_rewards/chosen": -0.35111624002456665, + "eval_rewards/margins": 0.08016779273748398, + "eval_rewards/rejected": -0.43128401041030884, + "eval_runtime": 484.1657, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.111, "step": 1200 }, { "epoch": 0.21, - "grad_norm": 1.4375, + "grad_norm": 1.4609375, "learning_rate": 4.82313222075184e-06, - "logits/chosen": -3.3175208568573, - "logits/rejected": -3.3128483295440674, - "logps/chosen": -119.05058288574219, - "logps/rejected": -130.1739044189453, - "loss": 0.6404, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5425583124160767, - "rewards/margins": 0.14496661722660065, - "rewards/rejected": -0.6875249147415161, + "logits/chosen": -3.3166213035583496, + "logits/rejected": -3.312199354171753, + "logps/chosen": -117.9781265258789, + "logps/rejected": -128.859130859375, + "loss": 0.6414, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5318335890769958, + "rewards/margins": 0.14254365861415863, + "rewards/rejected": -0.6743772625923157, "step": 1210 }, { "epoch": 0.21, - "grad_norm": 1.3515625, + "grad_norm": 1.328125, "learning_rate": 4.8175337056120844e-06, - "logits/chosen": -3.318523406982422, - "logits/rejected": -3.313075542449951, - "logps/chosen": -114.37835693359375, - "logps/rejected": -130.38021850585938, - "loss": 0.6252, + "logits/chosen": -3.3199057579040527, + "logits/rejected": -3.314791202545166, + "logps/chosen": -114.121826171875, + "logps/rejected": -130.0734100341797, + "loss": 0.6249, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5133898258209229, - "rewards/margins": 0.1854238063097, - "rewards/rejected": -0.6988136172294617, + "rewards/chosen": -0.5108245015144348, + "rewards/margins": 0.184920996427536, + "rewards/rejected": -0.6957454681396484, "step": 1220 }, { "epoch": 0.21, - "grad_norm": 1.7578125, + "grad_norm": 1.40625, "learning_rate": 4.811851311952185e-06, - "logits/chosen": -3.3374500274658203, - "logits/rejected": -3.3314366340637207, - "logps/chosen": -117.9470443725586, - "logps/rejected": -122.54820251464844, - "loss": 0.6514, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.5068861246109009, - "rewards/margins": 0.11951172351837158, - "rewards/rejected": -0.6263978481292725, + "logits/chosen": -3.3396987915039062, + "logits/rejected": -3.3339123725891113, + "logps/chosen": -117.48822021484375, + "logps/rejected": -122.26155853271484, + "loss": 0.6497, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5022979974746704, + "rewards/margins": 0.12123336642980576, + "rewards/rejected": -0.6235313415527344, "step": 1230 }, { "epoch": 0.21, - "grad_norm": 1.5859375, + "grad_norm": 1.5234375, "learning_rate": 4.80608524543507e-06, - "logits/chosen": -3.31003999710083, - "logits/rejected": -3.307063579559326, - "logps/chosen": -114.16108703613281, - "logps/rejected": -123.75483703613281, - "loss": 0.6615, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.516891360282898, - "rewards/margins": 0.11133322864770889, - "rewards/rejected": -0.6282245516777039, + "logits/chosen": -3.308558940887451, + "logits/rejected": -3.3058955669403076, + "logps/chosen": -113.2501449584961, + "logps/rejected": -122.56099700927734, + "loss": 0.662, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5077820420265198, + "rewards/margins": 0.10850410163402557, + "rewards/rejected": -0.6162861585617065, "step": 1240 }, { "epoch": 0.22, - "grad_norm": 1.5703125, + "grad_norm": 1.625, "learning_rate": 4.800235714752042e-06, - "logits/chosen": -3.3228580951690674, - "logits/rejected": -3.317410707473755, - "logps/chosen": -107.6076431274414, - "logps/rejected": -112.98020935058594, - "loss": 0.655, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.4211878776550293, - "rewards/margins": 0.10868742316961288, - "rewards/rejected": -0.5298753380775452, + "logits/chosen": -3.3179473876953125, + "logits/rejected": -3.312861680984497, + "logps/chosen": -107.0726547241211, + "logps/rejected": -111.9657211303711, + "loss": 0.6564, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4158380925655365, + "rewards/margins": 0.10389236360788345, + "rewards/rejected": -0.5197304487228394, "step": 1250 }, { "epoch": 0.22, - "grad_norm": 1.5703125, + "grad_norm": 1.4140625, "learning_rate": 4.7943029316152235e-06, - "logits/chosen": -3.303680896759033, - "logits/rejected": -3.296949863433838, - "logps/chosen": -109.7947769165039, - "logps/rejected": -117.12117004394531, - "loss": 0.6462, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.4526275098323822, - "rewards/margins": 0.13787886500358582, - "rewards/rejected": -0.590506374835968, + "logits/chosen": -3.2991790771484375, + "logits/rejected": -3.293081283569336, + "logps/chosen": -108.0817642211914, + "logps/rejected": -114.9096450805664, + "loss": 0.6473, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4354972839355469, + "rewards/margins": 0.13289375603199005, + "rewards/rejected": -0.5683910250663757, "step": 1260 }, { "epoch": 0.22, - "grad_norm": 1.6796875, + "grad_norm": 1.6484375, "learning_rate": 4.788287110749892e-06, - "logits/chosen": -3.304987668991089, - "logits/rejected": -3.3044636249542236, - "logps/chosen": -117.20857238769531, - "logps/rejected": -128.5338134765625, - "loss": 0.6531, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5425751805305481, - "rewards/margins": 0.11775865405797958, - "rewards/rejected": -0.6603338122367859, + "logits/chosen": -3.3053946495056152, + "logits/rejected": -3.3048996925354004, + "logps/chosen": -114.3599624633789, + "logps/rejected": -125.3141860961914, + "loss": 0.6533, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5140891671180725, + "rewards/margins": 0.1140485554933548, + "rewards/rejected": -0.6281377077102661, "step": 1270 }, { "epoch": 0.22, - "grad_norm": 1.5703125, + "grad_norm": 1.4609375, "learning_rate": 4.782188469886711e-06, - "logits/chosen": -3.3322014808654785, - "logits/rejected": -3.3311448097229004, - "logps/chosen": -118.02180480957031, - "logps/rejected": -141.24038696289062, - "loss": 0.6264, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.5545832514762878, - "rewards/margins": 0.1757308542728424, - "rewards/rejected": -0.7303141355514526, + "logits/chosen": -3.335669994354248, + "logits/rejected": -3.3347504138946533, + "logps/chosen": -113.2931900024414, + "logps/rejected": -135.6788787841797, + "loss": 0.629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5072969794273376, + "rewards/margins": 0.16740207374095917, + "rewards/rejected": -0.6746990084648132, "step": 1280 }, { "epoch": 0.22, - "grad_norm": 1.65625, + "grad_norm": 1.5625, "learning_rate": 4.776007229753847e-06, - "logits/chosen": -3.297036647796631, - "logits/rejected": -3.292588710784912, - "logps/chosen": -134.52230834960938, - "logps/rejected": -141.50653076171875, - "loss": 0.6549, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6898329854011536, - "rewards/margins": 0.13372239470481873, - "rewards/rejected": -0.8235553503036499, + "logits/chosen": -3.308954954147339, + "logits/rejected": -3.304429292678833, + "logps/chosen": -129.07186889648438, + "logps/rejected": -136.5091094970703, + "loss": 0.651, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6353288888931274, + "rewards/margins": 0.13825224339962006, + "rewards/rejected": -0.7735811471939087, "step": 1290 }, { "epoch": 0.22, - "grad_norm": 1.609375, + "grad_norm": 1.421875, "learning_rate": 4.7697436140689894e-06, - "logits/chosen": -3.283554792404175, - "logits/rejected": -3.280672788619995, - "logps/chosen": -129.9937744140625, - "logps/rejected": -145.84243774414062, - "loss": 0.6536, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.7106422185897827, - "rewards/margins": 0.13219983875751495, - "rewards/rejected": -0.8428421020507812, + "logits/chosen": -3.296621799468994, + "logits/rejected": -3.2937755584716797, + "logps/chosen": -126.10877990722656, + "logps/rejected": -141.7520294189453, + "loss": 0.6535, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6717920303344727, + "rewards/margins": 0.13014563918113708, + "rewards/rejected": -0.8019376993179321, "step": 1300 }, { "epoch": 0.22, - "eval_logits/chosen": -3.2736623287200928, - "eval_logits/rejected": -3.2721986770629883, - "eval_logps/chosen": -121.81114959716797, - "eval_logps/rejected": -135.54385375976562, - "eval_loss": 0.6590760946273804, - "eval_rewards/accuracies": 0.597815990447998, - "eval_rewards/chosen": -0.5040112733840942, - "eval_rewards/margins": 0.10035695135593414, - "eval_rewards/rejected": -0.6043682098388672, - "eval_runtime": 483.6725, - "eval_samples_per_second": 8.899, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.283175468444824, + "eval_logits/rejected": -3.2817232608795166, + "eval_logps/chosen": -117.92476654052734, + "eval_logps/rejected": -131.19667053222656, + "eval_loss": 0.6605014801025391, + "eval_rewards/accuracies": 0.5989776849746704, + "eval_rewards/chosen": -0.4651474356651306, + "eval_rewards/margins": 0.09574878215789795, + "eval_rewards/rejected": -0.5608961582183838, + "eval_runtime": 484.3981, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 1.111, "step": 1300 }, { "epoch": 0.23, - "grad_norm": 1.5859375, + "grad_norm": 1.78125, "learning_rate": 4.763397849531239e-06, - "logits/chosen": -3.269982099533081, - "logits/rejected": -3.26440167427063, - "logps/chosen": -129.6372833251953, - "logps/rejected": -141.26046752929688, - "loss": 0.6313, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6593326330184937, - "rewards/margins": 0.1772792637348175, - "rewards/rejected": -0.8366119265556335, + "logits/chosen": -3.285306215286255, + "logits/rejected": -3.279937744140625, + "logps/chosen": -125.95235443115234, + "logps/rejected": -137.43260192871094, + "loss": 0.6306, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6224833726882935, + "rewards/margins": 0.17584998905658722, + "rewards/rejected": -0.7983332872390747, "step": 1310 }, { "epoch": 0.23, - "grad_norm": 1.921875, + "grad_norm": 1.765625, "learning_rate": 4.756970165812914e-06, - "logits/chosen": -3.2888553142547607, - "logits/rejected": -3.2863876819610596, - "logps/chosen": -132.07308959960938, - "logps/rejected": -137.927001953125, - "loss": 0.6499, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.6589027643203735, - "rewards/margins": 0.1327061951160431, - "rewards/rejected": -0.791608989238739, + "logits/chosen": -3.304464817047119, + "logits/rejected": -3.301752805709839, + "logps/chosen": -129.58468627929688, + "logps/rejected": -134.88809204101562, + "loss": 0.6512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6340187788009644, + "rewards/margins": 0.12720128893852234, + "rewards/rejected": -0.7612199783325195, "step": 1320 }, { "epoch": 0.23, - "grad_norm": 1.3828125, + "grad_norm": 1.265625, "learning_rate": 4.750460795551235e-06, - "logits/chosen": -3.291693925857544, - "logits/rejected": -3.2884392738342285, - "logps/chosen": -127.37693786621094, - "logps/rejected": -136.66612243652344, - "loss": 0.6309, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.6089495420455933, - "rewards/margins": 0.16879227757453918, - "rewards/rejected": -0.7777417898178101, + "logits/chosen": -3.3049120903015137, + "logits/rejected": -3.3017563819885254, + "logps/chosen": -123.66972351074219, + "logps/rejected": -132.18521118164062, + "loss": 0.6332, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5718771815299988, + "rewards/margins": 0.16105546057224274, + "rewards/rejected": -0.7329326868057251, "step": 1330 }, { "epoch": 0.23, - "grad_norm": 1.3984375, + "grad_norm": 1.3046875, "learning_rate": 4.743869974339904e-06, - "logits/chosen": -3.283473491668701, - "logits/rejected": -3.2803750038146973, - "logps/chosen": -123.50019836425781, - "logps/rejected": -133.4915008544922, - "loss": 0.6349, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5720099806785583, - "rewards/margins": 0.1526518613100052, - "rewards/rejected": -0.7246618866920471, + "logits/chosen": -3.297811985015869, + "logits/rejected": -3.2948131561279297, + "logps/chosen": -117.7374496459961, + "logps/rejected": -127.28782653808594, + "loss": 0.6355, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5143827199935913, + "rewards/margins": 0.14824241399765015, + "rewards/rejected": -0.6626251935958862, "step": 1340 }, { "epoch": 0.23, - "grad_norm": 2.390625, + "grad_norm": 1.9375, "learning_rate": 4.737197940720577e-06, - "logits/chosen": -3.2804999351501465, - "logits/rejected": -3.277661085128784, - "logps/chosen": -144.21261596679688, - "logps/rejected": -147.98348999023438, - "loss": 0.6961, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -0.8038604855537415, - "rewards/margins": 0.057847969233989716, - "rewards/rejected": -0.8617083430290222, + "logits/chosen": -3.3007519245147705, + "logits/rejected": -3.2976622581481934, + "logps/chosen": -135.472900390625, + "logps/rejected": -139.4297332763672, + "loss": 0.6914, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.7164632081985474, + "rewards/margins": 0.059707384556531906, + "rewards/rejected": -0.776170551776886, "step": 1350 }, { "epoch": 0.23, - "grad_norm": 1.6796875, + "grad_norm": 1.40625, "learning_rate": 4.730444936174233e-06, - "logits/chosen": -3.2664542198181152, - "logits/rejected": -3.265286922454834, - "logps/chosen": -131.89950561523438, - "logps/rejected": -141.6063232421875, - "loss": 0.6582, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.655137836933136, - "rewards/margins": 0.12065533548593521, - "rewards/rejected": -0.775793194770813, + "logits/chosen": -3.282017469406128, + "logits/rejected": -3.280801296234131, + "logps/chosen": -125.0278549194336, + "logps/rejected": -134.0998992919922, + "loss": 0.6585, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5864216089248657, + "rewards/margins": 0.11430720239877701, + "rewards/rejected": -0.7007287740707397, "step": 1360 }, { "epoch": 0.24, - "grad_norm": 1.78125, + "grad_norm": 1.6484375, "learning_rate": 4.723611205112431e-06, - "logits/chosen": -3.2844204902648926, - "logits/rejected": -3.2812061309814453, - "logps/chosen": -126.83049011230469, - "logps/rejected": -141.07415771484375, - "loss": 0.6365, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6328147649765015, - "rewards/margins": 0.1619427502155304, - "rewards/rejected": -0.794757604598999, + "logits/chosen": -3.2988338470458984, + "logits/rejected": -3.2955524921417236, + "logps/chosen": -120.72515869140625, + "logps/rejected": -134.67813110351562, + "loss": 0.6359, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5717614889144897, + "rewards/margins": 0.15903595089912415, + "rewards/rejected": -0.7307974696159363, "step": 1370 }, { "epoch": 0.24, - "grad_norm": 1.578125, + "grad_norm": 1.46875, "learning_rate": 4.716696994868467e-06, - "logits/chosen": -3.2731170654296875, - "logits/rejected": -3.269042491912842, - "logps/chosen": -130.5900115966797, - "logps/rejected": -138.87216186523438, - "loss": 0.6507, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.6424145698547363, - "rewards/margins": 0.1399935781955719, - "rewards/rejected": -0.7824081182479858, + "logits/chosen": -3.2845749855041504, + "logits/rejected": -3.280909776687622, + "logps/chosen": -125.68714904785156, + "logps/rejected": -133.53616333007812, + "loss": 0.6502, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5933858156204224, + "rewards/margins": 0.1356620490550995, + "rewards/rejected": -0.7290478944778442, "step": 1380 }, { "epoch": 0.24, - "grad_norm": 1.453125, + "grad_norm": 1.3671875, "learning_rate": 4.70970255568842e-06, - "logits/chosen": -3.2995362281799316, - "logits/rejected": -3.2949798107147217, - "logps/chosen": -133.54818725585938, - "logps/rejected": -139.49545288085938, - "loss": 0.6564, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.6576114892959595, - "rewards/margins": 0.1290406435728073, - "rewards/rejected": -0.7866522073745728, + "logits/chosen": -3.304932117462158, + "logits/rejected": -3.300816059112549, + "logps/chosen": -130.515869140625, + "logps/rejected": -136.6916046142578, + "loss": 0.6547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6272884607315063, + "rewards/margins": 0.13132533431053162, + "rewards/rejected": -0.7586137652397156, "step": 1390 }, { "epoch": 0.24, - "grad_norm": 1.3359375, + "grad_norm": 1.25, "learning_rate": 4.702628140722096e-06, - "logits/chosen": -3.266584873199463, - "logits/rejected": -3.2621452808380127, - "logps/chosen": -122.13375091552734, - "logps/rejected": -132.69252014160156, - "loss": 0.6303, + "logits/chosen": -3.2679882049560547, + "logits/rejected": -3.2638022899627686, + "logps/chosen": -120.34420013427734, + "logps/rejected": -130.57388305664062, + "loss": 0.6315, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5562669634819031, - "rewards/margins": 0.16778331995010376, - "rewards/rejected": -0.7240502238273621, + "rewards/chosen": -0.538371205329895, + "rewards/margins": 0.16449260711669922, + "rewards/rejected": -0.702863872051239, "step": 1400 }, { "epoch": 0.24, - "eval_logits/chosen": -3.279047966003418, - "eval_logits/rejected": -3.2774879932403564, - "eval_logps/chosen": -111.65291595458984, - "eval_logps/rejected": -124.72959899902344, - "eval_loss": 0.6592565774917603, - "eval_rewards/accuracies": 0.6054832935333252, - "eval_rewards/chosen": -0.40242883563041687, - "eval_rewards/margins": 0.09379658102989197, - "eval_rewards/rejected": -0.4962254762649536, - "eval_runtime": 483.833, - "eval_samples_per_second": 8.896, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.2787537574768066, + "eval_logits/rejected": -3.2772786617279053, + "eval_logps/chosen": -109.42464447021484, + "eval_logps/rejected": -122.14970397949219, + "eval_loss": 0.6605932116508484, + "eval_rewards/accuracies": 0.6138476133346558, + "eval_rewards/chosen": -0.38014617562294006, + "eval_rewards/margins": 0.0902804583311081, + "eval_rewards/rejected": -0.47042664885520935, + "eval_runtime": 484.6215, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.11, "step": 1400 }, { "epoch": 0.24, - "grad_norm": 1.703125, + "grad_norm": 1.625, "learning_rate": 4.695474006013865e-06, - "logits/chosen": -3.281132459640503, - "logits/rejected": -3.2756476402282715, - "logps/chosen": -119.60661315917969, - "logps/rejected": -133.5, - "loss": 0.6398, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.561919629573822, - "rewards/margins": 0.16075286269187927, - "rewards/rejected": -0.7226725220680237, + "logits/chosen": -3.282273769378662, + "logits/rejected": -3.276782989501953, + "logps/chosen": -116.5535888671875, + "logps/rejected": -130.62254333496094, + "loss": 0.6384, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5313894748687744, + "rewards/margins": 0.16250842809677124, + "rewards/rejected": -0.6938979029655457, "step": 1410 }, { "epoch": 0.24, - "grad_norm": 1.4609375, + "grad_norm": 1.4140625, "learning_rate": 4.688240410493394e-06, - "logits/chosen": -3.2566521167755127, - "logits/rejected": -3.2544357776641846, - "logps/chosen": -121.44035339355469, - "logps/rejected": -143.53671264648438, - "loss": 0.6311, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6094900965690613, - "rewards/margins": 0.19036082923412323, - "rewards/rejected": -0.7998508214950562, + "logits/chosen": -3.258557081222534, + "logits/rejected": -3.256169080734253, + "logps/chosen": -118.12491607666016, + "logps/rejected": -140.1725311279297, + "loss": 0.6306, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.576335608959198, + "rewards/margins": 0.18987338244915009, + "rewards/rejected": -0.7662090063095093, "step": 1420 }, { "epoch": 0.25, - "grad_norm": 2.1875, + "grad_norm": 2.3125, "learning_rate": 4.6809276159662785e-06, - "logits/chosen": -3.2630069255828857, - "logits/rejected": -3.262012004852295, - "logps/chosen": -140.1363983154297, - "logps/rejected": -158.3402557373047, - "loss": 0.6233, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7564435005187988, - "rewards/margins": 0.19494546949863434, - "rewards/rejected": -0.9513890147209167, + "logits/chosen": -3.264193296432495, + "logits/rejected": -3.263188123703003, + "logps/chosen": -136.53036499023438, + "logps/rejected": -155.8864288330078, + "loss": 0.6172, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7203831076622009, + "rewards/margins": 0.2064674347639084, + "rewards/rejected": -0.9268506169319153, "step": 1430 }, { "epoch": 0.25, - "grad_norm": 2.296875, + "grad_norm": 2.15625, "learning_rate": 4.673535887104561e-06, - "logits/chosen": -3.2323620319366455, - "logits/rejected": -3.22904896736145, - "logps/chosen": -142.92724609375, - "logps/rejected": -149.91787719726562, - "loss": 0.6612, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.8208937644958496, - "rewards/margins": 0.1224142462015152, - "rewards/rejected": -0.9433080554008484, + "logits/chosen": -3.230701446533203, + "logits/rejected": -3.2279746532440186, + "logps/chosen": -140.97439575195312, + "logps/rejected": -147.69012451171875, + "loss": 0.662, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8013652563095093, + "rewards/margins": 0.11966520547866821, + "rewards/rejected": -0.9210304021835327, "step": 1440 }, { "epoch": 0.25, - "grad_norm": 2.0, + "grad_norm": 1.9921875, "learning_rate": 4.6660654914371575e-06, - "logits/chosen": -3.2538230419158936, - "logits/rejected": -3.250257968902588, - "logps/chosen": -145.50843811035156, - "logps/rejected": -160.82852172851562, - "loss": 0.626, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.8006998300552368, - "rewards/margins": 0.19958093762397766, - "rewards/rejected": -1.000280737876892, + "logits/chosen": -3.2500369548797607, + "logits/rejected": -3.2470641136169434, + "logps/chosen": -144.15005493164062, + "logps/rejected": -159.1547088623047, + "loss": 0.6271, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7871159911155701, + "rewards/margins": 0.19642707705497742, + "rewards/rejected": -0.9835430383682251, "step": 1450 }, { "epoch": 0.25, - "grad_norm": 2.046875, + "grad_norm": 1.9921875, "learning_rate": 4.658516699340171e-06, - "logits/chosen": -3.237740993499756, - "logits/rejected": -3.236027479171753, - "logps/chosen": -146.0584716796875, - "logps/rejected": -158.69305419921875, - "loss": 0.6456, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.8310154676437378, - "rewards/margins": 0.17097845673561096, - "rewards/rejected": -1.0019938945770264, + "logits/chosen": -3.2364182472229004, + "logits/rejected": -3.2350940704345703, + "logps/chosen": -142.92050170898438, + "logps/rejected": -155.60977172851562, + "loss": 0.644, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7996357679367065, + "rewards/margins": 0.1715250462293625, + "rewards/rejected": -0.9711607098579407, "step": 1460 }, { "epoch": 0.25, - "grad_norm": 1.53125, + "grad_norm": 1.46875, "learning_rate": 4.650889784027109e-06, - "logits/chosen": -3.275007724761963, - "logits/rejected": -3.270531415939331, - "logps/chosen": -137.88790893554688, - "logps/rejected": -147.54556274414062, - "loss": 0.6391, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.6958156824111938, - "rewards/margins": 0.15993863344192505, - "rewards/rejected": -0.8557542562484741, + "logits/chosen": -3.2780442237854004, + "logits/rejected": -3.273808240890503, + "logps/chosen": -133.93801879882812, + "logps/rejected": -143.4498748779297, + "loss": 0.6389, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.656316876411438, + "rewards/margins": 0.15848079323768616, + "rewards/rejected": -0.814797580242157, "step": 1470 }, { "epoch": 0.26, - "grad_norm": 1.875, + "grad_norm": 1.828125, "learning_rate": 4.64318502153899e-06, - "logits/chosen": -3.270411252975464, - "logits/rejected": -3.2654106616973877, - "logps/chosen": -133.38864135742188, - "logps/rejected": -146.6710968017578, - "loss": 0.646, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7125257253646851, - "rewards/margins": 0.14800339937210083, - "rewards/rejected": -0.8605291247367859, + "logits/chosen": -3.274941921234131, + "logits/rejected": -3.2701942920684814, + "logps/chosen": -127.15447998046875, + "logps/rejected": -139.1614227294922, + "loss": 0.6487, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6501842141151428, + "rewards/margins": 0.13524849712848663, + "rewards/rejected": -0.7854325771331787, "step": 1480 }, { "epoch": 0.26, - "grad_norm": 2.0, + "grad_norm": 1.8671875, "learning_rate": 4.635402690734362e-06, - "logits/chosen": -3.2487213611602783, - "logits/rejected": -3.2444915771484375, - "logps/chosen": -146.71372985839844, - "logps/rejected": -153.9183807373047, - "loss": 0.6525, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.7878586649894714, - "rewards/margins": 0.14298923313617706, - "rewards/rejected": -0.9308478236198425, + "logits/chosen": -3.253192186355591, + "logits/rejected": -3.2492566108703613, + "logps/chosen": -138.96067810058594, + "logps/rejected": -145.4950714111328, + "loss": 0.653, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.710328221321106, + "rewards/margins": 0.13628648221492767, + "rewards/rejected": -0.84661465883255, "step": 1490 }, { "epoch": 0.26, - "grad_norm": 1.96875, + "grad_norm": 1.8203125, "learning_rate": 4.627543073279197e-06, - "logits/chosen": -3.250176191329956, - "logits/rejected": -3.25103759765625, - "logps/chosen": -146.10208129882812, - "logps/rejected": -159.60694885253906, - "loss": 0.6611, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.8511150479316711, - "rewards/margins": 0.13717147707939148, - "rewards/rejected": -0.9882864952087402, + "logits/chosen": -3.2492332458496094, + "logits/rejected": -3.2501883506774902, + "logps/chosen": -138.17279052734375, + "logps/rejected": -151.1929931640625, + "loss": 0.6595, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7718222141265869, + "rewards/margins": 0.13232490420341492, + "rewards/rejected": -0.9041470289230347, "step": 1500 }, { "epoch": 0.26, - "eval_logits/chosen": -3.2471868991851807, - "eval_logits/rejected": -3.2453789710998535, - "eval_logps/chosen": -132.2458038330078, - "eval_logps/rejected": -148.12802124023438, - "eval_loss": 0.6527114510536194, - "eval_rewards/accuracies": 0.6138476133346558, - "eval_rewards/chosen": -0.6083579063415527, - "eval_rewards/margins": 0.12185192108154297, - "eval_rewards/rejected": -0.7302098274230957, - "eval_runtime": 483.8595, - "eval_samples_per_second": 8.895, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.2445855140686035, + "eval_logits/rejected": -3.2428994178771973, + "eval_logps/chosen": -127.01957702636719, + "eval_logps/rejected": -142.2230987548828, + "eval_loss": 0.6543667912483215, + "eval_rewards/accuracies": 0.6196561455726624, + "eval_rewards/chosen": -0.556095540523529, + "eval_rewards/margins": 0.11506481468677521, + "eval_rewards/rejected": -0.6711603999137878, + "eval_runtime": 484.307, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.111, "step": 1500 }, { "epoch": 0.26, - "grad_norm": 1.3203125, + "grad_norm": 1.3359375, "learning_rate": 4.619606453636708e-06, - "logits/chosen": -3.2485604286193848, - "logits/rejected": -3.245424270629883, - "logps/chosen": -145.904296875, - "logps/rejected": -158.60733032226562, - "loss": 0.6205, + "logits/chosen": -3.2467637062072754, + "logits/rejected": -3.2440028190612793, + "logps/chosen": -141.313232421875, + "logps/rejected": -154.34750366210938, + "loss": 0.6182, "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7720716595649719, - "rewards/margins": 0.1980876922607422, - "rewards/rejected": -0.9701593518257141, + "rewards/chosen": -0.7261611819267273, + "rewards/margins": 0.2013995349407196, + "rewards/rejected": -0.9275606274604797, "step": 1510 }, { "epoch": 0.26, - "grad_norm": 1.7890625, + "grad_norm": 1.6953125, "learning_rate": 4.611593119057047e-06, - "logits/chosen": -3.263840436935425, - "logits/rejected": -3.2636096477508545, - "logps/chosen": -144.23703002929688, - "logps/rejected": -151.71929931640625, - "loss": 0.6676, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.8146566152572632, - "rewards/margins": 0.10837093740701675, - "rewards/rejected": -0.9230276942253113, + "logits/chosen": -3.2586662769317627, + "logits/rejected": -3.258805751800537, + "logps/chosen": -140.69796752929688, + "logps/rejected": -148.06845092773438, + "loss": 0.667, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7792658805847168, + "rewards/margins": 0.10725333541631699, + "rewards/rejected": -0.8865191340446472, "step": 1520 }, { "epoch": 0.26, - "grad_norm": 2.015625, + "grad_norm": 2.171875, "learning_rate": 4.603503359566912e-06, - "logits/chosen": -3.2455577850341797, - "logits/rejected": -3.243157148361206, - "logps/chosen": -143.5106658935547, - "logps/rejected": -151.66490173339844, - "loss": 0.6637, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.8118158578872681, - "rewards/margins": 0.12848272919654846, - "rewards/rejected": -0.9402983784675598, + "logits/chosen": -3.236619472503662, + "logits/rejected": -3.23425030708313, + "logps/chosen": -139.57363891601562, + "logps/rejected": -148.51255798339844, + "loss": 0.6584, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7724454998970032, + "rewards/margins": 0.13632960617542267, + "rewards/rejected": -0.9087749719619751, "step": 1530 }, { "epoch": 0.27, - "grad_norm": 2.171875, + "grad_norm": 2.65625, "learning_rate": 4.595337467959046e-06, - "logits/chosen": -3.275472640991211, - "logits/rejected": -3.269941806793213, - "logps/chosen": -129.93222045898438, - "logps/rejected": -141.9746856689453, - "loss": 0.6267, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.6406632661819458, - "rewards/margins": 0.19468863308429718, - "rewards/rejected": -0.8353517651557922, + "logits/chosen": -3.266460418701172, + "logits/rejected": -3.261094570159912, + "logps/chosen": -130.5712890625, + "logps/rejected": -142.80465698242188, + "loss": 0.6258, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6470538377761841, + "rewards/margins": 0.1965980976819992, + "rewards/rejected": -0.8436519503593445, "step": 1540 }, { "epoch": 0.27, - "grad_norm": 1.921875, + "grad_norm": 2.125, "learning_rate": 4.587095739781645e-06, - "logits/chosen": -3.2549495697021484, - "logits/rejected": -3.2512078285217285, - "logps/chosen": -123.9298095703125, - "logps/rejected": -132.6895751953125, - "loss": 0.6375, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.5939527750015259, - "rewards/margins": 0.1560622900724411, - "rewards/rejected": -0.7500149607658386, + "logits/chosen": -3.246718645095825, + "logits/rejected": -3.242940902709961, + "logps/chosen": -126.24415588378906, + "logps/rejected": -134.73782348632812, + "loss": 0.6393, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6170962452888489, + "rewards/margins": 0.1534012109041214, + "rewards/rejected": -0.7704974412918091, "step": 1550 }, { "epoch": 0.27, - "grad_norm": 1.890625, + "grad_norm": 2.046875, "learning_rate": 4.578778473327659e-06, - "logits/chosen": -3.2367331981658936, - "logits/rejected": -3.2333521842956543, - "logps/chosen": -124.5437240600586, - "logps/rejected": -136.7928466796875, - "loss": 0.6426, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6369401216506958, - "rewards/margins": 0.14378319680690765, - "rewards/rejected": -0.7807233333587646, + "logits/chosen": -3.226879835128784, + "logits/rejected": -3.223694324493408, + "logps/chosen": -128.14688110351562, + "logps/rejected": -140.37246704101562, + "loss": 0.643, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6729716658592224, + "rewards/margins": 0.14354786276817322, + "rewards/rejected": -0.816519558429718, "step": 1560 }, { "epoch": 0.27, - "grad_norm": 2.25, + "grad_norm": 2.171875, "learning_rate": 4.570385969623993e-06, - "logits/chosen": -3.254424571990967, - "logits/rejected": -3.2544474601745605, - "logps/chosen": -132.67491149902344, - "logps/rejected": -152.40585327148438, - "loss": 0.6331, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6961439251899719, - "rewards/margins": 0.1778034120798111, - "rewards/rejected": -0.8739473223686218, + "logits/chosen": -3.250641345977783, + "logits/rejected": -3.2509448528289795, + "logps/chosen": -133.70693969726562, + "logps/rejected": -152.85145568847656, + "loss": 0.6358, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7064641118049622, + "rewards/margins": 0.17193913459777832, + "rewards/rejected": -0.8784033060073853, "step": 1570 }, { "epoch": 0.27, "grad_norm": 1.625, "learning_rate": 4.561918532420615e-06, - "logits/chosen": -3.2401375770568848, - "logits/rejected": -3.2378089427948, - "logps/chosen": -142.2194061279297, - "logps/rejected": -158.73133850097656, - "loss": 0.6443, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.8008275032043457, - "rewards/margins": 0.1721135675907135, - "rewards/rejected": -0.9729412198066711, + "logits/chosen": -3.24027681350708, + "logits/rejected": -3.238123655319214, + "logps/chosen": -140.45701599121094, + "logps/rejected": -155.67108154296875, + "loss": 0.6473, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7832036018371582, + "rewards/margins": 0.15913492441177368, + "rewards/rejected": -0.9423385858535767, "step": 1580 }, { "epoch": 0.27, - "grad_norm": 1.9921875, + "grad_norm": 1.96875, "learning_rate": 4.553376468179564e-06, - "logits/chosen": -3.2497572898864746, - "logits/rejected": -3.2477023601531982, - "logps/chosen": -139.50601196289062, - "logps/rejected": -152.8370819091797, - "loss": 0.65, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.7493128776550293, - "rewards/margins": 0.1411186009645462, - "rewards/rejected": -0.8904315233230591, + "logits/chosen": -3.2495689392089844, + "logits/rejected": -3.2477195262908936, + "logps/chosen": -136.52931213378906, + "logps/rejected": -150.0518035888672, + "loss": 0.6475, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7195461392402649, + "rewards/margins": 0.14303259551525116, + "rewards/rejected": -0.8625787496566772, "step": 1590 }, { "epoch": 0.28, - "grad_norm": 1.6640625, + "grad_norm": 1.7890625, "learning_rate": 4.544760086063856e-06, - "logits/chosen": -3.2545619010925293, - "logits/rejected": -3.2462317943573, - "logps/chosen": -132.92030334472656, - "logps/rejected": -144.53115844726562, - "loss": 0.6395, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6942413449287415, - "rewards/margins": 0.1516888588666916, - "rewards/rejected": -0.845930278301239, + "logits/chosen": -3.2430386543273926, + "logits/rejected": -3.2349789142608643, + "logps/chosen": -135.8370361328125, + "logps/rejected": -147.910400390625, + "loss": 0.6383, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7234088182449341, + "rewards/margins": 0.15631389617919922, + "rewards/rejected": -0.8797225952148438, "step": 1600 }, { "epoch": 0.28, - "eval_logits/chosen": -3.252528190612793, - "eval_logits/rejected": -3.250530958175659, - "eval_logps/chosen": -126.27056121826172, - "eval_logps/rejected": -141.61703491210938, - "eval_loss": 0.6536267399787903, - "eval_rewards/accuracies": 0.6154739856719971, - "eval_rewards/chosen": -0.5486056208610535, - "eval_rewards/margins": 0.11649421602487564, - "eval_rewards/rejected": -0.6650997996330261, - "eval_runtime": 483.7907, - "eval_samples_per_second": 8.896, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.233834743499756, + "eval_logits/rejected": -3.231811046600342, + "eval_logps/chosen": -130.09262084960938, + "eval_logps/rejected": -145.6309356689453, + "eval_loss": 0.6538301706314087, + "eval_rewards/accuracies": 0.6177973747253418, + "eval_rewards/chosen": -0.5868260264396667, + "eval_rewards/margins": 0.11841286718845367, + "eval_rewards/rejected": -0.7052388787269592, + "eval_runtime": 484.6899, + "eval_samples_per_second": 8.88, + "eval_steps_per_second": 1.11, "step": 1600 }, { "epoch": 0.28, - "grad_norm": 1.53125, + "grad_norm": 1.578125, "learning_rate": 4.536069697926291e-06, - "logits/chosen": -3.2715904712677, - "logits/rejected": -3.268829345703125, - "logps/chosen": -142.8290557861328, - "logps/rejected": -159.7598419189453, - "loss": 0.6385, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7724953293800354, - "rewards/margins": 0.1773601919412613, - "rewards/rejected": -0.9498555064201355, + "logits/chosen": -3.2569375038146973, + "logits/rejected": -3.254868984222412, + "logps/chosen": -145.26028442382812, + "logps/rejected": -162.26365661621094, + "loss": 0.638, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7968076467514038, + "rewards/margins": 0.1780860722064972, + "rewards/rejected": -0.9748937487602234, "step": 1610 }, { "epoch": 0.28, - "grad_norm": 2.640625, + "grad_norm": 2.578125, "learning_rate": 4.527305618298173e-06, - "logits/chosen": -3.2432327270507812, - "logits/rejected": -3.238971710205078, - "logps/chosen": -154.33795166015625, - "logps/rejected": -174.77117919921875, - "loss": 0.6388, + "logits/chosen": -3.2288143634796143, + "logits/rejected": -3.2248358726501465, + "logps/chosen": -153.36053466796875, + "logps/rejected": -174.30075073242188, + "loss": 0.6367, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.9221574664115906, - "rewards/margins": 0.2066594660282135, - "rewards/rejected": -1.128816843032837, + "rewards/chosen": -0.9123834371566772, + "rewards/margins": 0.2117292433977127, + "rewards/rejected": -1.1241127252578735, "step": 1620 }, { "epoch": 0.28, - "grad_norm": 2.125, + "grad_norm": 1.765625, "learning_rate": 4.518468164377923e-06, - "logits/chosen": -3.2425739765167236, - "logits/rejected": -3.236532211303711, - "logps/chosen": -153.01971435546875, - "logps/rejected": -165.77304077148438, - "loss": 0.6226, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.8580877184867859, - "rewards/margins": 0.2107679396867752, - "rewards/rejected": -1.0688556432724, + "logits/chosen": -3.23083758354187, + "logits/rejected": -3.224977493286133, + "logps/chosen": -150.68533325195312, + "logps/rejected": -162.896728515625, + "loss": 0.6251, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8347437977790833, + "rewards/margins": 0.20534896850585938, + "rewards/rejected": -1.0400927066802979, "step": 1630 }, { "epoch": 0.28, - "grad_norm": 1.71875, + "grad_norm": 1.75, "learning_rate": 4.5095576560195975e-06, - "logits/chosen": -3.2647926807403564, - "logits/rejected": -3.2611937522888184, - "logps/chosen": -146.43026733398438, - "logps/rejected": -149.50869750976562, - "loss": 0.6614, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.7866324186325073, - "rewards/margins": 0.12221284210681915, - "rewards/rejected": -0.9088452458381653, + "logits/chosen": -3.257983446121216, + "logits/rejected": -3.2549941539764404, + "logps/chosen": -143.98660278320312, + "logps/rejected": -147.51876831054688, + "loss": 0.6591, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7621957063674927, + "rewards/margins": 0.1267503798007965, + "rewards/rejected": -0.8889460563659668, "step": 1640 }, { "epoch": 0.28, - "grad_norm": 2.03125, + "grad_norm": 1.828125, "learning_rate": 4.500574415721311e-06, - "logits/chosen": -3.27087664604187, - "logits/rejected": -3.2663211822509766, - "logps/chosen": -131.85440063476562, - "logps/rejected": -147.97909545898438, - "loss": 0.6225, + "logits/chosen": -3.2671570777893066, + "logits/rejected": -3.2629342079162598, + "logps/chosen": -130.56808471679688, + "logps/rejected": -147.02005004882812, + "loss": 0.6212, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6372531652450562, - "rewards/margins": 0.20080938935279846, - "rewards/rejected": -0.838062584400177, + "rewards/chosen": -0.6243900656700134, + "rewards/margins": 0.20408205687999725, + "rewards/rejected": -0.8284721374511719, "step": 1650 }, { "epoch": 0.29, - "grad_norm": 1.4140625, + "grad_norm": 1.453125, "learning_rate": 4.491518768613569e-06, - "logits/chosen": -3.263221025466919, - "logits/rejected": -3.2580618858337402, - "logps/chosen": -135.19631958007812, - "logps/rejected": -139.4954071044922, - "loss": 0.6525, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6873170137405396, - "rewards/margins": 0.13492907583713531, - "rewards/rejected": -0.8222460746765137, + "logits/chosen": -3.263577699661255, + "logits/rejected": -3.2591965198516846, + "logps/chosen": -132.37673950195312, + "logps/rejected": -136.60537719726562, + "loss": 0.6528, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6591209769248962, + "rewards/margins": 0.1342247724533081, + "rewards/rejected": -0.7933458089828491, "step": 1660 }, { "epoch": 0.29, - "grad_norm": 1.3828125, + "grad_norm": 1.3359375, "learning_rate": 4.482391042447497e-06, - "logits/chosen": -3.263942003250122, - "logits/rejected": -3.2601189613342285, - "logps/chosen": -129.59695434570312, - "logps/rejected": -143.82839965820312, - "loss": 0.6372, + "logits/chosen": -3.268989086151123, + "logits/rejected": -3.265378475189209, + "logps/chosen": -124.76434326171875, + "logps/rejected": -138.3180389404297, + "loss": 0.6385, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6334278583526611, - "rewards/margins": 0.16828377544879913, - "rewards/rejected": -0.8017115592956543, + "rewards/chosen": -0.5851017236709595, + "rewards/margins": 0.16150644421577454, + "rewards/rejected": -0.7466081380844116, "step": 1670 }, { "epoch": 0.29, - "grad_norm": 1.9921875, + "grad_norm": 1.609375, "learning_rate": 4.473191567582975e-06, - "logits/chosen": -3.239767074584961, - "logits/rejected": -3.2362465858459473, - "logps/chosen": -133.9244842529297, - "logps/rejected": -149.71267700195312, - "loss": 0.6279, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.67822265625, - "rewards/margins": 0.19946426153182983, - "rewards/rejected": -0.8776868581771851, + "logits/chosen": -3.249393939971924, + "logits/rejected": -3.246347427368164, + "logps/chosen": -127.35566711425781, + "logps/rejected": -143.05728149414062, + "loss": 0.6265, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6125344634056091, + "rewards/margins": 0.19859835505485535, + "rewards/rejected": -0.8111329078674316, "step": 1680 }, { "epoch": 0.29, - "grad_norm": 1.8359375, + "grad_norm": 1.921875, "learning_rate": 4.46392067697669e-06, - "logits/chosen": -3.2343764305114746, - "logits/rejected": -3.2323060035705566, - "logps/chosen": -127.72633361816406, - "logps/rejected": -136.99798583984375, - "loss": 0.6509, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6233143210411072, - "rewards/margins": 0.14099445939064026, - "rewards/rejected": -0.7643088102340698, + "logits/chosen": -3.245945692062378, + "logits/rejected": -3.244025468826294, + "logps/chosen": -123.40696716308594, + "logps/rejected": -132.4766082763672, + "loss": 0.6511, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5801206827163696, + "rewards/margins": 0.13897430896759033, + "rewards/rejected": -0.7190949320793152, "step": 1690 }, { "epoch": 0.29, - "grad_norm": 2.15625, + "grad_norm": 1.78125, "learning_rate": 4.454578706170075e-06, - "logits/chosen": -3.202291488647461, - "logits/rejected": -3.204252243041992, - "logps/chosen": -124.95072937011719, - "logps/rejected": -136.50686645507812, - "loss": 0.678, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.647601306438446, - "rewards/margins": 0.07817380130290985, - "rewards/rejected": -0.7257751226425171, + "logits/chosen": -3.2143101692199707, + "logits/rejected": -3.2159550189971924, + "logps/chosen": -124.13203430175781, + "logps/rejected": -136.1236114501953, + "loss": 0.6775, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.6394143104553223, + "rewards/margins": 0.08252833038568497, + "rewards/rejected": -0.7219426035881042, "step": 1700 }, { "epoch": 0.29, - "eval_logits/chosen": -3.2125089168548584, - "eval_logits/rejected": -3.2106754779815674, - "eval_logps/chosen": -117.87283325195312, - "eval_logps/rejected": -131.22845458984375, - "eval_loss": 0.6587392687797546, - "eval_rewards/accuracies": 0.6168680191040039, - "eval_rewards/chosen": -0.464628130197525, - "eval_rewards/margins": 0.09658578038215637, - "eval_rewards/rejected": -0.5612139701843262, - "eval_runtime": 484.1087, - "eval_samples_per_second": 8.891, + "eval_logits/chosen": -3.2211709022521973, + "eval_logits/rejected": -3.219395637512207, + "eval_logps/chosen": -118.28197479248047, + "eval_logps/rejected": -132.27479553222656, + "eval_loss": 0.6568416357040405, + "eval_rewards/accuracies": 0.6173326969146729, + "eval_rewards/chosen": -0.46871957182884216, + "eval_rewards/margins": 0.10295785218477249, + "eval_rewards/rejected": -0.5716773867607117, + "eval_runtime": 484.3291, + "eval_samples_per_second": 8.887, "eval_steps_per_second": 1.111, "step": 1700 }, { "epoch": 0.29, - "grad_norm": 2.40625, + "grad_norm": 2.140625, "learning_rate": 4.445165993277171e-06, - "logits/chosen": -3.2155022621154785, - "logits/rejected": -3.2090964317321777, - "logps/chosen": -130.38784790039062, - "logps/rejected": -139.3701629638672, - "loss": 0.6437, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6083312630653381, - "rewards/margins": 0.1592980921268463, - "rewards/rejected": -0.7676293253898621, + "logits/chosen": -3.222552537918091, + "logits/rejected": -3.2166829109191895, + "logps/chosen": -132.51321411132812, + "logps/rejected": -142.26742553710938, + "loss": 0.6422, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6295849084854126, + "rewards/margins": 0.16701695322990417, + "rewards/rejected": -0.7966018915176392, "step": 1710 }, { "epoch": 0.3, - "grad_norm": 2.328125, + "grad_norm": 2.078125, "learning_rate": 4.435682878972389e-06, - "logits/chosen": -3.2114365100860596, - "logits/rejected": -3.206124782562256, - "logps/chosen": -134.76806640625, - "logps/rejected": -137.71871948242188, - "loss": 0.6645, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.6806883811950684, - "rewards/margins": 0.10775679349899292, - "rewards/rejected": -0.7884451746940613, + "logits/chosen": -3.2195911407470703, + "logits/rejected": -3.214392900466919, + "logps/chosen": -137.64071655273438, + "logps/rejected": -141.55099487304688, + "loss": 0.6611, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7094148397445679, + "rewards/margins": 0.11735300719738007, + "rewards/rejected": -0.8267678022384644, "step": 1720 }, { "epoch": 0.3, - "grad_norm": 2.515625, + "grad_norm": 2.578125, "learning_rate": 4.426129706478178e-06, - "logits/chosen": -3.191399574279785, - "logits/rejected": -3.1889612674713135, - "logps/chosen": -136.0282745361328, - "logps/rejected": -142.6835174560547, - "loss": 0.6721, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.7149096131324768, - "rewards/margins": 0.10171695053577423, - "rewards/rejected": -0.8166265487670898, + "logits/chosen": -3.2044854164123535, + "logits/rejected": -3.202407121658325, + "logps/chosen": -136.70947265625, + "logps/rejected": -143.6942901611328, + "loss": 0.6724, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7217217683792114, + "rewards/margins": 0.10501249134540558, + "rewards/rejected": -0.8267343640327454, "step": 1730 }, { "epoch": 0.3, - "grad_norm": 1.6484375, + "grad_norm": 1.546875, "learning_rate": 4.416506821552603e-06, - "logits/chosen": -3.1809914112091064, - "logits/rejected": -3.1785573959350586, - "logps/chosen": -129.92225646972656, - "logps/rejected": -149.59298706054688, - "loss": 0.6364, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.6773825883865356, - "rewards/margins": 0.19856294989585876, - "rewards/rejected": -0.8759455680847168, + "logits/chosen": -3.2000412940979004, + "logits/rejected": -3.197852849960327, + "logps/chosen": -129.83609008789062, + "logps/rejected": -149.46237182617188, + "loss": 0.636, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6765210032463074, + "rewards/margins": 0.1981184333562851, + "rewards/rejected": -0.8746395111083984, "step": 1740 }, { "epoch": 0.3, - "grad_norm": 1.96875, + "grad_norm": 1.890625, "learning_rate": 4.406814572476833e-06, - "logits/chosen": -3.1837871074676514, - "logits/rejected": -3.1800642013549805, - "logps/chosen": -128.7941436767578, - "logps/rejected": -145.5009765625, - "loss": 0.6338, + "logits/chosen": -3.2070159912109375, + "logits/rejected": -3.2036139965057373, + "logps/chosen": -127.25260925292969, + "logps/rejected": -143.64280700683594, + "loss": 0.6352, "rewards/accuracies": 0.625, - "rewards/chosen": -0.6254864931106567, - "rewards/margins": 0.17526546120643616, - "rewards/rejected": -0.8007518649101257, + "rewards/chosen": -0.610071063041687, + "rewards/margins": 0.17209911346435547, + "rewards/rejected": -0.7821701765060425, "step": 1750 }, { "epoch": 0.3, - "grad_norm": 2.046875, + "grad_norm": 2.171875, "learning_rate": 4.397053310042533e-06, - "logits/chosen": -3.178910493850708, - "logits/rejected": -3.1739954948425293, - "logps/chosen": -136.34449768066406, - "logps/rejected": -150.8899688720703, - "loss": 0.627, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.6980650424957275, - "rewards/margins": 0.2017349898815155, - "rewards/rejected": -0.8998001217842102, + "logits/chosen": -3.204622268676758, + "logits/rejected": -3.200345993041992, + "logps/chosen": -132.13955688476562, + "logps/rejected": -146.16519165039062, + "loss": 0.6286, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6560156345367432, + "rewards/margins": 0.1965368688106537, + "rewards/rejected": -0.852552592754364, "step": 1760 }, { "epoch": 0.3, - "grad_norm": 1.9140625, + "grad_norm": 1.7421875, "learning_rate": 4.3872233875391715e-06, - "logits/chosen": -3.1531150341033936, - "logits/rejected": -3.1482579708099365, - "logps/chosen": -144.26266479492188, - "logps/rejected": -154.70135498046875, - "loss": 0.6282, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7813176512718201, - "rewards/margins": 0.20064513385295868, - "rewards/rejected": -0.9819628596305847, + "logits/chosen": -3.1852593421936035, + "logits/rejected": -3.180729627609253, + "logps/chosen": -139.60780334472656, + "logps/rejected": -150.09140014648438, + "loss": 0.6271, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7347689867019653, + "rewards/margins": 0.2010943442583084, + "rewards/rejected": -0.9358633160591125, "step": 1770 }, { "epoch": 0.31, - "grad_norm": 2.328125, + "grad_norm": 2.15625, "learning_rate": 4.3773251607412294e-06, - "logits/chosen": -3.1487860679626465, - "logits/rejected": -3.147592782974243, - "logps/chosen": -138.44879150390625, - "logps/rejected": -167.2222442626953, - "loss": 0.6012, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7416083216667175, - "rewards/margins": 0.2687363028526306, - "rewards/rejected": -1.0103447437286377, + "logits/chosen": -3.1705119609832764, + "logits/rejected": -3.169437885284424, + "logps/chosen": -136.10638427734375, + "logps/rejected": -164.87498474121094, + "loss": 0.6009, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7181843519210815, + "rewards/margins": 0.2686876654624939, + "rewards/rejected": -0.9868720769882202, "step": 1780 }, { "epoch": 0.31, - "grad_norm": 1.8515625, + "grad_norm": 1.8828125, "learning_rate": 4.367358987895327e-06, - "logits/chosen": -3.1224420070648193, - "logits/rejected": -3.1191248893737793, - "logps/chosen": -145.96359252929688, - "logps/rejected": -164.42001342773438, - "loss": 0.6217, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.8266698122024536, - "rewards/margins": 0.2106233835220337, - "rewards/rejected": -1.0372931957244873, + "logits/chosen": -3.1425235271453857, + "logits/rejected": -3.1398634910583496, + "logps/chosen": -144.38807678222656, + "logps/rejected": -163.69662475585938, + "loss": 0.6184, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8109146952629089, + "rewards/margins": 0.21914473176002502, + "rewards/rejected": -1.0300594568252563, "step": 1790 }, { "epoch": 0.31, - "grad_norm": 2.46875, + "grad_norm": 2.375, "learning_rate": 4.3573252297072544e-06, - "logits/chosen": -3.1158227920532227, - "logits/rejected": -3.111827850341797, - "logps/chosen": -153.73141479492188, - "logps/rejected": -172.12696838378906, - "loss": 0.629, + "logits/chosen": -3.1346559524536133, + "logits/rejected": -3.1308445930480957, + "logps/chosen": -151.2903594970703, + "logps/rejected": -169.28575134277344, + "loss": 0.6312, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.9160531163215637, - "rewards/margins": 0.20312508940696716, - "rewards/rejected": -1.1191781759262085, + "rewards/chosen": -0.8916425704956055, + "rewards/margins": 0.19912366569042206, + "rewards/rejected": -1.090766191482544, "step": 1800 }, { "epoch": 0.31, - "eval_logits/chosen": -3.1113333702087402, - "eval_logits/rejected": -3.1087486743927, - "eval_logps/chosen": -146.88597106933594, - "eval_logps/rejected": -164.902587890625, - "eval_loss": 0.6488604545593262, - "eval_rewards/accuracies": 0.6187267899513245, - "eval_rewards/chosen": -0.7547595500946045, - "eval_rewards/margins": 0.14319583773612976, - "eval_rewards/rejected": -0.8979554176330566, - "eval_runtime": 483.4992, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 1.113, + "eval_logits/chosen": -3.1236960887908936, + "eval_logits/rejected": -3.1213295459747314, + "eval_logps/chosen": -143.4406280517578, + "eval_logps/rejected": -161.27671813964844, + "eval_loss": 0.6496570706367493, + "eval_rewards/accuracies": 0.6110594868659973, + "eval_rewards/chosen": -0.7203060388565063, + "eval_rewards/margins": 0.14139072597026825, + "eval_rewards/rejected": -0.861696720123291, + "eval_runtime": 483.6469, + "eval_samples_per_second": 8.899, + "eval_steps_per_second": 1.112, "step": 1800 }, { "epoch": 0.31, - "grad_norm": 2.1875, + "grad_norm": 2.40625, "learning_rate": 4.347224249328922e-06, - "logits/chosen": -3.119220018386841, - "logits/rejected": -3.1162116527557373, - "logps/chosen": -164.25729370117188, - "logps/rejected": -172.886962890625, - "loss": 0.6747, + "logits/chosen": -3.1321170330047607, + "logits/rejected": -3.1298587322235107, + "logps/chosen": -162.84423828125, + "logps/rejected": -170.39773559570312, + "loss": 0.6818, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.96868497133255, - "rewards/margins": 0.13878265023231506, - "rewards/rejected": -1.107467532157898, + "rewards/chosen": -0.9545540809631348, + "rewards/margins": 0.1280214488506317, + "rewards/rejected": -1.0825755596160889, "step": 1810 }, { "epoch": 0.31, - "grad_norm": 2.484375, + "grad_norm": 2.6875, "learning_rate": 4.337056412345209e-06, - "logits/chosen": -3.1267404556274414, - "logits/rejected": -3.121772527694702, - "logps/chosen": -152.7144775390625, - "logps/rejected": -158.19595336914062, + "logits/chosen": -3.134734630584717, + "logits/rejected": -3.130065679550171, + "logps/chosen": -152.81954956054688, + "logps/rejected": -158.31002807617188, "loss": 0.6566, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.8691015243530273, - "rewards/margins": 0.1269044280052185, - "rewards/rejected": -0.9960060119628906, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.8701522946357727, + "rewards/margins": 0.1269942671060562, + "rewards/rejected": -0.997146487236023, "step": 1820 }, { "epoch": 0.32, - "grad_norm": 1.9765625, + "grad_norm": 1.921875, "learning_rate": 4.326822086760743e-06, - "logits/chosen": -3.1357274055480957, - "logits/rejected": -3.128983497619629, - "logps/chosen": -140.71310424804688, - "logps/rejected": -157.28184509277344, - "loss": 0.6349, + "logits/chosen": -3.147284746170044, + "logits/rejected": -3.1407833099365234, + "logps/chosen": -138.86541748046875, + "logps/rejected": -155.19522094726562, + "loss": 0.6355, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.7482028007507324, - "rewards/margins": 0.19133971631526947, - "rewards/rejected": -0.9395424723625183, + "rewards/chosen": -0.7297259569168091, + "rewards/margins": 0.18895025551319122, + "rewards/rejected": -0.9186761975288391, "step": 1830 }, { "epoch": 0.32, - "grad_norm": 2.265625, + "grad_norm": 2.046875, "learning_rate": 4.316521642986566e-06, - "logits/chosen": -3.1679399013519287, - "logits/rejected": -3.1663129329681396, - "logps/chosen": -146.0639190673828, - "logps/rejected": -159.45327758789062, - "loss": 0.6507, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.8074442148208618, - "rewards/margins": 0.1614806205034256, - "rewards/rejected": -0.9689248204231262, + "logits/chosen": -3.1870529651641846, + "logits/rejected": -3.185727596282959, + "logps/chosen": -143.051513671875, + "logps/rejected": -155.83590698242188, + "loss": 0.6529, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7773202657699585, + "rewards/margins": 0.15543068945407867, + "rewards/rejected": -0.9327509999275208, "step": 1840 }, { "epoch": 0.32, - "grad_norm": 1.984375, + "grad_norm": 2.0625, "learning_rate": 4.3061554538267444e-06, - "logits/chosen": -3.163782835006714, - "logits/rejected": -3.1629064083099365, - "logps/chosen": -135.1237030029297, - "logps/rejected": -145.4768524169922, - "loss": 0.6644, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.7083430290222168, - "rewards/margins": 0.12782330811023712, - "rewards/rejected": -0.8361663818359375, + "logits/chosen": -3.1828761100769043, + "logits/rejected": -3.182145595550537, + "logps/chosen": -132.54562377929688, + "logps/rejected": -142.9228057861328, + "loss": 0.6637, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6825622916221619, + "rewards/margins": 0.12806358933448792, + "rewards/rejected": -0.8106260299682617, "step": 1850 }, { "epoch": 0.32, - "grad_norm": 1.7734375, + "grad_norm": 1.796875, "learning_rate": 4.295723894464862e-06, - "logits/chosen": -3.167217969894409, - "logits/rejected": -3.1629233360290527, - "logps/chosen": -129.897216796875, - "logps/rejected": -137.09112548828125, - "loss": 0.6383, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.6009452939033508, - "rewards/margins": 0.15543445944786072, - "rewards/rejected": -0.7563797235488892, + "logits/chosen": -3.1849961280822754, + "logits/rejected": -3.1808555126190186, + "logps/chosen": -128.9047393798828, + "logps/rejected": -136.94223022460938, + "loss": 0.6357, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5910203456878662, + "rewards/margins": 0.16387054324150085, + "rewards/rejected": -0.7548909187316895, "step": 1860 }, { "epoch": 0.32, "grad_norm": 1.7109375, "learning_rate": 4.285227342450449e-06, - "logits/chosen": -3.169278621673584, - "logits/rejected": -3.1676392555236816, - "logps/chosen": -123.1519546508789, - "logps/rejected": -136.8570098876953, - "loss": 0.6279, + "logits/chosen": -3.1886954307556152, + "logits/rejected": -3.187377691268921, + "logps/chosen": -124.11856842041016, + "logps/rejected": -138.12887573242188, + "loss": 0.6282, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5699488520622253, - "rewards/margins": 0.19082696735858917, - "rewards/rejected": -0.7607758045196533, + "rewards/chosen": -0.57961505651474, + "rewards/margins": 0.19387942552566528, + "rewards/rejected": -0.7734946012496948, "step": 1870 }, { "epoch": 0.32, - "grad_norm": 1.7578125, + "grad_norm": 1.78125, "learning_rate": 4.274666177685317e-06, - "logits/chosen": -3.168717622756958, - "logits/rejected": -3.162923574447632, - "logps/chosen": -125.75395202636719, - "logps/rejected": -139.8086395263672, - "loss": 0.6249, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.6144317984580994, - "rewards/margins": 0.192767933011055, - "rewards/rejected": -0.8071997761726379, + "logits/chosen": -3.187439203262329, + "logits/rejected": -3.1822094917297363, + "logps/chosen": -127.79396057128906, + "logps/rejected": -141.5155029296875, + "loss": 0.6277, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6348319053649902, + "rewards/margins": 0.1894364058971405, + "rewards/rejected": -0.8242682218551636, "step": 1880 }, { "epoch": 0.33, - "grad_norm": 2.125, + "grad_norm": 2.0, "learning_rate": 4.264040782409804e-06, - "logits/chosen": -3.169412612915039, - "logits/rejected": -3.1647706031799316, - "logps/chosen": -125.84661865234375, - "logps/rejected": -145.9022216796875, - "loss": 0.6215, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6267572641372681, - "rewards/margins": 0.21151788532733917, - "rewards/rejected": -0.838275134563446, + "logits/chosen": -3.1903483867645264, + "logits/rejected": -3.1858248710632324, + "logps/chosen": -125.4483413696289, + "logps/rejected": -145.291748046875, + "loss": 0.6229, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6227745413780212, + "rewards/margins": 0.20939584076404572, + "rewards/rejected": -0.8321703672409058, "step": 1890 }, { "epoch": 0.33, - "grad_norm": 1.9453125, + "grad_norm": 1.796875, "learning_rate": 4.253351541188947e-06, - "logits/chosen": -3.1573798656463623, - "logits/rejected": -3.154900074005127, - "logps/chosen": -139.20106506347656, - "logps/rejected": -142.49725341796875, - "loss": 0.6622, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7105225324630737, - "rewards/margins": 0.11785916239023209, - "rewards/rejected": -0.8283816576004028, + "logits/chosen": -3.1822569370269775, + "logits/rejected": -3.180145740509033, + "logps/chosen": -137.6725616455078, + "logps/rejected": -140.46498107910156, + "loss": 0.665, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6952374577522278, + "rewards/margins": 0.11282126605510712, + "rewards/rejected": -0.8080587387084961, "step": 1900 }, { "epoch": 0.33, - "eval_logits/chosen": -3.141881227493286, - "eval_logits/rejected": -3.1398515701293945, - "eval_logps/chosen": -125.99919891357422, - "eval_logps/rejected": -140.6699981689453, - "eval_loss": 0.6555050015449524, - "eval_rewards/accuracies": 0.606877326965332, - "eval_rewards/chosen": -0.5458918213844299, - "eval_rewards/margins": 0.10973773896694183, - "eval_rewards/rejected": -0.655629575252533, - "eval_runtime": 483.8573, - "eval_samples_per_second": 8.895, + "eval_logits/chosen": -3.1679985523223877, + "eval_logits/rejected": -3.165999412536621, + "eval_logps/chosen": -123.16142272949219, + "eval_logps/rejected": -137.88668823242188, + "eval_loss": 0.6550623178482056, + "eval_rewards/accuracies": 0.613382875919342, + "eval_rewards/chosen": -0.517514169216156, + "eval_rewards/margins": 0.1102820485830307, + "eval_rewards/rejected": -0.6277962923049927, + "eval_runtime": 483.7577, + "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 1900 }, { "epoch": 0.33, - "grad_norm": 2.109375, + "grad_norm": 1.96875, "learning_rate": 4.242598840898558e-06, - "logits/chosen": -3.1242589950561523, - "logits/rejected": -3.121159791946411, - "logps/chosen": -140.1244659423828, - "logps/rejected": -156.30972290039062, - "loss": 0.6312, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7592523694038391, - "rewards/margins": 0.1901467740535736, - "rewards/rejected": -0.9493991732597351, + "logits/chosen": -3.1519510746002197, + "logits/rejected": -3.148864507675171, + "logps/chosen": -137.3428192138672, + "logps/rejected": -153.29315185546875, + "loss": 0.6314, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7314358949661255, + "rewards/margins": 0.18779778480529785, + "rewards/rejected": -0.9192337989807129, "step": 1910 }, { "epoch": 0.33, - "grad_norm": 2.328125, + "grad_norm": 2.109375, "learning_rate": 4.231783070711223e-06, - "logits/chosen": -3.15392804145813, - "logits/rejected": -3.1520493030548096, - "logps/chosen": -145.32569885253906, - "logps/rejected": -157.24732971191406, - "loss": 0.6421, + "logits/chosen": -3.1809704303741455, + "logits/rejected": -3.1791326999664307, + "logps/chosen": -143.34425354003906, + "logps/rejected": -155.4877471923828, + "loss": 0.6395, "rewards/accuracies": 0.625, - "rewards/chosen": -0.7951496243476868, - "rewards/margins": 0.16277767717838287, - "rewards/rejected": -0.9579272270202637, + "rewards/chosen": -0.7753351330757141, + "rewards/margins": 0.16499650478363037, + "rewards/rejected": -0.9403316378593445, "step": 1920 }, { "epoch": 0.33, - "grad_norm": 2.1875, + "grad_norm": 2.234375, "learning_rate": 4.22090462208222e-06, - "logits/chosen": -3.1390862464904785, - "logits/rejected": -3.134338617324829, - "logps/chosen": -148.74644470214844, - "logps/rejected": -165.2567596435547, - "loss": 0.622, + "logits/chosen": -3.1596155166625977, + "logits/rejected": -3.1550707817077637, + "logps/chosen": -147.46380615234375, + "logps/rejected": -163.8795623779297, + "loss": 0.6219, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8294029235839844, - "rewards/margins": 0.2131255865097046, - "rewards/rejected": -1.042528510093689, + "rewards/chosen": -0.8165766596794128, + "rewards/margins": 0.21217963099479675, + "rewards/rejected": -1.0287562608718872, "step": 1930 }, { "epoch": 0.33, - "grad_norm": 2.046875, + "grad_norm": 2.109375, "learning_rate": 4.209963888735346e-06, - "logits/chosen": -3.169201374053955, - "logits/rejected": -3.1704461574554443, - "logps/chosen": -141.8172607421875, - "logps/rejected": -159.37860107421875, + "logits/chosen": -3.178684949874878, + "logits/rejected": -3.1801915168762207, + "logps/chosen": -141.09107971191406, + "logps/rejected": -158.66387939453125, "loss": 0.628, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.8085492849349976, - "rewards/margins": 0.20195230841636658, - "rewards/rejected": -1.0105016231536865, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8012874722480774, + "rewards/margins": 0.20206686854362488, + "rewards/rejected": -1.0033543109893799, "step": 1940 }, { "epoch": 0.34, - "grad_norm": 2.453125, + "grad_norm": 2.34375, "learning_rate": 4.198961266648671e-06, - "logits/chosen": -3.156470537185669, - "logits/rejected": -3.1528687477111816, - "logps/chosen": -154.35397338867188, - "logps/rejected": -175.27505493164062, - "loss": 0.6384, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.9214574098587036, - "rewards/margins": 0.19601932168006897, - "rewards/rejected": -1.1174767017364502, + "logits/chosen": -3.1610703468322754, + "logits/rejected": -3.157761335372925, + "logps/chosen": -152.76663208007812, + "logps/rejected": -173.17831420898438, + "loss": 0.6416, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9055836796760559, + "rewards/margins": 0.19092567265033722, + "rewards/rejected": -1.096509337425232, "step": 1950 }, { "epoch": 0.34, - "grad_norm": 2.5625, + "grad_norm": 2.578125, "learning_rate": 4.187897154040205e-06, - "logits/chosen": -3.1518867015838623, - "logits/rejected": -3.1515769958496094, - "logps/chosen": -159.7874298095703, - "logps/rejected": -172.596923828125, - "loss": 0.6478, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.9553605318069458, - "rewards/margins": 0.1520630419254303, - "rewards/rejected": -1.1074237823486328, + "logits/chosen": -3.15240740776062, + "logits/rejected": -3.1524808406829834, + "logps/chosen": -156.12435913085938, + "logps/rejected": -169.71414184570312, + "loss": 0.6442, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9187299013137817, + "rewards/margins": 0.15986597537994385, + "rewards/rejected": -1.0785958766937256, "step": 1960 }, { "epoch": 0.34, - "grad_norm": 2.015625, + "grad_norm": 1.890625, "learning_rate": 4.176771951353481e-06, - "logits/chosen": -3.169792652130127, - "logits/rejected": -3.165846347808838, - "logps/chosen": -151.9120330810547, - "logps/rejected": -157.11441040039062, - "loss": 0.6641, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.8450883626937866, - "rewards/margins": 0.11743706464767456, - "rewards/rejected": -0.9625255465507507, + "logits/chosen": -3.1625571250915527, + "logits/rejected": -3.158508777618408, + "logps/chosen": -147.187744140625, + "logps/rejected": -153.29031372070312, + "loss": 0.6603, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7978456616401672, + "rewards/margins": 0.12643900513648987, + "rewards/rejected": -0.9242845773696899, "step": 1970 }, { "epoch": 0.34, - "grad_norm": 2.0625, + "grad_norm": 1.9609375, "learning_rate": 4.165586061243074e-06, - "logits/chosen": -3.1747801303863525, - "logits/rejected": -3.170104503631592, - "logps/chosen": -153.86459350585938, - "logps/rejected": -167.02542114257812, - "loss": 0.6464, + "logits/chosen": -3.160752058029175, + "logits/rejected": -3.156141757965088, + "logps/chosen": -148.16342163085938, + "logps/rejected": -160.54156494140625, + "loss": 0.6501, "rewards/accuracies": 0.65625, - "rewards/chosen": -0.8814204335212708, - "rewards/margins": 0.16699686646461487, - "rewards/rejected": -1.048417329788208, + "rewards/chosen": -0.8244088888168335, + "rewards/margins": 0.15916991233825684, + "rewards/rejected": -0.9835788607597351, "step": 1980 }, { "epoch": 0.34, - "grad_norm": 1.6796875, + "grad_norm": 1.6328125, "learning_rate": 4.154339888560008e-06, - "logits/chosen": -3.1978580951690674, - "logits/rejected": -3.195103645324707, - "logps/chosen": -152.34295654296875, - "logps/rejected": -165.3287811279297, - "loss": 0.6294, + "logits/chosen": -3.1824774742126465, + "logits/rejected": -3.1802866458892822, + "logps/chosen": -145.84805297851562, + "logps/rejected": -158.97315979003906, + "loss": 0.6279, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.9096066355705261, - "rewards/margins": 0.1885077953338623, - "rewards/rejected": -1.0981144905090332, + "rewards/chosen": -0.8446575999259949, + "rewards/margins": 0.1899007260799408, + "rewards/rejected": -1.0345582962036133, "step": 1990 }, { "epoch": 0.34, - "grad_norm": 2.140625, + "grad_norm": 1.9921875, "learning_rate": 4.1430338403371275e-06, - "logits/chosen": -3.1856448650360107, - "logits/rejected": -3.186370372772217, - "logps/chosen": -153.1247100830078, - "logps/rejected": -176.1088409423828, - "loss": 0.64, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9345780611038208, - "rewards/margins": 0.1859598159790039, - "rewards/rejected": -1.1205378770828247, + "logits/chosen": -3.177934408187866, + "logits/rejected": -3.179072856903076, + "logps/chosen": -146.23016357421875, + "logps/rejected": -168.99826049804688, + "loss": 0.6385, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8656326532363892, + "rewards/margins": 0.18379953503608704, + "rewards/rejected": -1.0494321584701538, "step": 2000 }, { "epoch": 0.34, - "eval_logits/chosen": -3.1846792697906494, - "eval_logits/rejected": -3.182372570037842, - "eval_logps/chosen": -140.17138671875, - "eval_logps/rejected": -156.38426208496094, - "eval_loss": 0.6522988080978394, - "eval_rewards/accuracies": 0.6101301312446594, - "eval_rewards/chosen": -0.6876136660575867, - "eval_rewards/margins": 0.12515859305858612, - "eval_rewards/rejected": -0.8127721548080444, - "eval_runtime": 483.6746, - "eval_samples_per_second": 8.899, + "eval_logits/chosen": -3.1845004558563232, + "eval_logits/rejected": -3.1822633743286133, + "eval_logps/chosen": -133.07003784179688, + "eval_logps/rejected": -148.8959197998047, + "eval_loss": 0.6522409319877625, + "eval_rewards/accuracies": 0.6161710023880005, + "eval_rewards/chosen": -0.6166000962257385, + "eval_rewards/margins": 0.12128852307796478, + "eval_rewards/rejected": -0.7378886938095093, + "eval_runtime": 483.8898, + "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 2000 }, { "epoch": 0.35, - "grad_norm": 2.046875, + "grad_norm": 1.8828125, "learning_rate": 4.131668325774343e-06, - "logits/chosen": -3.1847736835479736, - "logits/rejected": -3.180018424987793, - "logps/chosen": -151.97802734375, - "logps/rejected": -163.89077758789062, - "loss": 0.6384, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8523083925247192, - "rewards/margins": 0.19287896156311035, - "rewards/rejected": -1.0451873540878296, + "logits/chosen": -3.1839537620544434, + "logits/rejected": -3.1793434619903564, + "logps/chosen": -146.1009979248047, + "logps/rejected": -157.49703979492188, + "loss": 0.6393, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7935379147529602, + "rewards/margins": 0.1877121478319168, + "rewards/rejected": -0.981249988079071, "step": 2010 }, { "epoch": 0.35, - "grad_norm": 2.546875, + "grad_norm": 2.65625, "learning_rate": 4.120243756223835e-06, - "logits/chosen": -3.1633083820343018, - "logits/rejected": -3.157390594482422, - "logps/chosen": -151.98509216308594, - "logps/rejected": -178.520751953125, - "loss": 0.6136, + "logits/chosen": -3.1638169288635254, + "logits/rejected": -3.1577069759368896, + "logps/chosen": -148.914794921875, + "logps/rejected": -175.17535400390625, + "loss": 0.6138, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9141998291015625, - "rewards/margins": 0.24424409866333008, - "rewards/rejected": -1.1584439277648926, + "rewards/chosen": -0.8834971189498901, + "rewards/margins": 0.24149295687675476, + "rewards/rejected": -1.1249901056289673, "step": 2020 }, { "epoch": 0.35, - "grad_norm": 2.4375, + "grad_norm": 2.34375, "learning_rate": 4.108760545175163e-06, - "logits/chosen": -3.1734821796417236, - "logits/rejected": -3.169147491455078, - "logps/chosen": -160.19607543945312, - "logps/rejected": -176.56297302246094, - "loss": 0.6368, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.928693950176239, - "rewards/margins": 0.1943078488111496, - "rewards/rejected": -1.1230019330978394, + "logits/chosen": -3.173308849334717, + "logits/rejected": -3.169128894805908, + "logps/chosen": -156.82955932617188, + "logps/rejected": -173.0848846435547, + "loss": 0.6359, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8950290679931641, + "rewards/margins": 0.19319191575050354, + "rewards/rejected": -1.0882209539413452, "step": 2030 }, { "epoch": 0.35, - "grad_norm": 2.0, + "grad_norm": 2.03125, "learning_rate": 4.097219108240295e-06, - "logits/chosen": -3.1459593772888184, - "logits/rejected": -3.143369674682617, - "logps/chosen": -152.8044891357422, - "logps/rejected": -172.223876953125, - "loss": 0.6289, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.887675940990448, - "rewards/margins": 0.21855147182941437, - "rewards/rejected": -1.1062272787094116, + "logits/chosen": -3.149092197418213, + "logits/rejected": -3.1467485427856445, + "logps/chosen": -149.60000610351562, + "logps/rejected": -168.51541137695312, + "loss": 0.6299, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8556310534477234, + "rewards/margins": 0.21351167559623718, + "rewards/rejected": -1.0691426992416382, "step": 2040 }, { "epoch": 0.35, - "grad_norm": 2.90625, + "grad_norm": 2.65625, "learning_rate": 4.085619863138574e-06, - "logits/chosen": -3.141997814178467, - "logits/rejected": -3.1412582397460938, - "logps/chosen": -143.8739776611328, - "logps/rejected": -167.2696075439453, - "loss": 0.6185, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.8224587440490723, - "rewards/margins": 0.2269144058227539, - "rewards/rejected": -1.0493730306625366, + "logits/chosen": -3.1532864570617676, + "logits/rejected": -3.1526474952697754, + "logps/chosen": -140.12538146972656, + "logps/rejected": -162.52774047851562, + "loss": 0.6204, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7849727272987366, + "rewards/margins": 0.21698176860809326, + "rewards/rejected": -1.0019545555114746, "step": 2050 }, { "epoch": 0.35, - "grad_norm": 2.09375, + "grad_norm": 1.765625, "learning_rate": 4.0739632296815886e-06, - "logits/chosen": -3.1491408348083496, - "logits/rejected": -3.142490863800049, - "logps/chosen": -146.3575439453125, - "logps/rejected": -159.93038940429688, - "loss": 0.6363, + "logits/chosen": -3.1688714027404785, + "logits/rejected": -3.1625566482543945, + "logps/chosen": -142.4561767578125, + "logps/rejected": -155.34713745117188, + "loss": 0.6376, "rewards/accuracies": 0.59375, - "rewards/chosen": -0.8111427426338196, - "rewards/margins": 0.1872086524963379, - "rewards/rejected": -0.9983514547348022, + "rewards/chosen": -0.7721291780471802, + "rewards/margins": 0.18038949370384216, + "rewards/rejected": -0.9525187611579895, "step": 2060 }, { "epoch": 0.36, - "grad_norm": 2.078125, + "grad_norm": 1.875, "learning_rate": 4.0622496297579905e-06, - "logits/chosen": -3.157477855682373, - "logits/rejected": -3.154186725616455, - "logps/chosen": -143.78921508789062, - "logps/rejected": -161.87496948242188, - "loss": 0.6229, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7739632725715637, - "rewards/margins": 0.21119961142539978, - "rewards/rejected": -0.9851628541946411, + "logits/chosen": -3.1799235343933105, + "logits/rejected": -3.1771349906921387, + "logps/chosen": -139.90472412109375, + "logps/rejected": -156.67532348632812, + "loss": 0.6266, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7351183295249939, + "rewards/margins": 0.19804798066616058, + "rewards/rejected": -0.9331663250923157, "step": 2070 }, { "epoch": 0.36, - "grad_norm": 2.3125, + "grad_norm": 2.15625, "learning_rate": 4.0504794873182144e-06, - "logits/chosen": -3.13997220993042, - "logits/rejected": -3.1348297595977783, - "logps/chosen": -145.42245483398438, - "logps/rejected": -164.66867065429688, - "loss": 0.6139, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.8327577710151672, - "rewards/margins": 0.23421470820903778, - "rewards/rejected": -1.0669724941253662, + "logits/chosen": -3.162339687347412, + "logits/rejected": -3.157538890838623, + "logps/chosen": -140.0246124267578, + "logps/rejected": -158.8914337158203, + "loss": 0.613, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.778779149055481, + "rewards/margins": 0.23042099177837372, + "rewards/rejected": -1.0092002153396606, "step": 2080 }, { "epoch": 0.36, - "grad_norm": 2.515625, + "grad_norm": 2.671875, "learning_rate": 4.038653228359143e-06, - "logits/chosen": -3.138305902481079, - "logits/rejected": -3.138002634048462, - "logps/chosen": -156.08377075195312, - "logps/rejected": -172.61456298828125, - "loss": 0.6431, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.9054716229438782, - "rewards/margins": 0.1922367513179779, - "rewards/rejected": -1.0977084636688232, + "logits/chosen": -3.157104015350342, + "logits/rejected": -3.156982421875, + "logps/chosen": -151.57472229003906, + "logps/rejected": -167.0546875, + "loss": 0.6456, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8603813052177429, + "rewards/margins": 0.18172840774059296, + "rewards/rejected": -1.042109727859497, "step": 2090 }, { "epoch": 0.36, - "grad_norm": 2.25, + "grad_norm": 2.0625, "learning_rate": 4.026771280908682e-06, - "logits/chosen": -3.1057121753692627, - "logits/rejected": -3.102771282196045, - "logps/chosen": -163.07972717285156, - "logps/rejected": -173.98348999023438, - "loss": 0.6479, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.975011944770813, - "rewards/margins": 0.1709790974855423, - "rewards/rejected": -1.1459910869598389, + "logits/chosen": -3.1234302520751953, + "logits/rejected": -3.1203956604003906, + "logps/chosen": -157.09375, + "logps/rejected": -168.24038696289062, + "loss": 0.6452, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9151522517204285, + "rewards/margins": 0.17340800166130066, + "rewards/rejected": -1.0885603427886963, "step": 2100 }, { "epoch": 0.36, - "eval_logits/chosen": -3.1159632205963135, - "eval_logits/rejected": -3.112962245941162, - "eval_logps/chosen": -150.8987579345703, - "eval_logps/rejected": -167.6335906982422, - "eval_loss": 0.653691828250885, - "eval_rewards/accuracies": 0.6103624701499939, - "eval_rewards/chosen": -0.7948872447013855, - "eval_rewards/margins": 0.13037818670272827, - "eval_rewards/rejected": -0.9252654314041138, - "eval_runtime": 483.6902, - "eval_samples_per_second": 8.898, + "eval_logits/chosen": -3.136946201324463, + "eval_logits/rejected": -3.1343982219696045, + "eval_logps/chosen": -142.29122924804688, + "eval_logps/rejected": -158.35354614257812, + "eval_loss": 0.6538400053977966, + "eval_rewards/accuracies": 0.604786217212677, + "eval_rewards/chosen": -0.7088120579719543, + "eval_rewards/margins": 0.12365300208330154, + "eval_rewards/rejected": -0.8324649930000305, + "eval_runtime": 483.8762, + "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 2100 }, { "epoch": 0.36, - "grad_norm": 2.46875, + "grad_norm": 2.375, "learning_rate": 4.014834075010271e-06, - "logits/chosen": -3.117692470550537, - "logits/rejected": -3.1150553226470947, - "logps/chosen": -165.73184204101562, - "logps/rejected": -192.96554565429688, - "loss": 0.6186, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0519955158233643, - "rewards/margins": 0.24650517106056213, - "rewards/rejected": -1.298500657081604, + "logits/chosen": -3.1391446590423584, + "logits/rejected": -3.136540412902832, + "logps/chosen": -156.41262817382812, + "logps/rejected": -182.945068359375, + "loss": 0.62, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9588032960891724, + "rewards/margins": 0.23949292302131653, + "rewards/rejected": -1.198296070098877, "step": 2110 }, { "epoch": 0.37, - "grad_norm": 2.671875, + "grad_norm": 2.25, "learning_rate": 4.002842042707323e-06, - "logits/chosen": -3.074627161026001, - "logits/rejected": -3.068140745162964, - "logps/chosen": -164.93692016601562, - "logps/rejected": -188.56765747070312, - "loss": 0.6003, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0069468021392822, - "rewards/margins": 0.28368431329727173, - "rewards/rejected": -1.2906310558319092, + "logits/chosen": -3.089831829071045, + "logits/rejected": -3.083256959915161, + "logps/chosen": -157.48348999023438, + "logps/rejected": -182.3844451904297, + "loss": 0.5942, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9324124455451965, + "rewards/margins": 0.2963864505290985, + "rewards/rejected": -1.2287989854812622, "step": 2120 }, { "epoch": 0.37, - "grad_norm": 2.5625, + "grad_norm": 2.359375, "learning_rate": 3.9907956180275785e-06, - "logits/chosen": -3.041842222213745, - "logits/rejected": -3.0408873558044434, - "logps/chosen": -180.57772827148438, - "logps/rejected": -207.1053466796875, - "loss": 0.5915, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1625645160675049, - "rewards/margins": 0.30793333053588867, - "rewards/rejected": -1.4704978466033936, + "logits/chosen": -3.0537140369415283, + "logits/rejected": -3.0525598526000977, + "logps/chosen": -178.11253356933594, + "logps/rejected": -203.71084594726562, + "loss": 0.5934, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1379125118255615, + "rewards/margins": 0.29864054918289185, + "rewards/rejected": -1.4365530014038086, "step": 2130 }, { "epoch": 0.37, - "grad_norm": 2.6875, + "grad_norm": 2.546875, "learning_rate": 3.978695236967405e-06, - "logits/chosen": -3.074413776397705, - "logits/rejected": -3.0732433795928955, - "logps/chosen": -173.1667022705078, - "logps/rejected": -194.44573974609375, - "loss": 0.635, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1367294788360596, - "rewards/margins": 0.19113154709339142, - "rewards/rejected": -1.3278610706329346, + "logits/chosen": -3.079768419265747, + "logits/rejected": -3.0789151191711426, + "logps/chosen": -174.4245147705078, + "logps/rejected": -195.63836669921875, + "loss": 0.6362, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1493077278137207, + "rewards/margins": 0.19047953188419342, + "rewards/rejected": -1.339787244796753, "step": 2140 }, { "epoch": 0.37, - "grad_norm": 2.390625, + "grad_norm": 2.21875, "learning_rate": 3.966541337476012e-06, - "logits/chosen": -3.066967487335205, - "logits/rejected": -3.0620956420898438, - "logps/chosen": -170.68936157226562, - "logps/rejected": -187.6675567626953, - "loss": 0.6023, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0465517044067383, - "rewards/margins": 0.26273447275161743, - "rewards/rejected": -1.30928635597229, + "logits/chosen": -3.0785582065582275, + "logits/rejected": -3.0743796825408936, + "logps/chosen": -169.74510192871094, + "logps/rejected": -184.39230346679688, + "loss": 0.612, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0371092557907104, + "rewards/margins": 0.23942425847053528, + "rewards/rejected": -1.2765334844589233, "step": 2150 }, { "epoch": 0.37, - "grad_norm": 2.71875, + "grad_norm": 2.78125, "learning_rate": 3.9543343594396035e-06, - "logits/chosen": -3.0941786766052246, - "logits/rejected": -3.0885229110717773, - "logps/chosen": -177.64736938476562, - "logps/rejected": -191.93521118164062, - "loss": 0.625, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1087653636932373, - "rewards/margins": 0.23340511322021484, - "rewards/rejected": -1.3421704769134521, + "logits/chosen": -3.107583999633789, + "logits/rejected": -3.1022396087646484, + "logps/chosen": -174.77137756347656, + "logps/rejected": -188.1559295654297, + "loss": 0.6271, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.080005407333374, + "rewards/margins": 0.2243720293045044, + "rewards/rejected": -1.304377555847168, "step": 2160 }, { "epoch": 0.37, - "grad_norm": 3.171875, + "grad_norm": 2.96875, "learning_rate": 3.942074744665456e-06, - "logits/chosen": -3.100074291229248, - "logits/rejected": -3.09078311920166, - "logps/chosen": -180.03762817382812, - "logps/rejected": -204.2498321533203, - "loss": 0.5995, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1526836156845093, - "rewards/margins": 0.295266330242157, - "rewards/rejected": -1.447949767112732, + "logits/chosen": -3.1085472106933594, + "logits/rejected": -3.0991461277008057, + "logps/chosen": -178.31637573242188, + "logps/rejected": -202.80215454101562, + "loss": 0.5988, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1354711055755615, + "rewards/margins": 0.29800236225128174, + "rewards/rejected": -1.4334733486175537, "step": 2170 }, { "epoch": 0.38, - "grad_norm": 3.390625, + "grad_norm": 3.3125, "learning_rate": 3.929762936865926e-06, - "logits/chosen": -3.1241230964660645, - "logits/rejected": -3.121016025543213, - "logps/chosen": -197.09913635253906, - "logps/rejected": -210.7930908203125, - "loss": 0.639, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2780972719192505, - "rewards/margins": 0.2249462604522705, - "rewards/rejected": -1.5030434131622314, + "logits/chosen": -3.1241629123687744, + "logits/rejected": -3.1208481788635254, + "logps/chosen": -197.42234802246094, + "logps/rejected": -211.826416015625, + "loss": 0.6394, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2813293933868408, + "rewards/margins": 0.23204727470874786, + "rewards/rejected": -1.5133765935897827, "step": 2180 }, { "epoch": 0.38, - "grad_norm": 2.125, + "grad_norm": 1.96875, "learning_rate": 3.917399381642395e-06, - "logits/chosen": -3.154695510864258, - "logits/rejected": -3.150545597076416, - "logps/chosen": -175.54129028320312, - "logps/rejected": -191.65377807617188, - "loss": 0.6552, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.126696228981018, - "rewards/margins": 0.17622551321983337, - "rewards/rejected": -1.3029218912124634, + "logits/chosen": -3.1608245372772217, + "logits/rejected": -3.1563944816589355, + "logps/chosen": -174.8191375732422, + "logps/rejected": -191.9217987060547, + "loss": 0.6506, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1194748878479004, + "rewards/margins": 0.18612749874591827, + "rewards/rejected": -1.3056023120880127, "step": 2190 }, { "epoch": 0.38, - "grad_norm": 2.40625, + "grad_norm": 2.34375, "learning_rate": 3.904984526469139e-06, - "logits/chosen": -3.1482882499694824, - "logits/rejected": -3.144016742706299, - "logps/chosen": -148.60595703125, - "logps/rejected": -175.087646484375, - "loss": 0.6023, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8616147041320801, - "rewards/margins": 0.26853346824645996, - "rewards/rejected": -1.13014817237854, - "step": 2200 + "logits/chosen": -3.1563363075256348, + "logits/rejected": -3.1518566608428955, + "logps/chosen": -145.4347686767578, + "logps/rejected": -171.84201049804688, + "loss": 0.6024, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8299029469490051, + "rewards/margins": 0.26778867840766907, + "rewards/rejected": -1.0976916551589966, + "step": 2200 }, { "epoch": 0.38, - "eval_logits/chosen": -3.147921562194824, - "eval_logits/rejected": -3.144941568374634, - "eval_logps/chosen": -137.71630859375, - "eval_logps/rejected": -153.79273986816406, - "eval_loss": 0.653631329536438, - "eval_rewards/accuracies": 0.6033921837806702, - "eval_rewards/chosen": -0.6630630493164062, - "eval_rewards/margins": 0.12379389256238937, - "eval_rewards/rejected": -0.7868569493293762, - "eval_runtime": 483.7826, - "eval_samples_per_second": 8.897, + "eval_logits/chosen": -3.159609317779541, + "eval_logits/rejected": -3.156726598739624, + "eval_logps/chosen": -135.18582153320312, + "eval_logps/rejected": -151.50193786621094, + "eval_loss": 0.6526528000831604, + "eval_rewards/accuracies": 0.6119888424873352, + "eval_rewards/chosen": -0.6377579569816589, + "eval_rewards/margins": 0.1261908859014511, + "eval_rewards/rejected": -0.7639487981796265, + "eval_runtime": 483.9194, + "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2200 }, { "epoch": 0.38, - "grad_norm": 2.515625, + "grad_norm": 2.53125, "learning_rate": 3.892518820677131e-06, - "logits/chosen": -3.1459808349609375, - "logits/rejected": -3.142317056655884, - "logps/chosen": -148.6342010498047, - "logps/rejected": -164.85739135742188, - "loss": 0.6286, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.8283321261405945, - "rewards/margins": 0.2048930823802948, - "rewards/rejected": -1.0332252979278564, + "logits/chosen": -3.1522021293640137, + "logits/rejected": -3.148188591003418, + "logps/chosen": -148.74594116210938, + "logps/rejected": -164.57009887695312, + "loss": 0.6301, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8294495344161987, + "rewards/margins": 0.2009027898311615, + "rewards/rejected": -1.0303523540496826, "step": 2210 }, { "epoch": 0.38, - "grad_norm": 2.125, + "grad_norm": 1.90625, "learning_rate": 3.880002715437786e-06, - "logits/chosen": -3.129206418991089, - "logits/rejected": -3.126734495162964, - "logps/chosen": -147.17568969726562, - "logps/rejected": -165.13308715820312, - "loss": 0.6334, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8225167393684387, - "rewards/margins": 0.20732179284095764, - "rewards/rejected": -1.0298385620117188, + "logits/chosen": -3.1346442699432373, + "logits/rejected": -3.1319546699523926, + "logps/chosen": -146.25363159179688, + "logps/rejected": -164.86203002929688, + "loss": 0.6307, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8132961392402649, + "rewards/margins": 0.2138318568468094, + "rewards/rejected": -1.027127981185913, "step": 2220 }, { "epoch": 0.38, - "grad_norm": 2.703125, + "grad_norm": 2.515625, "learning_rate": 3.867436663746622e-06, - "logits/chosen": -3.1609504222869873, - "logits/rejected": -3.1583099365234375, - "logps/chosen": -152.25564575195312, - "logps/rejected": -160.70870971679688, - "loss": 0.6572, + "logits/chosen": -3.1629092693328857, + "logits/rejected": -3.1602606773376465, + "logps/chosen": -153.08694458007812, + "logps/rejected": -161.30661010742188, + "loss": 0.6595, "rewards/accuracies": 0.59375, - "rewards/chosen": -0.8773072957992554, - "rewards/margins": 0.1397085189819336, - "rewards/rejected": -1.0170156955718994, + "rewards/chosen": -0.8856202960014343, + "rewards/margins": 0.13737449049949646, + "rewards/rejected": -1.0229947566986084, "step": 2230 }, { "epoch": 0.39, - "grad_norm": 1.8359375, + "grad_norm": 1.5859375, "learning_rate": 3.854821120406871e-06, - "logits/chosen": -3.173096179962158, - "logits/rejected": -3.171881914138794, - "logps/chosen": -145.95648193359375, - "logps/rejected": -153.50050354003906, - "loss": 0.6612, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.7911441326141357, - "rewards/margins": 0.12236903607845306, - "rewards/rejected": -0.91351318359375, + "logits/chosen": -3.1770501136779785, + "logits/rejected": -3.1756601333618164, + "logps/chosen": -144.17184448242188, + "logps/rejected": -151.9558868408203, + "loss": 0.6589, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7732978463172913, + "rewards/margins": 0.12476921081542969, + "rewards/rejected": -0.8980669975280762, "step": 2240 }, { "epoch": 0.39, - "grad_norm": 2.953125, + "grad_norm": 2.828125, "learning_rate": 3.842156542013017e-06, - "logits/chosen": -3.1604485511779785, - "logits/rejected": -3.156759738922119, - "logps/chosen": -151.40597534179688, - "logps/rejected": -166.3770294189453, - "loss": 0.6246, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.854582667350769, - "rewards/margins": 0.19725193083286285, - "rewards/rejected": -1.0518347024917603, + "logits/chosen": -3.1689164638519287, + "logits/rejected": -3.1652140617370605, + "logps/chosen": -147.66421508789062, + "logps/rejected": -162.92446899414062, + "loss": 0.6223, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8171650767326355, + "rewards/margins": 0.20014426112174988, + "rewards/rejected": -1.017309308052063, "step": 2250 }, { "epoch": 0.39, "grad_norm": 2.296875, "learning_rate": 3.8294433869342695e-06, - "logits/chosen": -3.1788666248321533, - "logits/rejected": -3.177128553390503, - "logps/chosen": -160.8689727783203, - "logps/rejected": -169.00955200195312, - "loss": 0.665, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.9033733606338501, - "rewards/margins": 0.13598643243312836, - "rewards/rejected": -1.0393598079681396, + "logits/chosen": -3.1818199157714844, + "logits/rejected": -3.180236339569092, + "logps/chosen": -157.19143676757812, + "logps/rejected": -165.59632873535156, + "loss": 0.663, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8665980100631714, + "rewards/margins": 0.13862961530685425, + "rewards/rejected": -1.0052276849746704, "step": 2260 }, { "epoch": 0.39, - "grad_norm": 2.203125, + "grad_norm": 1.9453125, "learning_rate": 3.816682115297976e-06, - "logits/chosen": -3.1626791954040527, - "logits/rejected": -3.15694260597229, - "logps/chosen": -157.9209747314453, - "logps/rejected": -171.28599548339844, - "loss": 0.6421, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.9369276762008667, - "rewards/margins": 0.18542329967021942, - "rewards/rejected": -1.122351050376892, + "logits/chosen": -3.1630165576934814, + "logits/rejected": -3.15757417678833, + "logps/chosen": -154.84857177734375, + "logps/rejected": -168.8209686279297, + "loss": 0.6382, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9062039256095886, + "rewards/margins": 0.19149689376354218, + "rewards/rejected": -1.097700834274292, "step": 2270 }, { "epoch": 0.39, - "grad_norm": 1.9296875, + "grad_norm": 1.8515625, "learning_rate": 3.803873188972966e-06, - "logits/chosen": -3.152496576309204, - "logits/rejected": -3.1473207473754883, - "logps/chosen": -154.1539764404297, - "logps/rejected": -175.39974975585938, - "loss": 0.6214, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.8972307443618774, - "rewards/margins": 0.2114594280719757, - "rewards/rejected": -1.1086901426315308, + "logits/chosen": -3.147902011871338, + "logits/rejected": -3.142792224884033, + "logps/chosen": -151.74461364746094, + "logps/rejected": -173.80873107910156, + "loss": 0.6186, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8731371164321899, + "rewards/margins": 0.21964311599731445, + "rewards/rejected": -1.0927802324295044, "step": 2280 }, { "epoch": 0.39, - "grad_norm": 2.28125, + "grad_norm": 2.46875, "learning_rate": 3.791017071552835e-06, - "logits/chosen": -3.1055197715759277, - "logits/rejected": -3.100161075592041, - "logps/chosen": -158.73663330078125, - "logps/rejected": -183.3207244873047, - "loss": 0.5997, + "logits/chosen": -3.0958430767059326, + "logits/rejected": -3.090329170227051, + "logps/chosen": -159.39187622070312, + "logps/rejected": -184.73562622070312, + "loss": 0.5988, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.9550874829292297, - "rewards/margins": 0.27279362082481384, - "rewards/rejected": -1.2278810739517212, + "rewards/chosen": -0.9616400003433228, + "rewards/margins": 0.2803899347782135, + "rewards/rejected": -1.2420299053192139, "step": 2290 }, { "epoch": 0.4, - "grad_norm": 3.15625, + "grad_norm": 3.0, "learning_rate": 3.778114228339168e-06, - "logits/chosen": -3.1239192485809326, - "logits/rejected": -3.1184122562408447, - "logps/chosen": -167.10992431640625, - "logps/rejected": -189.25596618652344, - "loss": 0.5962, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.9823795557022095, - "rewards/margins": 0.2892586290836334, - "rewards/rejected": -1.2716381549835205, + "logits/chosen": -3.109088182449341, + "logits/rejected": -3.103534698486328, + "logps/chosen": -168.2469940185547, + "logps/rejected": -192.3583984375, + "loss": 0.5912, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9937500953674316, + "rewards/margins": 0.30891233682632446, + "rewards/rejected": -1.3026624917984009, "step": 2300 }, { "epoch": 0.4, - "eval_logits/chosen": -3.1011734008789062, - "eval_logits/rejected": -3.097487449645996, - "eval_logps/chosen": -159.4140625, - "eval_logps/rejected": -177.23011779785156, - "eval_loss": 0.6523010730743408, - "eval_rewards/accuracies": 0.6078066825866699, - "eval_rewards/chosen": -0.8800405263900757, - "eval_rewards/margins": 0.14119039475917816, - "eval_rewards/rejected": -1.021230936050415, - "eval_runtime": 483.6223, - "eval_samples_per_second": 8.9, - "eval_steps_per_second": 1.112, + "eval_logits/chosen": -3.08532452583313, + "eval_logits/rejected": -3.081225872039795, + "eval_logps/chosen": -161.3302459716797, + "eval_logps/rejected": -180.7163543701172, + "eval_loss": 0.6485186219215393, + "eval_rewards/accuracies": 0.6105948090553284, + "eval_rewards/chosen": -0.8992023468017578, + "eval_rewards/margins": 0.15689080953598022, + "eval_rewards/rejected": -1.0560930967330933, + "eval_runtime": 484.0391, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.111, "step": 2300 }, { "epoch": 0.4, - "grad_norm": 3.296875, + "grad_norm": 3.3125, "learning_rate": 3.7651651263246947e-06, - "logits/chosen": -3.0919950008392334, - "logits/rejected": -3.086912155151367, - "logps/chosen": -166.24557495117188, - "logps/rejected": -190.60391235351562, - "loss": 0.5973, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0429760217666626, - "rewards/margins": 0.2770213186740875, - "rewards/rejected": -1.3199971914291382, + "logits/chosen": -3.0716493129730225, + "logits/rejected": -3.066314935684204, + "logps/chosen": -170.21990966796875, + "logps/rejected": -194.90985107421875, + "loss": 0.5975, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.082719326019287, + "rewards/margins": 0.280337393283844, + "rewards/rejected": -1.3630567789077759, "step": 2310 }, { "epoch": 0.4, - "grad_norm": 2.96875, + "grad_norm": 3.109375, "learning_rate": 3.752170234176392e-06, - "logits/chosen": -3.055863857269287, - "logits/rejected": -3.051103115081787, - "logps/chosen": -181.3167724609375, - "logps/rejected": -201.3364715576172, - "loss": 0.6001, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1310415267944336, - "rewards/margins": 0.29059791564941406, - "rewards/rejected": -1.4216395616531372, + "logits/chosen": -3.0347900390625, + "logits/rejected": -3.029919385910034, + "logps/chosen": -185.3545684814453, + "logps/rejected": -206.3814239501953, + "loss": 0.598, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.171419620513916, + "rewards/margins": 0.30066943168640137, + "rewards/rejected": -1.4720890522003174, "step": 2320 }, { "epoch": 0.4, - "grad_norm": 2.671875, + "grad_norm": 2.734375, "learning_rate": 3.739130022218519e-06, - "logits/chosen": -3.0658938884735107, - "logits/rejected": -3.0609793663024902, - "logps/chosen": -178.57443237304688, - "logps/rejected": -207.0660400390625, - "loss": 0.5796, + "logits/chosen": -3.039088487625122, + "logits/rejected": -3.0342190265655518, + "logps/chosen": -182.79171752929688, + "logps/rejected": -212.6605987548828, + "loss": 0.5765, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1136971712112427, - "rewards/margins": 0.35894858837127686, - "rewards/rejected": -1.4726457595825195, + "rewards/chosen": -1.1558698415756226, + "rewards/margins": 0.3727215528488159, + "rewards/rejected": -1.5285913944244385, "step": 2330 }, { "epoch": 0.4, - "grad_norm": 2.625, + "grad_norm": 2.6875, "learning_rate": 3.726044962415595e-06, - "logits/chosen": -3.0586750507354736, - "logits/rejected": -3.0569536685943604, - "logps/chosen": -184.21685791015625, - "logps/rejected": -205.1505889892578, - "loss": 0.6387, + "logits/chosen": -3.02746844291687, + "logits/rejected": -3.0260281562805176, + "logps/chosen": -189.47915649414062, + "logps/rejected": -212.01712036132812, + "loss": 0.6343, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2161709070205688, - "rewards/margins": 0.19973869621753693, - "rewards/rejected": -1.4159094095230103, + "rewards/chosen": -1.2687938213348389, + "rewards/margins": 0.2157808244228363, + "rewards/rejected": -1.484574794769287, "step": 2340 }, { "epoch": 0.4, - "grad_norm": 2.515625, + "grad_norm": 2.625, "learning_rate": 3.712915528355317e-06, - "logits/chosen": -3.0611510276794434, - "logits/rejected": -3.0518746376037598, - "logps/chosen": -177.49746704101562, - "logps/rejected": -198.83865356445312, - "loss": 0.6174, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.0889817476272583, - "rewards/margins": 0.27505603432655334, - "rewards/rejected": -1.3640376329421997, + "logits/chosen": -3.0346224308013916, + "logits/rejected": -3.0251426696777344, + "logps/chosen": -183.71942138671875, + "logps/rejected": -206.18161010742188, + "loss": 0.6177, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1512013673782349, + "rewards/margins": 0.2862662374973297, + "rewards/rejected": -1.4374675750732422, "step": 2350 }, { "epoch": 0.41, - "grad_norm": 3.171875, + "grad_norm": 3.21875, "learning_rate": 3.6997421952314223e-06, - "logits/chosen": -3.062241792678833, - "logits/rejected": -3.0571401119232178, - "logps/chosen": -172.83241271972656, - "logps/rejected": -198.0221710205078, - "loss": 0.6132, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1134238243103027, - "rewards/margins": 0.2823295295238495, - "rewards/rejected": -1.3957535028457642, + "logits/chosen": -3.037364959716797, + "logits/rejected": -3.032649517059326, + "logps/chosen": -178.09678649902344, + "logps/rejected": -203.7887420654297, + "loss": 0.6128, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.166067361831665, + "rewards/margins": 0.2873513996601105, + "rewards/rejected": -1.4534189701080322, "step": 2360 }, { "epoch": 0.41, - "grad_norm": 2.578125, + "grad_norm": 2.671875, "learning_rate": 3.686525439826484e-06, - "logits/chosen": -3.058692216873169, - "logits/rejected": -3.050508975982666, - "logps/chosen": -177.1395263671875, - "logps/rejected": -203.19869995117188, - "loss": 0.6078, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1302058696746826, - "rewards/margins": 0.2856082022190094, - "rewards/rejected": -1.4158140420913696, + "logits/chosen": -3.043933391571045, + "logits/rejected": -3.0355417728424072, + "logps/chosen": -180.43582153320312, + "logps/rejected": -206.72732543945312, + "loss": 0.6098, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1631687879562378, + "rewards/margins": 0.28793150186538696, + "rewards/rejected": -1.451100468635559, "step": 2370 }, { "epoch": 0.41, - "grad_norm": 3.53125, + "grad_norm": 3.5625, "learning_rate": 3.6732657404946624e-06, - "logits/chosen": -3.0415139198303223, - "logits/rejected": -3.03214955329895, - "logps/chosen": -170.95999145507812, - "logps/rejected": -192.58023071289062, - "loss": 0.6288, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0619453191757202, - "rewards/margins": 0.251602441072464, - "rewards/rejected": -1.3135477304458618, + "logits/chosen": -3.028273582458496, + "logits/rejected": -3.0188450813293457, + "logps/chosen": -171.57801818847656, + "logps/rejected": -193.34634399414062, + "loss": 0.6294, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.068125605583191, + "rewards/margins": 0.25308340787887573, + "rewards/rejected": -1.3212089538574219, "step": 2380 }, { "epoch": 0.41, - "grad_norm": 2.765625, + "grad_norm": 2.453125, "learning_rate": 3.6599635771443844e-06, - "logits/chosen": -3.0108182430267334, - "logits/rejected": -3.004117488861084, - "logps/chosen": -192.63619995117188, - "logps/rejected": -220.47128295898438, - "loss": 0.5854, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2579948902130127, - "rewards/margins": 0.34344083070755005, - "rewards/rejected": -1.601435661315918, + "logits/chosen": -2.99981427192688, + "logits/rejected": -2.993446111679077, + "logps/chosen": -193.1945343017578, + "logps/rejected": -221.90512084960938, + "loss": 0.5843, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2635782957077026, + "rewards/margins": 0.3521956205368042, + "rewards/rejected": -1.615774154663086, "step": 2390 }, { "epoch": 0.41, - "grad_norm": 2.859375, + "grad_norm": 2.75, "learning_rate": 3.646619431220978e-06, - "logits/chosen": -3.014111042022705, - "logits/rejected": -3.0105881690979004, - "logps/chosen": -183.05506896972656, - "logps/rejected": -208.9149627685547, - "loss": 0.6176, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.204532265663147, - "rewards/margins": 0.27077314257621765, - "rewards/rejected": -1.4753053188323975, + "logits/chosen": -3.006399631500244, + "logits/rejected": -3.0024609565734863, + "logps/chosen": -183.29205322265625, + "logps/rejected": -209.1800994873047, + "loss": 0.6188, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.206902265548706, + "rewards/margins": 0.27105480432510376, + "rewards/rejected": -1.4779571294784546, "step": 2400 }, { "epoch": 0.41, - "eval_logits/chosen": -3.03197979927063, - "eval_logits/rejected": -3.0264739990234375, - "eval_logps/chosen": -172.7088623046875, - "eval_logps/rejected": -192.77476501464844, - "eval_loss": 0.6506057977676392, - "eval_rewards/accuracies": 0.6026951670646667, - "eval_rewards/chosen": -1.0129884481430054, - "eval_rewards/margins": 0.16368862986564636, - "eval_rewards/rejected": -1.1766771078109741, - "eval_runtime": 483.6491, - "eval_samples_per_second": 8.899, + "eval_logits/chosen": -3.0275797843933105, + "eval_logits/rejected": -3.021939277648926, + "eval_logps/chosen": -171.0099639892578, + "eval_logps/rejected": -191.72683715820312, + "eval_loss": 0.6487921476364136, + "eval_rewards/accuracies": 0.6203531622886658, + "eval_rewards/chosen": -0.995999276638031, + "eval_rewards/margins": 0.17019876837730408, + "eval_rewards/rejected": -1.1661980152130127, + "eval_runtime": 483.8998, + "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2400 }, { "epoch": 0.42, - "grad_norm": 3.34375, + "grad_norm": 3.5, "learning_rate": 3.6332337856892475e-06, - "logits/chosen": -3.0102365016937256, - "logits/rejected": -3.0060172080993652, - "logps/chosen": -186.672119140625, - "logps/rejected": -205.01229858398438, - "loss": 0.6124, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.209280014038086, - "rewards/margins": 0.2705342769622803, - "rewards/rejected": -1.4798142910003662, + "logits/chosen": -3.0032505989074707, + "logits/rejected": -2.999051809310913, + "logps/chosen": -185.5613555908203, + "logps/rejected": -204.23507690429688, + "loss": 0.6102, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1981723308563232, + "rewards/margins": 0.2738698422908783, + "rewards/rejected": -1.4720420837402344, "step": 2410 }, { "epoch": 0.42, - "grad_norm": 2.75, + "grad_norm": 2.828125, "learning_rate": 3.6198071250159945e-06, - "logits/chosen": -3.0345890522003174, - "logits/rejected": -3.0291411876678467, - "logps/chosen": -187.89105224609375, - "logps/rejected": -214.79861450195312, - "loss": 0.5981, + "logits/chosen": -3.0181069374084473, + "logits/rejected": -3.0125536918640137, + "logps/chosen": -188.25917053222656, + "logps/rejected": -215.7293701171875, + "loss": 0.5964, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2190016508102417, - "rewards/margins": 0.3049257695674896, - "rewards/rejected": -1.5239274501800537, + "rewards/chosen": -1.2226827144622803, + "rewards/margins": 0.310552179813385, + "rewards/rejected": -1.53323495388031, "step": 2420 }, { "epoch": 0.42, - "grad_norm": 2.703125, + "grad_norm": 2.5, "learning_rate": 3.6063399351524793e-06, - "logits/chosen": -3.056429386138916, - "logits/rejected": -3.0526039600372314, - "logps/chosen": -181.78106689453125, - "logps/rejected": -193.84323120117188, - "loss": 0.6579, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1559642553329468, - "rewards/margins": 0.18488237261772156, - "rewards/rejected": -1.3408466577529907, + "logits/chosen": -3.036452531814575, + "logits/rejected": -3.0326380729675293, + "logps/chosen": -180.85311889648438, + "logps/rejected": -194.0204315185547, + "loss": 0.6539, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1466847658157349, + "rewards/margins": 0.19593380391597748, + "rewards/rejected": -1.3426185846328735, "step": 2430 }, { "epoch": 0.42, - "grad_norm": 2.21875, + "grad_norm": 2.234375, "learning_rate": 3.592832703516836e-06, - "logits/chosen": -3.0940728187561035, - "logits/rejected": -3.0855565071105957, - "logps/chosen": -167.07205200195312, - "logps/rejected": -191.03402709960938, - "loss": 0.6023, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.002306580543518, - "rewards/margins": 0.28310757875442505, - "rewards/rejected": -1.285414218902588, + "logits/chosen": -3.077695846557617, + "logits/rejected": -3.0692319869995117, + "logps/chosen": -168.24078369140625, + "logps/rejected": -192.09884643554688, + "loss": 0.6049, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0139938592910767, + "rewards/margins": 0.28206855058670044, + "rewards/rejected": -1.2960623502731323, "step": 2440 }, { "epoch": 0.42, - "grad_norm": 2.390625, + "grad_norm": 2.40625, "learning_rate": 3.5792859189764335e-06, - "logits/chosen": -3.092224597930908, - "logits/rejected": -3.0879569053649902, - "logps/chosen": -167.45254516601562, - "logps/rejected": -182.4717254638672, - "loss": 0.639, + "logits/chosen": -3.078490972518921, + "logits/rejected": -3.0740816593170166, + "logps/chosen": -166.83689880371094, + "logps/rejected": -182.9640655517578, + "loss": 0.6344, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.0350295305252075, - "rewards/margins": 0.18815535306930542, - "rewards/rejected": -1.2231849431991577, + "rewards/chosen": -1.0288734436035156, + "rewards/margins": 0.1992349624633789, + "rewards/rejected": -1.2281081676483154, "step": 2450 }, { "epoch": 0.42, - "grad_norm": 1.9765625, + "grad_norm": 1.984375, "learning_rate": 3.5657000718301765e-06, - "logits/chosen": -3.0908710956573486, - "logits/rejected": -3.0826985836029053, - "logps/chosen": -161.75523376464844, - "logps/rejected": -174.8667449951172, - "loss": 0.6481, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.9404891133308411, - "rewards/margins": 0.19990110397338867, - "rewards/rejected": -1.1403902769088745, + "logits/chosen": -3.0803191661834717, + "logits/rejected": -3.0721633434295654, + "logps/chosen": -160.91036987304688, + "logps/rejected": -174.44577026367188, + "loss": 0.6483, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9320403337478638, + "rewards/margins": 0.20414026081562042, + "rewards/rejected": -1.1361806392669678, "step": 2460 }, { "epoch": 0.43, - "grad_norm": 2.484375, + "grad_norm": 2.5, "learning_rate": 3.5520756537907645e-06, - "logits/chosen": -3.103588819503784, - "logits/rejected": -3.0951483249664307, - "logps/chosen": -156.4438934326172, - "logps/rejected": -175.2889862060547, - "loss": 0.622, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.9040303230285645, - "rewards/margins": 0.23538950085639954, - "rewards/rejected": -1.1394197940826416, + "logits/chosen": -3.0954861640930176, + "logits/rejected": -3.087409019470215, + "logps/chosen": -156.04385375976562, + "logps/rejected": -174.39959716796875, + "loss": 0.624, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9000299572944641, + "rewards/margins": 0.23049604892730713, + "rewards/rejected": -1.1305259466171265, "step": 2470 }, { "epoch": 0.43, - "grad_norm": 2.328125, + "grad_norm": 2.203125, "learning_rate": 3.538413157966893e-06, - "logits/chosen": -3.073765993118286, - "logits/rejected": -3.0662198066711426, - "logps/chosen": -166.85813903808594, - "logps/rejected": -182.69912719726562, - "loss": 0.6287, + "logits/chosen": -3.06768536567688, + "logits/rejected": -3.0605628490448, + "logps/chosen": -163.9343719482422, + "logps/rejected": -179.1577911376953, + "loss": 0.6316, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.026293158531189, - "rewards/margins": 0.2210662066936493, - "rewards/rejected": -1.2473593950271606, + "rewards/chosen": -0.9970556497573853, + "rewards/margins": 0.21489039063453674, + "rewards/rejected": -1.2119461297988892, "step": 2480 }, { "epoch": 0.43, - "grad_norm": 2.296875, + "grad_norm": 2.359375, "learning_rate": 3.5247130788454076e-06, - "logits/chosen": -3.06006121635437, - "logits/rejected": -3.0558817386627197, - "logps/chosen": -157.5069580078125, - "logps/rejected": -180.8059539794922, - "loss": 0.6243, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.9881949424743652, - "rewards/margins": 0.22879700362682343, - "rewards/rejected": -1.2169920206069946, + "logits/chosen": -3.056560516357422, + "logits/rejected": -3.052340030670166, + "logps/chosen": -154.87538146972656, + "logps/rejected": -178.25985717773438, + "loss": 0.6238, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9618793725967407, + "rewards/margins": 0.2296515256166458, + "rewards/rejected": -1.1915308237075806, "step": 2490 }, { "epoch": 0.43, "grad_norm": 2.390625, "learning_rate": 3.510975912273406e-06, - "logits/chosen": -3.0647358894348145, - "logits/rejected": -3.0556392669677734, - "logps/chosen": -175.3211212158203, - "logps/rejected": -197.31922912597656, - "loss": 0.6255, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.0933088064193726, - "rewards/margins": 0.23709645867347717, - "rewards/rejected": -1.3304052352905273, + "logits/chosen": -3.0538134574890137, + "logits/rejected": -3.044461250305176, + "logps/chosen": -176.61367797851562, + "logps/rejected": -198.43502807617188, + "loss": 0.6286, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1062344312667847, + "rewards/margins": 0.2353288233280182, + "rewards/rejected": -1.3415632247924805, "step": 2500 }, { "epoch": 0.43, - "eval_logits/chosen": -3.0629374980926514, - "eval_logits/rejected": -3.058375358581543, - "eval_logps/chosen": -156.9641571044922, - "eval_logps/rejected": -175.3397979736328, - "eval_loss": 0.6506990790367126, - "eval_rewards/accuracies": 0.6101301312446594, - "eval_rewards/chosen": -0.8555412888526917, - "eval_rewards/margins": 0.1467861831188202, - "eval_rewards/rejected": -1.002327561378479, - "eval_runtime": 483.7437, + "eval_logits/chosen": -3.0475451946258545, + "eval_logits/rejected": -3.0428457260131836, + "eval_logps/chosen": -159.05422973632812, + "eval_logps/rejected": -178.43539428710938, + "eval_loss": 0.6483173966407776, + "eval_rewards/accuracies": 0.6075743436813354, + "eval_rewards/chosen": -0.876442015171051, + "eval_rewards/margins": 0.15684130787849426, + "eval_rewards/rejected": -1.0332833528518677, + "eval_runtime": 483.7484, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 2500 }, { "epoch": 0.43, - "grad_norm": 2.0625, + "grad_norm": 2.140625, "learning_rate": 3.4972021554402924e-06, - "logits/chosen": -3.0609078407287598, - "logits/rejected": -3.0537776947021484, - "logps/chosen": -174.7488555908203, - "logps/rejected": -198.6224365234375, - "loss": 0.6094, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1154464483261108, - "rewards/margins": 0.27513909339904785, - "rewards/rejected": -1.3905855417251587, + "logits/chosen": -3.0428929328918457, + "logits/rejected": -3.035857677459717, + "logps/chosen": -177.7244873046875, + "logps/rejected": -201.36776733398438, + "loss": 0.6123, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1452029943466187, + "rewards/margins": 0.272835910320282, + "rewards/rejected": -1.4180389642715454, "step": 2510 }, { "epoch": 0.43, "grad_norm": 2.515625, "learning_rate": 3.483392306859784e-06, - "logits/chosen": -3.042370319366455, - "logits/rejected": -3.0388169288635254, - "logps/chosen": -176.43917846679688, - "logps/rejected": -196.3103790283203, - "loss": 0.6254, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.134268045425415, - "rewards/margins": 0.2571788430213928, - "rewards/rejected": -1.391446828842163, + "logits/chosen": -3.020996570587158, + "logits/rejected": -3.017580509185791, + "logps/chosen": -179.0875244140625, + "logps/rejected": -199.23764038085938, + "loss": 0.6248, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1607515811920166, + "rewards/margins": 0.25996798276901245, + "rewards/rejected": -1.4207196235656738, "step": 2520 }, { "epoch": 0.44, - "grad_norm": 2.796875, + "grad_norm": 2.859375, "learning_rate": 3.469546866351866e-06, - "logits/chosen": -3.0680062770843506, - "logits/rejected": -3.0632071495056152, - "logps/chosen": -171.026123046875, - "logps/rejected": -188.93869018554688, - "loss": 0.6492, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.0946629047393799, - "rewards/margins": 0.17611399292945862, - "rewards/rejected": -1.2707767486572266, + "logits/chosen": -3.047424793243408, + "logits/rejected": -3.042585611343384, + "logps/chosen": -172.9007110595703, + "logps/rejected": -190.90298461914062, + "loss": 0.6494, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.1134084463119507, + "rewards/margins": 0.17701123654842377, + "rewards/rejected": -1.2904198169708252, "step": 2530 }, { "epoch": 0.44, - "grad_norm": 3.03125, + "grad_norm": 2.984375, "learning_rate": 3.455666335024701e-06, - "logits/chosen": -3.0372817516326904, - "logits/rejected": -3.0323173999786377, - "logps/chosen": -186.8362274169922, - "logps/rejected": -209.86343383789062, - "loss": 0.6371, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2337965965270996, - "rewards/margins": 0.24707002937793732, - "rewards/rejected": -1.480866551399231, + "logits/chosen": -3.025081157684326, + "logits/rejected": -3.0198981761932373, + "logps/chosen": -184.22813415527344, + "logps/rejected": -208.3061981201172, + "loss": 0.6328, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.207715630531311, + "rewards/margins": 0.25757837295532227, + "rewards/rejected": -1.4652938842773438, "step": 2540 }, { "epoch": 0.44, - "grad_norm": 2.703125, + "grad_norm": 2.765625, "learning_rate": 3.4417512152564976e-06, - "logits/chosen": -3.0695526599884033, - "logits/rejected": -3.061310291290283, - "logps/chosen": -177.91978454589844, - "logps/rejected": -196.3243865966797, - "loss": 0.628, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0750468969345093, - "rewards/margins": 0.26663774251937866, - "rewards/rejected": -1.3416846990585327, + "logits/chosen": -3.060671806335449, + "logits/rejected": -3.0524652004241943, + "logps/chosen": -173.49700927734375, + "logps/rejected": -191.56179809570312, + "loss": 0.6276, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0308191776275635, + "rewards/margins": 0.2632397413253784, + "rewards/rejected": -1.2940590381622314, "step": 2550 }, { "epoch": 0.44, - "grad_norm": 2.140625, + "grad_norm": 2.078125, "learning_rate": 3.42780201067732e-06, - "logits/chosen": -3.0941805839538574, - "logits/rejected": -3.0907230377197266, - "logps/chosen": -160.5797119140625, - "logps/rejected": -178.7494354248047, - "loss": 0.6354, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.9490875005722046, - "rewards/margins": 0.21290269494056702, - "rewards/rejected": -1.1619904041290283, + "logits/chosen": -3.0848729610443115, + "logits/rejected": -3.082109212875366, + "logps/chosen": -158.57064819335938, + "logps/rejected": -175.95895385742188, + "loss": 0.6376, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9289971590042114, + "rewards/margins": 0.20508842170238495, + "rewards/rejected": -1.1340854167938232, "step": 2560 }, { "epoch": 0.44, - "grad_norm": 1.953125, + "grad_norm": 1.9609375, "learning_rate": 3.413819226150868e-06, - "logits/chosen": -3.108309268951416, - "logits/rejected": -3.102802038192749, - "logps/chosen": -162.13426208496094, - "logps/rejected": -179.61190795898438, - "loss": 0.6276, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.9272769689559937, - "rewards/margins": 0.22005334496498108, - "rewards/rejected": -1.1473302841186523, + "logits/chosen": -3.0963122844696045, + "logits/rejected": -3.0910143852233887, + "logps/chosen": -162.51937866210938, + "logps/rejected": -180.8915252685547, + "loss": 0.6246, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9311281442642212, + "rewards/margins": 0.22899818420410156, + "rewards/rejected": -1.1601263284683228, "step": 2570 }, { "epoch": 0.44, - "grad_norm": 2.578125, + "grad_norm": 2.734375, "learning_rate": 3.399803367756198e-06, - "logits/chosen": -3.0917704105377197, - "logits/rejected": -3.086387872695923, - "logps/chosen": -163.72564697265625, - "logps/rejected": -176.5128936767578, - "loss": 0.6543, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.9989093542098999, - "rewards/margins": 0.15462008118629456, - "rewards/rejected": -1.153529405593872, + "logits/chosen": -3.0756285190582275, + "logits/rejected": -3.0704190731048584, + "logps/chosen": -166.87071228027344, + "logps/rejected": -179.3004150390625, + "loss": 0.6574, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.0303601026535034, + "rewards/margins": 0.15104451775550842, + "rewards/rejected": -1.1814045906066895, "step": 2580 }, { "epoch": 0.45, - "grad_norm": 2.609375, + "grad_norm": 2.546875, "learning_rate": 3.3857549427694114e-06, - "logits/chosen": -3.1144728660583496, - "logits/rejected": -3.10710072517395, - "logps/chosen": -150.10018920898438, - "logps/rejected": -160.92153930664062, - "loss": 0.6288, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.8386899828910828, - "rewards/margins": 0.19079235196113586, - "rewards/rejected": -1.029482364654541, + "logits/chosen": -3.0996594429016113, + "logits/rejected": -3.092259168624878, + "logps/chosen": -151.70034790039062, + "logps/rejected": -163.020263671875, + "loss": 0.626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8546916842460632, + "rewards/margins": 0.19577807188034058, + "rewards/rejected": -1.0504697561264038, "step": 2590 }, { "epoch": 0.45, - "grad_norm": 3.0625, + "grad_norm": 2.984375, "learning_rate": 3.3716744596452918e-06, - "logits/chosen": -3.0912861824035645, - "logits/rejected": -3.0834295749664307, - "logps/chosen": -157.7835235595703, - "logps/rejected": -171.8795928955078, - "loss": 0.6075, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.8950634002685547, - "rewards/margins": 0.25771036744117737, - "rewards/rejected": -1.1527738571166992, + "logits/chosen": -3.0825467109680176, + "logits/rejected": -3.0753118991851807, + "logps/chosen": -159.81790161132812, + "logps/rejected": -173.5633087158203, + "loss": 0.61, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9154074788093567, + "rewards/margins": 0.25420355796813965, + "rewards/rejected": -1.1696109771728516, "step": 2600 }, { "epoch": 0.45, - "eval_logits/chosen": -3.087653160095215, - "eval_logits/rejected": -3.0838546752929688, - "eval_logps/chosen": -146.07357788085938, - "eval_logps/rejected": -162.31468200683594, - "eval_loss": 0.6547107100486755, - "eval_rewards/accuracies": 0.6045538783073425, - "eval_rewards/chosen": -0.7466354370117188, - "eval_rewards/margins": 0.12544085085391998, - "eval_rewards/rejected": -0.8720762729644775, - "eval_runtime": 483.6537, - "eval_samples_per_second": 8.899, + "eval_logits/chosen": -3.080397605895996, + "eval_logits/rejected": -3.076663017272949, + "eval_logps/chosen": -145.68942260742188, + "eval_logps/rejected": -162.40744018554688, + "eval_loss": 0.6531721949577332, + "eval_rewards/accuracies": 0.6017658114433289, + "eval_rewards/chosen": -0.7427940368652344, + "eval_rewards/margins": 0.13020974397659302, + "eval_rewards/rejected": -0.8730038404464722, + "eval_runtime": 483.7174, + "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 2600 }, { "epoch": 0.45, - "grad_norm": 2.359375, + "grad_norm": 2.28125, "learning_rate": 3.3575624279989017e-06, - "logits/chosen": -3.077409029006958, - "logits/rejected": -3.0710222721099854, - "logps/chosen": -161.51071166992188, - "logps/rejected": -175.16867065429688, - "loss": 0.6298, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9341617822647095, - "rewards/margins": 0.21895930171012878, - "rewards/rejected": -1.153120994567871, + "logits/chosen": -3.0729708671569824, + "logits/rejected": -3.0670769214630127, + "logps/chosen": -161.18968200683594, + "logps/rejected": -174.8655548095703, + "loss": 0.6313, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9309514760971069, + "rewards/margins": 0.21913857758045197, + "rewards/rejected": -1.1500900983810425, "step": 2610 }, { "epoch": 0.45, - "grad_norm": 2.5625, + "grad_norm": 2.359375, "learning_rate": 3.3434193585871405e-06, - "logits/chosen": -3.0789408683776855, - "logits/rejected": -3.07112193107605, - "logps/chosen": -160.45144653320312, - "logps/rejected": -184.2188720703125, - "loss": 0.6051, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9884397387504578, - "rewards/margins": 0.2745136320590973, - "rewards/rejected": -1.262953281402588, + "logits/chosen": -3.0856680870056152, + "logits/rejected": -3.0778071880340576, + "logps/chosen": -156.5973663330078, + "logps/rejected": -180.6645965576172, + "loss": 0.6021, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.949898898601532, + "rewards/margins": 0.2775116562843323, + "rewards/rejected": -1.2274105548858643, "step": 2620 }, { "epoch": 0.45, - "grad_norm": 2.265625, + "grad_norm": 2.25, "learning_rate": 3.3292457632902603e-06, - "logits/chosen": -3.0605623722076416, - "logits/rejected": -3.0544679164886475, - "logps/chosen": -169.61282348632812, - "logps/rejected": -192.62991333007812, - "loss": 0.6109, + "logits/chosen": -3.070791721343994, + "logits/rejected": -3.0645086765289307, + "logps/chosen": -163.17092895507812, + "logps/rejected": -185.933837890625, + "loss": 0.6115, "rewards/accuracies": 0.6875, - "rewards/chosen": -1.03895103931427, - "rewards/margins": 0.27460330724716187, - "rewards/rejected": -1.313554286956787, + "rewards/chosen": -0.9745320081710815, + "rewards/margins": 0.27206122875213623, + "rewards/rejected": -1.2465932369232178, "step": 2630 }, { "epoch": 0.45, - "grad_norm": 2.65625, + "grad_norm": 2.890625, "learning_rate": 3.315042155093334e-06, - "logits/chosen": -3.048037052154541, - "logits/rejected": -3.0404555797576904, - "logps/chosen": -171.00094604492188, - "logps/rejected": -193.4027557373047, - "loss": 0.6088, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0972424745559692, - "rewards/margins": 0.26539820432662964, - "rewards/rejected": -1.362640619277954, + "logits/chosen": -3.0559096336364746, + "logits/rejected": -3.048644781112671, + "logps/chosen": -165.0961151123047, + "logps/rejected": -187.40628051757812, + "loss": 0.6092, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0381942987442017, + "rewards/margins": 0.2644815742969513, + "rewards/rejected": -1.3026758432388306, "step": 2640 }, { "epoch": 0.46, - "grad_norm": 2.640625, + "grad_norm": 2.40625, "learning_rate": 3.300809048067692e-06, - "logits/chosen": -3.036726474761963, - "logits/rejected": -3.0282886028289795, - "logps/chosen": -178.0996856689453, - "logps/rejected": -201.49282836914062, - "loss": 0.6315, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1393941640853882, - "rewards/margins": 0.25805196166038513, - "rewards/rejected": -1.3974461555480957, + "logits/chosen": -3.0413713455200195, + "logits/rejected": -3.0334982872009277, + "logps/chosen": -174.01637268066406, + "logps/rejected": -196.48086547851562, + "loss": 0.6335, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0985610485076904, + "rewards/margins": 0.24876561760902405, + "rewards/rejected": -1.3473265171051025, "step": 2650 }, { "epoch": 0.46, - "grad_norm": 3.109375, + "grad_norm": 3.1875, "learning_rate": 3.2865469573523163e-06, - "logits/chosen": -3.0682671070098877, - "logits/rejected": -3.0621070861816406, - "logps/chosen": -177.4022979736328, - "logps/rejected": -191.1795654296875, - "loss": 0.6378, + "logits/chosen": -3.069272756576538, + "logits/rejected": -3.0634963512420654, + "logps/chosen": -175.96026611328125, + "logps/rejected": -190.06216430664062, + "loss": 0.6375, "rewards/accuracies": 0.625, - "rewards/chosen": -1.0945355892181396, - "rewards/margins": 0.21731004118919373, - "rewards/rejected": -1.3118455410003662, + "rewards/chosen": -1.0801150798797607, + "rewards/margins": 0.22055652737617493, + "rewards/rejected": -1.3006716966629028, "step": 2660 }, { "epoch": 0.46, - "grad_norm": 2.046875, + "grad_norm": 2.15625, "learning_rate": 3.2722563991351965e-06, - "logits/chosen": -3.0843520164489746, - "logits/rejected": -3.077232837677002, - "logps/chosen": -162.65525817871094, - "logps/rejected": -180.09799194335938, - "loss": 0.6406, + "logits/chosen": -3.0801496505737305, + "logits/rejected": -3.0725297927856445, + "logps/chosen": -162.62802124023438, + "logps/rejected": -181.03094482421875, + "loss": 0.6374, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.9851263165473938, - "rewards/margins": 0.2147480696439743, - "rewards/rejected": -1.1998745203018188, + "rewards/chosen": -0.9848538637161255, + "rewards/margins": 0.22435028851032257, + "rewards/rejected": -1.209204077720642, "step": 2670 }, { "epoch": 0.46, - "grad_norm": 1.96875, + "grad_norm": 1.984375, "learning_rate": 3.2579378906346464e-06, - "logits/chosen": -3.1301608085632324, - "logits/rejected": -3.1294732093811035, - "logps/chosen": -152.4020538330078, - "logps/rejected": -161.99658203125, - "loss": 0.6408, + "logits/chosen": -3.118258237838745, + "logits/rejected": -3.1177947521209717, + "logps/chosen": -155.3919219970703, + "logps/rejected": -165.44732666015625, + "loss": 0.6409, "rewards/accuracies": 0.625, - "rewards/chosen": -0.8279502987861633, - "rewards/margins": 0.17696353793144226, - "rewards/rejected": -1.0049139261245728, + "rewards/chosen": -0.8578490018844604, + "rewards/margins": 0.18157216906547546, + "rewards/rejected": -1.0394213199615479, "step": 2680 }, { "epoch": 0.46, - "grad_norm": 1.7578125, + "grad_norm": 1.859375, "learning_rate": 3.243591950080584e-06, - "logits/chosen": -3.1527717113494873, - "logits/rejected": -3.1481423377990723, - "logps/chosen": -139.64170837402344, - "logps/rejected": -159.52427673339844, - "loss": 0.6076, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.7912130951881409, - "rewards/margins": 0.23410753905773163, - "rewards/rejected": -1.0253206491470337, + "logits/chosen": -3.1438679695129395, + "logits/rejected": -3.1390693187713623, + "logps/chosen": -140.6311798095703, + "logps/rejected": -160.849853515625, + "loss": 0.607, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8011075854301453, + "rewards/margins": 0.2374686896800995, + "rewards/rejected": -1.038576364517212, "step": 2690 }, { "epoch": 0.47, - "grad_norm": 2.25, + "grad_norm": 2.3125, "learning_rate": 3.2292190966957776e-06, - "logits/chosen": -3.1262025833129883, - "logits/rejected": -3.1222102642059326, - "logps/chosen": -148.70486450195312, - "logps/rejected": -168.97586059570312, - "loss": 0.6282, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8543796539306641, - "rewards/margins": 0.22417394816875458, - "rewards/rejected": -1.078553557395935, + "logits/chosen": -3.122014045715332, + "logits/rejected": -3.1181957721710205, + "logps/chosen": -149.11021423339844, + "logps/rejected": -168.99923706054688, + "loss": 0.6295, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8584330677986145, + "rewards/margins": 0.22035415470600128, + "rewards/rejected": -1.0787872076034546, "step": 2700 }, { "epoch": 0.47, - "eval_logits/chosen": -3.122093915939331, - "eval_logits/rejected": -3.118544340133667, - "eval_logps/chosen": -140.73252868652344, - "eval_logps/rejected": -157.2623748779297, - "eval_loss": 0.6531208157539368, - "eval_rewards/accuracies": 0.6101301312446594, - "eval_rewards/chosen": -0.693225085735321, - "eval_rewards/margins": 0.12832820415496826, - "eval_rewards/rejected": -0.8215532302856445, - "eval_runtime": 483.8913, - "eval_samples_per_second": 8.895, + "eval_logits/chosen": -3.1114442348480225, + "eval_logits/rejected": -3.107997179031372, + "eval_logps/chosen": -139.27479553222656, + "eval_logps/rejected": -155.93223571777344, + "eval_loss": 0.6526122689247131, + "eval_rewards/accuracies": 0.6138476133346558, + "eval_rewards/chosen": -0.6786475777626038, + "eval_rewards/margins": 0.1296042650938034, + "eval_rewards/rejected": -0.8082518577575684, + "eval_runtime": 483.8293, + "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 2700 }, { "epoch": 0.47, - "grad_norm": 2.4375, + "grad_norm": 2.46875, "learning_rate": 3.21481985067705e-06, - "logits/chosen": -3.1101999282836914, - "logits/rejected": -3.1087584495544434, - "logps/chosen": -159.03683471679688, - "logps/rejected": -170.97689819335938, - "loss": 0.6379, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.8768089413642883, - "rewards/margins": 0.18777629733085632, - "rewards/rejected": -1.0645853281021118, + "logits/chosen": -3.1047282218933105, + "logits/rejected": -3.1033012866973877, + "logps/chosen": -157.62673950195312, + "logps/rejected": -169.35943603515625, + "loss": 0.6389, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8627079129219055, + "rewards/margins": 0.18570269644260406, + "rewards/rejected": -1.0484106540679932, "step": 2710 }, { "epoch": 0.47, - "grad_norm": 2.6875, + "grad_norm": 2.8125, "learning_rate": 3.200394733176454e-06, - "logits/chosen": -3.107483148574829, - "logits/rejected": -3.1052358150482178, - "logps/chosen": -151.2173309326172, - "logps/rejected": -175.96107482910156, - "loss": 0.6139, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.8823947906494141, - "rewards/margins": 0.2533818185329437, - "rewards/rejected": -1.1357766389846802, + "logits/chosen": -3.1050503253936768, + "logits/rejected": -3.1030993461608887, + "logps/chosen": -149.4019775390625, + "logps/rejected": -173.2121124267578, + "loss": 0.6179, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8642409443855286, + "rewards/margins": 0.24404600262641907, + "rewards/rejected": -1.1082870960235596, "step": 2720 }, { "epoch": 0.47, - "grad_norm": 3.25, + "grad_norm": 3.171875, "learning_rate": 3.1859442662824085e-06, - "logits/chosen": -3.103843927383423, - "logits/rejected": -3.0988357067108154, - "logps/chosen": -162.48377990722656, - "logps/rejected": -177.31251525878906, - "loss": 0.6416, + "logits/chosen": -3.099372386932373, + "logits/rejected": -3.094510555267334, + "logps/chosen": -160.30758666992188, + "logps/rejected": -175.3297576904297, + "loss": 0.6395, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.929785430431366, - "rewards/margins": 0.21908466517925262, - "rewards/rejected": -1.1488702297210693, + "rewards/chosen": -0.9080232381820679, + "rewards/margins": 0.2210191935300827, + "rewards/rejected": -1.1290425062179565, "step": 2730 }, { "epoch": 0.47, - "grad_norm": 2.15625, + "grad_norm": 2.328125, "learning_rate": 3.1714689730008043e-06, - "logits/chosen": -3.117023468017578, - "logits/rejected": -3.1135358810424805, - "logps/chosen": -149.63278198242188, - "logps/rejected": -160.5523223876953, - "loss": 0.6527, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.857519805431366, - "rewards/margins": 0.16857053339481354, - "rewards/rejected": -1.026090383529663, + "logits/chosen": -3.11287522315979, + "logits/rejected": -3.109023332595825, + "logps/chosen": -150.24667358398438, + "logps/rejected": -162.3335723876953, + "loss": 0.6471, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8636587262153625, + "rewards/margins": 0.18024428188800812, + "rewards/rejected": -1.043903112411499, "step": 2740 }, { "epoch": 0.47, - "grad_norm": 2.15625, + "grad_norm": 2.03125, "learning_rate": 3.156969377236072e-06, - "logits/chosen": -3.1075711250305176, - "logits/rejected": -3.0988144874572754, - "logps/chosen": -140.55865478515625, - "logps/rejected": -167.2582244873047, - "loss": 0.6084, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.8101563453674316, - "rewards/margins": 0.25592103600502014, - "rewards/rejected": -1.066077470779419, + "logits/chosen": -3.104238748550415, + "logits/rejected": -3.0955731868743896, + "logps/chosen": -141.8778839111328, + "logps/rejected": -168.66220092773438, + "loss": 0.6094, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8233487010002136, + "rewards/margins": 0.2567684054374695, + "rewards/rejected": -1.080117106437683, "step": 2750 }, { "epoch": 0.48, - "grad_norm": 3.3125, + "grad_norm": 3.125, "learning_rate": 3.1424460037722237e-06, - "logits/chosen": -3.096191644668579, - "logits/rejected": -3.0911002159118652, - "logps/chosen": -143.87744140625, - "logps/rejected": -162.12014770507812, - "loss": 0.6342, + "logits/chosen": -3.0943779945373535, + "logits/rejected": -3.089038133621216, + "logps/chosen": -144.514404296875, + "logps/rejected": -164.00999450683594, + "loss": 0.6297, "rewards/accuracies": 0.625, - "rewards/chosen": -0.7865437269210815, - "rewards/margins": 0.20428061485290527, - "rewards/rejected": -0.9908244013786316, + "rewards/chosen": -0.7929133176803589, + "rewards/margins": 0.21680936217308044, + "rewards/rejected": -1.0097228288650513, "step": 2760 }, { "epoch": 0.48, - "grad_norm": 2.4375, + "grad_norm": 2.5, "learning_rate": 3.127899378253858e-06, - "logits/chosen": -3.111184597015381, - "logits/rejected": -3.106191635131836, - "logps/chosen": -155.27023315429688, - "logps/rejected": -169.6674346923828, - "loss": 0.6283, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.8901050686836243, - "rewards/margins": 0.1963515430688858, - "rewards/rejected": -1.0864566564559937, + "logits/chosen": -3.110429525375366, + "logits/rejected": -3.1051766872406006, + "logps/chosen": -156.11631774902344, + "logps/rejected": -171.1414031982422, + "loss": 0.6267, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8985657691955566, + "rewards/margins": 0.20263083279132843, + "rewards/rejected": -1.1011966466903687, "step": 2770 }, { "epoch": 0.48, - "grad_norm": 2.21875, + "grad_norm": 2.234375, "learning_rate": 3.1133300271671354e-06, - "logits/chosen": -3.073341131210327, - "logits/rejected": -3.0657191276550293, - "logps/chosen": -164.946044921875, - "logps/rejected": -182.87684631347656, - "loss": 0.6265, + "logits/chosen": -3.076634168624878, + "logits/rejected": -3.069157600402832, + "logps/chosen": -164.27456665039062, + "logps/rejected": -182.2312469482422, + "loss": 0.6262, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.0226364135742188, - "rewards/margins": 0.22265009582042694, - "rewards/rejected": -1.2452863454818726, + "rewards/chosen": -1.0159214735031128, + "rewards/margins": 0.22290892899036407, + "rewards/rejected": -1.238830327987671, "step": 2780 }, { "epoch": 0.48, - "grad_norm": 2.265625, + "grad_norm": 2.328125, "learning_rate": 3.0987384778207218e-06, - "logits/chosen": -3.07016658782959, - "logits/rejected": -3.06632661819458, - "logps/chosen": -160.828857421875, - "logps/rejected": -181.21853637695312, - "loss": 0.6099, + "logits/chosen": -3.0719902515411377, + "logits/rejected": -3.0675671100616455, + "logps/chosen": -160.24789428710938, + "logps/rejected": -180.9109649658203, + "loss": 0.6101, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.9760662913322449, - "rewards/margins": 0.2529025971889496, - "rewards/rejected": -1.228968858718872, + "rewards/chosen": -0.9702569246292114, + "rewards/margins": 0.2556365132331848, + "rewards/rejected": -1.225893259048462, "step": 2790 }, { "epoch": 0.48, - "grad_norm": 2.203125, + "grad_norm": 2.40625, "learning_rate": 3.0841252583267067e-06, - "logits/chosen": -3.08182954788208, - "logits/rejected": -3.0774779319763184, - "logps/chosen": -178.40939331054688, - "logps/rejected": -192.86624145507812, - "loss": 0.6495, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.087641954421997, - "rewards/margins": 0.19885390996932983, - "rewards/rejected": -1.2864959239959717, + "logits/chosen": -3.0815820693969727, + "logits/rejected": -3.0771849155426025, + "logps/chosen": -179.7408447265625, + "logps/rejected": -193.85435485839844, + "loss": 0.6504, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1009565591812134, + "rewards/margins": 0.19542057812213898, + "rewards/rejected": -1.2963770627975464, "step": 2800 }, { "epoch": 0.48, - "eval_logits/chosen": -3.0925514698028564, - "eval_logits/rejected": -3.0887064933776855, - "eval_logps/chosen": -148.7371826171875, - "eval_logps/rejected": -166.3008575439453, - "eval_loss": 0.6517302989959717, - "eval_rewards/accuracies": 0.6080390214920044, - "eval_rewards/chosen": -0.7732716798782349, - "eval_rewards/margins": 0.1386662721633911, - "eval_rewards/rejected": -0.9119380116462708, - "eval_runtime": 483.7013, - "eval_samples_per_second": 8.898, + "eval_logits/chosen": -3.0914981365203857, + "eval_logits/rejected": -3.087722063064575, + "eval_logps/chosen": -149.51148986816406, + "eval_logps/rejected": -167.5323486328125, + "eval_loss": 0.651041567325592, + "eval_rewards/accuracies": 0.6105948090553284, + "eval_rewards/chosen": -0.7810146808624268, + "eval_rewards/margins": 0.14323832094669342, + "eval_rewards/rejected": -0.924252986907959, + "eval_runtime": 483.9389, + "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2800 }, { "epoch": 0.48, - "grad_norm": 2.59375, + "grad_norm": 2.46875, "learning_rate": 3.069490897581486e-06, - "logits/chosen": -3.089717388153076, - "logits/rejected": -3.0846445560455322, - "logps/chosen": -158.58767700195312, - "logps/rejected": -185.58558654785156, + "logits/chosen": -3.0918047428131104, + "logits/rejected": -3.0865800380706787, + "logps/chosen": -160.14015197753906, + "logps/rejected": -187.4204864501953, "loss": 0.5971, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9179424047470093, - "rewards/margins": 0.29216495156288147, - "rewards/rejected": -1.2101073265075684, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.933466911315918, + "rewards/margins": 0.29498928785324097, + "rewards/rejected": -1.2284562587738037, "step": 2810 }, { "epoch": 0.49, - "grad_norm": 2.875, + "grad_norm": 2.671875, "learning_rate": 3.054835925246622e-06, - "logits/chosen": -3.0747859477996826, - "logits/rejected": -3.0709011554718018, - "logps/chosen": -172.0844268798828, - "logps/rejected": -184.31924438476562, - "loss": 0.6543, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.068491816520691, - "rewards/margins": 0.17100703716278076, - "rewards/rejected": -1.2394988536834717, + "logits/chosen": -3.0843358039855957, + "logits/rejected": -3.080287218093872, + "logps/chosen": -170.4581756591797, + "logps/rejected": -182.52093505859375, + "loss": 0.655, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.0522292852401733, + "rewards/margins": 0.16928645968437195, + "rewards/rejected": -1.2215157747268677, "step": 2820 }, { "epoch": 0.49, - "grad_norm": 2.34375, + "grad_norm": 2.28125, "learning_rate": 3.040160871729672e-06, - "logits/chosen": -3.0449886322021484, - "logits/rejected": -3.0391552448272705, - "logps/chosen": -173.50198364257812, - "logps/rejected": -198.10621643066406, - "loss": 0.6103, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0645406246185303, - "rewards/margins": 0.28309398889541626, - "rewards/rejected": -1.3476345539093018, + "logits/chosen": -3.0598201751708984, + "logits/rejected": -3.0533511638641357, + "logps/chosen": -169.25344848632812, + "logps/rejected": -193.43211364746094, + "loss": 0.6093, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0220553874969482, + "rewards/margins": 0.2788383364677429, + "rewards/rejected": -1.3008936643600464, "step": 2830 }, { "epoch": 0.49, - "grad_norm": 2.671875, + "grad_norm": 2.4375, "learning_rate": 3.025466268164992e-06, - "logits/chosen": -3.068354368209839, - "logits/rejected": -3.0641281604766846, - "logps/chosen": -173.51577758789062, - "logps/rejected": -186.96096801757812, - "loss": 0.6479, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.0937964916229248, - "rewards/margins": 0.18635380268096924, - "rewards/rejected": -1.2801504135131836, + "logits/chosen": -3.083550453186035, + "logits/rejected": -3.0787971019744873, + "logps/chosen": -168.7285614013672, + "logps/rejected": -182.9897918701172, + "loss": 0.6431, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.045924186706543, + "rewards/margins": 0.1945144236087799, + "rewards/rejected": -1.24043869972229, "step": 2840 }, { "epoch": 0.49, - "grad_norm": 3.328125, + "grad_norm": 3.671875, "learning_rate": 3.0107526463945124e-06, - "logits/chosen": -3.0679664611816406, - "logits/rejected": -3.062669277191162, - "logps/chosen": -166.79176330566406, - "logps/rejected": -194.7895050048828, - "loss": 0.6058, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.03748619556427, - "rewards/margins": 0.28654587268829346, - "rewards/rejected": -1.3240320682525635, + "logits/chosen": -3.0826773643493652, + "logits/rejected": -3.077467679977417, + "logps/chosen": -163.4428253173828, + "logps/rejected": -191.730712890625, + "loss": 0.6031, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.003996729850769, + "rewards/margins": 0.2894473969936371, + "rewards/rejected": -1.293444275856018, "step": 2850 }, { "epoch": 0.49, - "grad_norm": 2.953125, + "grad_norm": 3.21875, "learning_rate": 2.9960205389484918e-06, - "logits/chosen": -3.078962564468384, - "logits/rejected": -3.074683666229248, - "logps/chosen": -166.84970092773438, - "logps/rejected": -187.77969360351562, - "loss": 0.6094, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.020708680152893, - "rewards/margins": 0.25686854124069214, - "rewards/rejected": -1.27757728099823, + "logits/chosen": -3.085514545440674, + "logits/rejected": -3.0808959007263184, + "logps/chosen": -165.62918090820312, + "logps/rejected": -186.27366638183594, + "loss": 0.6111, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0085035562515259, + "rewards/margins": 0.25401344895362854, + "rewards/rejected": -1.262516975402832, "step": 2860 }, { "epoch": 0.49, - "grad_norm": 2.609375, + "grad_norm": 2.78125, "learning_rate": 2.981270479026239e-06, - "logits/chosen": -3.0889625549316406, - "logits/rejected": -3.0859227180480957, - "logps/chosen": -175.95822143554688, - "logps/rejected": -191.1319122314453, - "loss": 0.6327, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.0878520011901855, - "rewards/margins": 0.22063109278678894, - "rewards/rejected": -1.3084831237792969, + "logits/chosen": -3.0913071632385254, + "logits/rejected": -3.0883071422576904, + "logps/chosen": -173.51954650878906, + "logps/rejected": -189.17471313476562, + "loss": 0.6285, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0634653568267822, + "rewards/margins": 0.22544582188129425, + "rewards/rejected": -1.28891122341156, "step": 2870 }, { "epoch": 0.5, - "grad_norm": 2.65625, + "grad_norm": 2.6875, "learning_rate": 2.9665030004768158e-06, - "logits/chosen": -3.0921990871429443, - "logits/rejected": -3.0831103324890137, - "logps/chosen": -172.875244140625, - "logps/rejected": -190.37728881835938, - "loss": 0.6286, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0688822269439697, - "rewards/margins": 0.24558386206626892, - "rewards/rejected": -1.314465880393982, + "logits/chosen": -3.0986485481262207, + "logits/rejected": -3.089444637298584, + "logps/chosen": -169.74688720703125, + "logps/rejected": -188.04348754882812, + "loss": 0.6236, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0375983715057373, + "rewards/margins": 0.25352948904037476, + "rewards/rejected": -1.2911279201507568, "step": 2880 }, { "epoch": 0.5, - "grad_norm": 3.34375, + "grad_norm": 3.09375, "learning_rate": 2.9517186377797203e-06, - "logits/chosen": -3.0864691734313965, - "logits/rejected": -3.081881046295166, - "logps/chosen": -163.46083068847656, - "logps/rejected": -187.11343383789062, - "loss": 0.6167, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0008008480072021, - "rewards/margins": 0.25917181372642517, - "rewards/rejected": -1.2599728107452393, + "logits/chosen": -3.0926268100738525, + "logits/rejected": -3.0881187915802, + "logps/chosen": -161.2325439453125, + "logps/rejected": -184.8660125732422, + "loss": 0.6159, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9785183072090149, + "rewards/margins": 0.25898051261901855, + "rewards/rejected": -1.2374987602233887, "step": 2890 }, { "epoch": 0.5, - "grad_norm": 2.703125, + "grad_norm": 2.609375, "learning_rate": 2.936917926025536e-06, - "logits/chosen": -3.0828022956848145, - "logits/rejected": -3.078491687774658, - "logps/chosen": -159.6892547607422, - "logps/rejected": -181.262451171875, - "loss": 0.6202, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9650337100028992, - "rewards/margins": 0.24236789345741272, - "rewards/rejected": -1.2074015140533447, + "logits/chosen": -3.0904502868652344, + "logits/rejected": -3.0861709117889404, + "logps/chosen": -155.18263244628906, + "logps/rejected": -175.84353637695312, + "loss": 0.6226, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.919967532157898, + "rewards/margins": 0.2332448959350586, + "rewards/rejected": -1.153212547302246, "step": 2900 }, { "epoch": 0.5, - "eval_logits/chosen": -3.0786919593811035, - "eval_logits/rejected": -3.074411392211914, - "eval_logps/chosen": -152.96585083007812, - "eval_logps/rejected": -170.98321533203125, - "eval_loss": 0.6511818170547485, - "eval_rewards/accuracies": 0.604786217212677, - "eval_rewards/chosen": -0.815558135509491, - "eval_rewards/margins": 0.143203467130661, - "eval_rewards/rejected": -0.9587615728378296, - "eval_runtime": 483.7714, - "eval_samples_per_second": 8.897, + "eval_logits/chosen": -3.086979389190674, + "eval_logits/rejected": -3.0831146240234375, + "eval_logps/chosen": -147.7837371826172, + "eval_logps/rejected": -165.611572265625, + "eval_loss": 0.6512519121170044, + "eval_rewards/accuracies": 0.6126858592033386, + "eval_rewards/chosen": -0.763737142086029, + "eval_rewards/margins": 0.14130805432796478, + "eval_rewards/rejected": -0.905045211315155, + "eval_runtime": 483.9421, + "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2900 }, { "epoch": 0.5, - "grad_norm": 2.359375, + "grad_norm": 2.484375, "learning_rate": 2.9221014008965686e-06, - "logits/chosen": -3.0856261253356934, - "logits/rejected": -3.0778799057006836, - "logps/chosen": -171.34780883789062, - "logps/rejected": -201.55828857421875, - "loss": 0.5884, + "logits/chosen": -3.097363233566284, + "logits/rejected": -3.0895471572875977, + "logps/chosen": -167.21290588378906, + "logps/rejected": -197.3682861328125, + "loss": 0.5858, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.042557716369629, - "rewards/margins": 0.33153969049453735, - "rewards/rejected": -1.3740973472595215, + "rewards/chosen": -1.0012085437774658, + "rewards/margins": 0.3309888243675232, + "rewards/rejected": -1.3321974277496338, "step": 2910 }, { "epoch": 0.5, - "grad_norm": 2.875, + "grad_norm": 3.015625, "learning_rate": 2.907269598647457e-06, - "logits/chosen": -3.02229905128479, - "logits/rejected": -3.0171494483947754, - "logps/chosen": -185.18301391601562, - "logps/rejected": -215.0197296142578, - "loss": 0.6061, + "logits/chosen": -3.0325138568878174, + "logits/rejected": -3.0270192623138428, + "logps/chosen": -183.08514404296875, + "logps/rejected": -214.2899627685547, + "loss": 0.5986, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2088510990142822, - "rewards/margins": 0.3223053812980652, - "rewards/rejected": -1.5311565399169922, + "rewards/chosen": -1.1878725290298462, + "rewards/margins": 0.3359866142272949, + "rewards/rejected": -1.5238590240478516, "step": 2920 }, { "epoch": 0.5, - "grad_norm": 2.53125, + "grad_norm": 2.78125, "learning_rate": 2.8924230560857657e-06, - "logits/chosen": -3.023981809616089, - "logits/rejected": -3.0177435874938965, - "logps/chosen": -178.6295623779297, - "logps/rejected": -202.44158935546875, - "loss": 0.6029, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1470363140106201, - "rewards/margins": 0.2915676534175873, - "rewards/rejected": -1.4386039972305298, + "logits/chosen": -3.0343759059906006, + "logits/rejected": -3.0279102325439453, + "logps/chosen": -178.95266723632812, + "logps/rejected": -201.8841094970703, + "loss": 0.6083, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.150267243385315, + "rewards/margins": 0.2827618718147278, + "rewards/rejected": -1.4330291748046875, "step": 2930 }, { "epoch": 0.51, - "grad_norm": 2.5, + "grad_norm": 2.5625, "learning_rate": 2.8775623105525557e-06, - "logits/chosen": -3.05517840385437, - "logits/rejected": -3.053927183151245, - "logps/chosen": -166.50283813476562, - "logps/rejected": -187.00949096679688, - "loss": 0.6236, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.0485191345214844, - "rewards/margins": 0.23477879166603088, - "rewards/rejected": -1.2832978963851929, + "logits/chosen": -3.0594565868377686, + "logits/rejected": -3.0583560466766357, + "logps/chosen": -169.06393432617188, + "logps/rejected": -190.04153442382812, + "loss": 0.6225, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0741300582885742, + "rewards/margins": 0.2394881695508957, + "rewards/rejected": -1.3136181831359863, "step": 2940 }, { "epoch": 0.51, - "grad_norm": 3.09375, + "grad_norm": 3.15625, "learning_rate": 2.8626878999029354e-06, - "logits/chosen": -3.046323299407959, - "logits/rejected": -3.040755033493042, - "logps/chosen": -176.8887939453125, - "logps/rejected": -198.10455322265625, - "loss": 0.6306, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.115063190460205, - "rewards/margins": 0.23778335750102997, - "rewards/rejected": -1.352846622467041, + "logits/chosen": -3.0534815788269043, + "logits/rejected": -3.048189878463745, + "logps/chosen": -179.89453125, + "logps/rejected": -201.58236694335938, + "loss": 0.6313, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.145120620727539, + "rewards/margins": 0.24250420928001404, + "rewards/rejected": -1.387624979019165, "step": 2950 }, { "epoch": 0.51, "grad_norm": 2.453125, "learning_rate": 2.847800362486596e-06, - "logits/chosen": -3.0453903675079346, - "logits/rejected": -3.0353028774261475, - "logps/chosen": -170.50367736816406, - "logps/rejected": -198.71578979492188, - "loss": 0.5976, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0572386980056763, - "rewards/margins": 0.32733428478240967, - "rewards/rejected": -1.384572982788086, + "logits/chosen": -3.055997848510742, + "logits/rejected": -3.0456926822662354, + "logps/chosen": -172.91397094726562, + "logps/rejected": -202.24942016601562, + "loss": 0.5928, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0813416242599487, + "rewards/margins": 0.3385675549507141, + "rewards/rejected": -1.4199092388153076, "step": 2960 }, { "epoch": 0.51, - "grad_norm": 3.1875, + "grad_norm": 3.125, "learning_rate": 2.832900237128325e-06, - "logits/chosen": -3.0243773460388184, - "logits/rejected": -3.02040958404541, - "logps/chosen": -179.3520965576172, - "logps/rejected": -198.31239318847656, - "loss": 0.6296, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1419975757598877, - "rewards/margins": 0.24163858592510223, - "rewards/rejected": -1.383636236190796, + "logits/chosen": -3.0329508781433105, + "logits/rejected": -3.029247760772705, + "logps/chosen": -182.95108032226562, + "logps/rejected": -201.630126953125, + "loss": 0.6314, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1779873371124268, + "rewards/margins": 0.2388262301683426, + "rewards/rejected": -1.416813611984253, "step": 2970 }, { "epoch": 0.51, - "grad_norm": 3.875, + "grad_norm": 3.640625, "learning_rate": 2.8179880631085053e-06, - "logits/chosen": -3.0174291133880615, - "logits/rejected": -3.0091402530670166, - "logps/chosen": -177.42556762695312, - "logps/rejected": -202.75509643554688, - "loss": 0.6054, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1225911378860474, - "rewards/margins": 0.30541062355041504, - "rewards/rejected": -1.4280017614364624, + "logits/chosen": -3.0308852195739746, + "logits/rejected": -3.022143840789795, + "logps/chosen": -176.36842346191406, + "logps/rejected": -202.55227661132812, + "loss": 0.603, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1120197772979736, + "rewards/margins": 0.31395381689071655, + "rewards/rejected": -1.4259734153747559, "step": 2980 }, { "epoch": 0.52, - "grad_norm": 3.03125, + "grad_norm": 3.015625, "learning_rate": 2.803064380143598e-06, - "logits/chosen": -3.0254709720611572, - "logits/rejected": -3.0239815711975098, - "logps/chosen": -189.5607452392578, - "logps/rejected": -203.75823974609375, - "loss": 0.6487, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2223767042160034, - "rewards/margins": 0.1916547566652298, - "rewards/rejected": -1.4140313863754272, + "logits/chosen": -3.0388948917388916, + "logits/rejected": -3.037651538848877, + "logps/chosen": -189.60171508789062, + "logps/rejected": -203.8285675048828, + "loss": 0.6495, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2227863073349, + "rewards/margins": 0.19194839894771576, + "rewards/rejected": -1.4147346019744873, "step": 2990 }, { "epoch": 0.52, - "grad_norm": 2.546875, + "grad_norm": 2.5625, "learning_rate": 2.7881297283666063e-06, - "logits/chosen": -3.0681405067443848, - "logits/rejected": -3.0591464042663574, - "logps/chosen": -164.3688201904297, - "logps/rejected": -189.56399536132812, - "loss": 0.6252, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.0210239887237549, - "rewards/margins": 0.24706561863422394, - "rewards/rejected": -1.2680894136428833, + "logits/chosen": -3.08270001411438, + "logits/rejected": -3.074023723602295, + "logps/chosen": -163.1550750732422, + "logps/rejected": -188.59869384765625, + "loss": 0.6226, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0088860988616943, + "rewards/margins": 0.24955037236213684, + "rewards/rejected": -1.2584365606307983, "step": 3000 }, { "epoch": 0.52, - "eval_logits/chosen": -3.082378625869751, - "eval_logits/rejected": -3.0782182216644287, - "eval_logps/chosen": -148.42666625976562, - "eval_logps/rejected": -166.3867950439453, - "eval_loss": 0.6505253314971924, - "eval_rewards/accuracies": 0.6054832935333252, - "eval_rewards/chosen": -0.7701665759086609, - "eval_rewards/margins": 0.14263089001178741, - "eval_rewards/rejected": -0.9127974510192871, - "eval_runtime": 483.7394, - "eval_samples_per_second": 8.897, + "eval_logits/chosen": -3.09553599357605, + "eval_logits/rejected": -3.0915791988372803, + "eval_logps/chosen": -145.16188049316406, + "eval_logps/rejected": -163.44444274902344, + "eval_loss": 0.6493727564811707, + "eval_rewards/accuracies": 0.6078066825866699, + "eval_rewards/chosen": -0.7375187873840332, + "eval_rewards/margins": 0.14585508406162262, + "eval_rewards/rejected": -0.8833737969398499, + "eval_runtime": 483.7912, + "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 3000 }, { "epoch": 0.52, - "grad_norm": 2.515625, + "grad_norm": 2.359375, "learning_rate": 2.77318464830753e-06, - "logits/chosen": -3.0817339420318604, - "logits/rejected": -3.075918674468994, - "logps/chosen": -163.93724060058594, - "logps/rejected": -179.66001892089844, - "loss": 0.6391, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9847095608711243, - "rewards/margins": 0.20721586048603058, - "rewards/rejected": -1.191925287246704, + "logits/chosen": -3.101335287094116, + "logits/rejected": -3.0958926677703857, + "logps/chosen": -160.12254333496094, + "logps/rejected": -175.15676879882812, + "loss": 0.6403, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.946562647819519, + "rewards/margins": 0.2003302127122879, + "rewards/rejected": -1.146892786026001, "step": 3010 }, { "epoch": 0.52, - "grad_norm": 2.171875, + "grad_norm": 2.09375, "learning_rate": 2.7582296808737964e-06, - "logits/chosen": -3.099862575531006, - "logits/rejected": -3.0948238372802734, - "logps/chosen": -164.39498901367188, - "logps/rejected": -180.0300750732422, - "loss": 0.633, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.9961506128311157, - "rewards/margins": 0.23695461452007294, - "rewards/rejected": -1.2331053018569946, + "logits/chosen": -3.123887300491333, + "logits/rejected": -3.1185977458953857, + "logps/chosen": -159.97726440429688, + "logps/rejected": -175.16766357421875, + "loss": 0.6336, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9519734382629395, + "rewards/margins": 0.23250770568847656, + "rewards/rejected": -1.184481143951416, "step": 3020 }, { "epoch": 0.52, - "grad_norm": 2.390625, + "grad_norm": 2.25, "learning_rate": 2.7432653673306896e-06, - "logits/chosen": -3.1014318466186523, - "logits/rejected": -3.0969395637512207, - "logps/chosen": -155.86209106445312, - "logps/rejected": -171.91366577148438, - "loss": 0.6356, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.9031952619552612, - "rewards/margins": 0.1848280131816864, - "rewards/rejected": -1.08802330493927, + "logits/chosen": -3.1237471103668213, + "logits/rejected": -3.119204521179199, + "logps/chosen": -151.10260009765625, + "logps/rejected": -166.8441162109375, + "loss": 0.6358, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8556004762649536, + "rewards/margins": 0.18172724545001984, + "rewards/rejected": -1.0373276472091675, "step": 3030 }, { "epoch": 0.52, - "grad_norm": 2.109375, + "grad_norm": 1.859375, "learning_rate": 2.7282922492817565e-06, - "logits/chosen": -3.1041133403778076, - "logits/rejected": -3.0992465019226074, - "logps/chosen": -151.39370727539062, - "logps/rejected": -179.11111450195312, + "logits/chosen": -3.132171154022217, + "logits/rejected": -3.1270885467529297, + "logps/chosen": -147.31736755371094, + "logps/rejected": -174.3421173095703, "loss": 0.6077, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.905647873878479, - "rewards/margins": 0.27351897954940796, - "rewards/rejected": -1.1791667938232422, + "rewards/chosen": -0.8648845553398132, + "rewards/margins": 0.26659253239631653, + "rewards/rejected": -1.1314771175384521, "step": 3040 }, { "epoch": 0.53, - "grad_norm": 2.875, + "grad_norm": 2.734375, "learning_rate": 2.7133108686492054e-06, - "logits/chosen": -3.074476957321167, - "logits/rejected": -3.074582576751709, - "logps/chosen": -155.0388641357422, - "logps/rejected": -175.10678100585938, - "loss": 0.631, + "logits/chosen": -3.099966049194336, + "logits/rejected": -3.1003365516662598, + "logps/chosen": -150.30726623535156, + "logps/rejected": -169.76614379882812, + "loss": 0.6307, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9062402844429016, - "rewards/margins": 0.2049740105867386, - "rewards/rejected": -1.111214280128479, + "rewards/chosen": -0.8589240908622742, + "rewards/margins": 0.1988839954137802, + "rewards/rejected": -1.0578080415725708, "step": 3050 }, { "epoch": 0.53, - "grad_norm": 2.5, + "grad_norm": 2.421875, "learning_rate": 2.6983217676542927e-06, - "logits/chosen": -3.061591148376465, - "logits/rejected": -3.0571389198303223, - "logps/chosen": -162.79901123046875, - "logps/rejected": -183.30624389648438, - "loss": 0.6187, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.964520275592804, - "rewards/margins": 0.23759868741035461, - "rewards/rejected": -1.2021191120147705, + "logits/chosen": -3.0891611576080322, + "logits/rejected": -3.0840840339660645, + "logps/chosen": -158.2369384765625, + "logps/rejected": -178.24795532226562, + "loss": 0.6191, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.918899655342102, + "rewards/margins": 0.23263654112815857, + "rewards/rejected": -1.151536226272583, "step": 3060 }, { "epoch": 0.53, - "grad_norm": 2.625, + "grad_norm": 2.609375, "learning_rate": 2.6833254887976974e-06, - "logits/chosen": -3.0584092140197754, - "logits/rejected": -3.0508856773376465, - "logps/chosen": -170.1864776611328, - "logps/rejected": -185.8191680908203, - "loss": 0.6235, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0173214673995972, - "rewards/margins": 0.22812402248382568, - "rewards/rejected": -1.2454454898834229, + "logits/chosen": -3.0840859413146973, + "logits/rejected": -3.076413631439209, + "logps/chosen": -166.41143798828125, + "logps/rejected": -182.2546844482422, + "loss": 0.6218, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9795713424682617, + "rewards/margins": 0.2302292138338089, + "rewards/rejected": -1.2098004817962646, "step": 3070 }, { "epoch": 0.53, - "grad_norm": 2.625, + "grad_norm": 2.421875, "learning_rate": 2.6683225748398877e-06, - "logits/chosen": -3.055368185043335, - "logits/rejected": -3.0461692810058594, - "logps/chosen": -166.15420532226562, - "logps/rejected": -192.15284729003906, - "loss": 0.6274, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.0636614561080933, - "rewards/margins": 0.24721452593803406, - "rewards/rejected": -1.3108760118484497, + "logits/chosen": -3.07651424407959, + "logits/rejected": -3.067636013031006, + "logps/chosen": -164.54656982421875, + "logps/rejected": -189.8894805908203, + "loss": 0.6297, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.047585129737854, + "rewards/margins": 0.2406575232744217, + "rewards/rejected": -1.2882425785064697, "step": 3080 }, { "epoch": 0.53, - "grad_norm": 2.890625, + "grad_norm": 2.796875, "learning_rate": 2.6533135687814753e-06, - "logits/chosen": -3.0698859691619873, - "logits/rejected": -3.0649948120117188, - "logps/chosen": -162.9439697265625, - "logps/rejected": -186.91714477539062, - "loss": 0.6026, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9822311401367188, - "rewards/margins": 0.2814427614212036, - "rewards/rejected": -1.2636739015579224, + "logits/chosen": -3.0946550369262695, + "logits/rejected": -3.089721202850342, + "logps/chosen": -162.77565002441406, + "logps/rejected": -187.4752960205078, + "loss": 0.5985, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9805480241775513, + "rewards/margins": 0.2887071967124939, + "rewards/rejected": -1.26925528049469, "step": 3090 }, { "epoch": 0.53, "grad_norm": 3.3125, "learning_rate": 2.638299013843564e-06, - "logits/chosen": -3.0506327152252197, - "logits/rejected": -3.0413401126861572, - "logps/chosen": -169.4080047607422, - "logps/rejected": -192.44485473632812, - "loss": 0.6082, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0290443897247314, - "rewards/margins": 0.290638267993927, - "rewards/rejected": -1.3196827173233032, + "logits/chosen": -3.072727918624878, + "logits/rejected": -3.063985824584961, + "logps/chosen": -169.7478790283203, + "logps/rejected": -192.89866638183594, + "loss": 0.6062, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0324430465698242, + "rewards/margins": 0.2917777895927429, + "rewards/rejected": -1.324220895767212, "step": 3100 }, { "epoch": 0.53, - "eval_logits/chosen": -3.072305202484131, - "eval_logits/rejected": -3.067795753479004, - "eval_logps/chosen": -149.20468139648438, - "eval_logps/rejected": -167.45477294921875, - "eval_loss": 0.6500055193901062, - "eval_rewards/accuracies": 0.6115241646766663, - "eval_rewards/chosen": -0.7779466509819031, - "eval_rewards/margins": 0.14553073048591614, - "eval_rewards/rejected": -0.9234774708747864, - "eval_runtime": 483.8699, + "eval_logits/chosen": -3.094945192337036, + "eval_logits/rejected": -3.090569257736206, + "eval_logps/chosen": -149.33984375, + "eval_logps/rejected": -168.2215118408203, + "eval_loss": 0.6485457420349121, + "eval_rewards/accuracies": 0.6129181981086731, + "eval_rewards/chosen": -0.7792982459068298, + "eval_rewards/margins": 0.15184608101844788, + "eval_rewards/rejected": -0.9311443567276001, + "eval_runtime": 483.8725, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 3100 }, { "epoch": 0.54, - "grad_norm": 2.890625, + "grad_norm": 2.703125, "learning_rate": 2.6232794534480866e-06, - "logits/chosen": -3.064969301223755, - "logits/rejected": -3.0633625984191895, - "logps/chosen": -165.35995483398438, - "logps/rejected": -188.6304168701172, - "loss": 0.6339, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.046263337135315, - "rewards/margins": 0.20431256294250488, - "rewards/rejected": -1.2505757808685303, + "logits/chosen": -3.0856406688690186, + "logits/rejected": -3.0835537910461426, + "logps/chosen": -166.58815002441406, + "logps/rejected": -190.39205932617188, + "loss": 0.6329, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0585448741912842, + "rewards/margins": 0.20964722335338593, + "rewards/rejected": -1.268192172050476, "step": 3110 }, { "epoch": 0.54, - "grad_norm": 2.4375, + "grad_norm": 2.234375, "learning_rate": 2.6082554311981425e-06, - "logits/chosen": -3.0731215476989746, - "logits/rejected": -3.0652499198913574, - "logps/chosen": -161.23680114746094, - "logps/rejected": -184.52694702148438, - "loss": 0.6057, + "logits/chosen": -3.0849928855895996, + "logits/rejected": -3.0772414207458496, + "logps/chosen": -164.77818298339844, + "logps/rejected": -188.58157348632812, + "loss": 0.6049, "rewards/accuracies": 0.65625, - "rewards/chosen": -0.9424988627433777, - "rewards/margins": 0.26860615611076355, - "rewards/rejected": -1.2111051082611084, + "rewards/chosen": -0.9779126048088074, + "rewards/margins": 0.27373841404914856, + "rewards/rejected": -1.2516510486602783, "step": 3120 }, { "epoch": 0.54, - "grad_norm": 2.984375, + "grad_norm": 3.34375, "learning_rate": 2.5932274908583146e-06, - "logits/chosen": -3.0556998252868652, - "logits/rejected": -3.0475969314575195, - "logps/chosen": -160.74111938476562, - "logps/rejected": -189.58358764648438, - "loss": 0.6158, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.9874521493911743, - "rewards/margins": 0.28721556067466736, - "rewards/rejected": -1.2746676206588745, + "logits/chosen": -3.06742787361145, + "logits/rejected": -3.0598721504211426, + "logps/chosen": -164.9550018310547, + "logps/rejected": -193.75918579101562, + "loss": 0.6183, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0295908451080322, + "rewards/margins": 0.2868326008319855, + "rewards/rejected": -1.3164234161376953, "step": 3130 }, { "epoch": 0.54, - "grad_norm": 2.6875, + "grad_norm": 2.703125, "learning_rate": 2.578196176334995e-06, - "logits/chosen": -3.0566956996917725, - "logits/rejected": -3.05082106590271, - "logps/chosen": -170.5960693359375, - "logps/rejected": -200.77806091308594, - "loss": 0.5993, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0843502283096313, - "rewards/margins": 0.3123868405818939, - "rewards/rejected": -1.3967368602752686, + "logits/chosen": -3.0695323944091797, + "logits/rejected": -3.0637080669403076, + "logps/chosen": -174.0780029296875, + "logps/rejected": -204.63009643554688, + "loss": 0.6004, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1191692352294922, + "rewards/margins": 0.316087931394577, + "rewards/rejected": -1.4352571964263916, "step": 3140 }, { "epoch": 0.54, - "grad_norm": 2.671875, + "grad_norm": 2.78125, "learning_rate": 2.5631620316566986e-06, - "logits/chosen": -3.0403594970703125, - "logits/rejected": -3.0373871326446533, - "logps/chosen": -172.71676635742188, - "logps/rejected": -191.23036193847656, - "loss": 0.6288, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.069809079170227, - "rewards/margins": 0.2294754534959793, - "rewards/rejected": -1.2992844581604004, + "logits/chosen": -3.055833578109741, + "logits/rejected": -3.0524821281433105, + "logps/chosen": -175.0342559814453, + "logps/rejected": -193.7977752685547, + "loss": 0.6291, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0929839611053467, + "rewards/margins": 0.2319747656583786, + "rewards/rejected": -1.3249588012695312, "step": 3150 }, { "epoch": 0.54, - "grad_norm": 2.171875, + "grad_norm": 2.125, "learning_rate": 2.548125600954371e-06, - "logits/chosen": -3.0161185264587402, - "logits/rejected": -3.0118935108184814, - "logps/chosen": -172.9065399169922, - "logps/rejected": -192.70851135253906, - "loss": 0.629, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.0712924003601074, - "rewards/margins": 0.25309473276138306, - "rewards/rejected": -1.3243870735168457, + "logits/chosen": -3.03719162940979, + "logits/rejected": -3.03302264213562, + "logps/chosen": -174.88192749023438, + "logps/rejected": -193.9909210205078, + "loss": 0.6338, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0910463333129883, + "rewards/margins": 0.24616487324237823, + "rewards/rejected": -1.3372113704681396, "step": 3160 }, { "epoch": 0.55, - "grad_norm": 2.3125, + "grad_norm": 2.265625, "learning_rate": 2.5330874284416956e-06, - "logits/chosen": -3.0632405281066895, - "logits/rejected": -3.053253650665283, - "logps/chosen": -173.99534606933594, - "logps/rejected": -190.8984375, - "loss": 0.6094, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.0698506832122803, - "rewards/margins": 0.27976253628730774, - "rewards/rejected": -1.3496131896972656, + "logits/chosen": -3.0866336822509766, + "logits/rejected": -3.0766632556915283, + "logps/chosen": -174.22366333007812, + "logps/rejected": -191.46463012695312, + "loss": 0.6076, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0721338987350464, + "rewards/margins": 0.2831415832042694, + "rewards/rejected": -1.3552755117416382, "step": 3170 }, { "epoch": 0.55, - "grad_norm": 2.765625, + "grad_norm": 2.609375, "learning_rate": 2.5180480583953974e-06, - "logits/chosen": -3.075591564178467, - "logits/rejected": -3.0682435035705566, - "logps/chosen": -169.34739685058594, - "logps/rejected": -191.22177124023438, - "loss": 0.6188, + "logits/chosen": -3.0933799743652344, + "logits/rejected": -3.0862841606140137, + "logps/chosen": -169.85255432128906, + "logps/rejected": -192.44534301757812, + "loss": 0.6166, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.057546615600586, - "rewards/margins": 0.26352807879447937, - "rewards/rejected": -1.3210748434066772, + "rewards/chosen": -1.0625982284545898, + "rewards/margins": 0.27071231603622437, + "rewards/rejected": -1.333310604095459, "step": 3180 }, { "epoch": 0.55, "grad_norm": 2.34375, "learning_rate": 2.5030080351355452e-06, - "logits/chosen": -3.079733371734619, - "logits/rejected": -3.0735690593719482, - "logps/chosen": -154.4625244140625, - "logps/rejected": -188.51358032226562, - "loss": 0.5911, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9075784683227539, - "rewards/margins": 0.32719746232032776, - "rewards/rejected": -1.2347759008407593, + "logits/chosen": -3.0938119888305664, + "logits/rejected": -3.0877671241760254, + "logps/chosen": -157.56088256835938, + "logps/rejected": -191.95864868164062, + "loss": 0.5895, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9385620951652527, + "rewards/margins": 0.3306645452976227, + "rewards/rejected": -1.2692269086837769, "step": 3190 }, { "epoch": 0.55, - "grad_norm": 2.171875, + "grad_norm": 2.109375, "learning_rate": 2.4879679030058478e-06, - "logits/chosen": -3.0768027305603027, - "logits/rejected": -3.0705323219299316, - "logps/chosen": -158.77996826171875, - "logps/rejected": -180.58169555664062, - "loss": 0.6072, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9090372323989868, - "rewards/margins": 0.2884618937969208, - "rewards/rejected": -1.1974990367889404, + "logits/chosen": -3.085869789123535, + "logits/rejected": -3.0796637535095215, + "logps/chosen": -163.99746704101562, + "logps/rejected": -186.37850952148438, + "loss": 0.6071, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9612120389938354, + "rewards/margins": 0.29425540566444397, + "rewards/rejected": -1.2554675340652466, "step": 3200 }, { "epoch": 0.55, - "eval_logits/chosen": -3.0862679481506348, - "eval_logits/rejected": -3.081866979598999, - "eval_logps/chosen": -147.08103942871094, - "eval_logps/rejected": -164.96694946289062, - "eval_loss": 0.6499215364456177, - "eval_rewards/accuracies": 0.6089683771133423, - "eval_rewards/chosen": -0.7567103505134583, - "eval_rewards/margins": 0.14188869297504425, - "eval_rewards/rejected": -0.8985989689826965, - "eval_runtime": 483.5034, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 1.113, + "eval_logits/chosen": -3.095604658126831, + "eval_logits/rejected": -3.0910513401031494, + "eval_logps/chosen": -151.8241729736328, + "eval_logps/rejected": -170.87753295898438, + "eval_loss": 0.6476815342903137, + "eval_rewards/accuracies": 0.6117565035820007, + "eval_rewards/chosen": -0.8041415214538574, + "eval_rewards/margins": 0.1535632312297821, + "eval_rewards/rejected": -0.9577047824859619, + "eval_runtime": 483.7242, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 1.112, "step": 3200 }, { "epoch": 0.55, - "grad_norm": 2.53125, + "grad_norm": 2.65625, "learning_rate": 2.472928206353955e-06, - "logits/chosen": -3.0501890182495117, - "logits/rejected": -3.041801929473877, - "logps/chosen": -160.53079223632812, - "logps/rejected": -183.5560302734375, - "loss": 0.6043, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.9603055715560913, - "rewards/margins": 0.26849913597106934, - "rewards/rejected": -1.228804588317871, + "logits/chosen": -3.0591721534729004, + "logits/rejected": -3.050657033920288, + "logps/chosen": -166.25405883789062, + "logps/rejected": -189.50100708007812, + "loss": 0.6049, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0175381898880005, + "rewards/margins": 0.27071613073349, + "rewards/rejected": -1.2882544994354248, "step": 3210 }, { "epoch": 0.55, - "grad_norm": 2.734375, + "grad_norm": 2.8125, "learning_rate": 2.4578894895117554e-06, - "logits/chosen": -3.0532243251800537, - "logits/rejected": -3.0485551357269287, - "logps/chosen": -156.84738159179688, - "logps/rejected": -185.34927368164062, - "loss": 0.6054, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9624822735786438, - "rewards/margins": 0.2839977741241455, - "rewards/rejected": -1.2464802265167236, + "logits/chosen": -3.061354637145996, + "logits/rejected": -3.0563549995422363, + "logps/chosen": -160.94094848632812, + "logps/rejected": -190.11013793945312, + "loss": 0.6035, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.00341796875, + "rewards/margins": 0.2906709313392639, + "rewards/rejected": -1.2940889596939087, "step": 3220 }, { "epoch": 0.56, - "grad_norm": 3.453125, + "grad_norm": 3.515625, "learning_rate": 2.442852296775674e-06, - "logits/chosen": -3.0451407432556152, - "logits/rejected": -3.040626287460327, - "logps/chosen": -164.8292694091797, - "logps/rejected": -191.13607788085938, - "loss": 0.6239, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0101099014282227, - "rewards/margins": 0.2563210129737854, - "rewards/rejected": -1.2664308547973633, + "logits/chosen": -3.0528101921081543, + "logits/rejected": -3.048557758331299, + "logps/chosen": -169.01211547851562, + "logps/rejected": -195.78076171875, + "loss": 0.6241, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.051938533782959, + "rewards/margins": 0.26093941926956177, + "rewards/rejected": -1.3128780126571655, "step": 3230 }, { "epoch": 0.56, - "grad_norm": 2.921875, + "grad_norm": 3.09375, "learning_rate": 2.427817172386977e-06, - "logits/chosen": -3.0670342445373535, - "logits/rejected": -3.06071138381958, - "logps/chosen": -174.47415161132812, - "logps/rejected": -188.95457458496094, - "loss": 0.6368, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0790221691131592, - "rewards/margins": 0.21176621317863464, - "rewards/rejected": -1.290788173675537, + "logits/chosen": -3.0747134685516357, + "logits/rejected": -3.0686464309692383, + "logps/chosen": -178.54055786132812, + "logps/rejected": -193.69073486328125, + "loss": 0.6366, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1196863651275635, + "rewards/margins": 0.2184637039899826, + "rewards/rejected": -1.338149905204773, "step": 3240 }, { "epoch": 0.56, - "grad_norm": 2.9375, + "grad_norm": 2.78125, "learning_rate": 2.412784660512068e-06, - "logits/chosen": -3.061464309692383, - "logits/rejected": -3.0560860633850098, - "logps/chosen": -163.1319580078125, - "logps/rejected": -184.18960571289062, - "loss": 0.6335, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.000069499015808, - "rewards/margins": 0.22641122341156006, - "rewards/rejected": -1.2264807224273682, + "logits/chosen": -3.0726211071014404, + "logits/rejected": -3.0671470165252686, + "logps/chosen": -165.47073364257812, + "logps/rejected": -186.94058227539062, + "loss": 0.6318, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0234571695327759, + "rewards/margins": 0.23053304851055145, + "rewards/rejected": -1.2539902925491333, "step": 3250 }, { "epoch": 0.56, - "grad_norm": 2.359375, + "grad_norm": 2.390625, "learning_rate": 2.397755305222797e-06, - "logits/chosen": -3.066709041595459, - "logits/rejected": -3.0596566200256348, - "logps/chosen": -156.2583770751953, - "logps/rejected": -181.06417846679688, - "loss": 0.6222, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9408279657363892, - "rewards/margins": 0.2763696610927582, - "rewards/rejected": -1.2171975374221802, + "logits/chosen": -3.0731208324432373, + "logits/rejected": -3.0661895275115967, + "logps/chosen": -158.9102325439453, + "logps/rejected": -184.57321166992188, + "loss": 0.6192, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9673464894294739, + "rewards/margins": 0.28494155406951904, + "rewards/rejected": -1.2522878646850586, "step": 3260 }, { "epoch": 0.56, - "grad_norm": 1.9921875, + "grad_norm": 1.9765625, "learning_rate": 2.3827296504767667e-06, - "logits/chosen": -3.0858919620513916, - "logits/rejected": -3.0763540267944336, - "logps/chosen": -166.6835479736328, - "logps/rejected": -190.68980407714844, - "loss": 0.6257, + "logits/chosen": -3.0933609008789062, + "logits/rejected": -3.083542823791504, + "logps/chosen": -169.42149353027344, + "logps/rejected": -194.79824829101562, + "loss": 0.6218, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0163527727127075, - "rewards/margins": 0.24425125122070312, - "rewards/rejected": -1.2606040239334106, + "rewards/chosen": -1.043731927871704, + "rewards/margins": 0.2579565644264221, + "rewards/rejected": -1.3016884326934814, "step": 3270 }, { "epoch": 0.57, - "grad_norm": 3.0, + "grad_norm": 2.921875, "learning_rate": 2.3677082400976473e-06, - "logits/chosen": -3.06581449508667, - "logits/rejected": -3.061218738555908, - "logps/chosen": -161.67579650878906, - "logps/rejected": -186.04104614257812, - "loss": 0.6247, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.017340064048767, - "rewards/margins": 0.23519904911518097, - "rewards/rejected": -1.2525392770767212, + "logits/chosen": -3.07395601272583, + "logits/rejected": -3.0692670345306396, + "logps/chosen": -164.40283203125, + "logps/rejected": -188.9150848388672, + "loss": 0.6246, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.044610619544983, + "rewards/margins": 0.2366691380739212, + "rewards/rejected": -1.2812796831130981, "step": 3280 }, { "epoch": 0.57, - "grad_norm": 2.6875, + "grad_norm": 2.9375, "learning_rate": 2.352691617755492e-06, - "logits/chosen": -3.052398204803467, - "logits/rejected": -3.039611339569092, - "logps/chosen": -168.47596740722656, - "logps/rejected": -199.4940185546875, - "loss": 0.5945, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.046691656112671, - "rewards/margins": 0.320436954498291, - "rewards/rejected": -1.367128610610962, + "logits/chosen": -3.0645198822021484, + "logits/rejected": -3.052039384841919, + "logps/chosen": -170.14920043945312, + "logps/rejected": -200.637451171875, + "loss": 0.5968, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0634241104125977, + "rewards/margins": 0.31513866782188416, + "rewards/rejected": -1.3785628080368042, "step": 3290 }, { "epoch": 0.57, - "grad_norm": 3.640625, + "grad_norm": 3.671875, "learning_rate": 2.3376803269470604e-06, - "logits/chosen": -3.0078794956207275, - "logits/rejected": -2.995847225189209, - "logps/chosen": -192.8760986328125, - "logps/rejected": -219.29867553710938, - "loss": 0.6142, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.276427149772644, - "rewards/margins": 0.30553415417671204, - "rewards/rejected": -1.5819613933563232, + "logits/chosen": -3.016010284423828, + "logits/rejected": -3.0037078857421875, + "logps/chosen": -195.9353485107422, + "logps/rejected": -224.1945343017578, + "loss": 0.608, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.307019591331482, + "rewards/margins": 0.3239001929759979, + "rewards/rejected": -1.6309198141098022, "step": 3300 }, { "epoch": 0.57, - "eval_logits/chosen": -3.0087318420410156, - "eval_logits/rejected": -3.002570629119873, - "eval_logps/chosen": -179.26649475097656, - "eval_logps/rejected": -200.59915161132812, - "eval_loss": 0.6467859745025635, - "eval_rewards/accuracies": 0.6175650358200073, - "eval_rewards/chosen": -1.0785648822784424, - "eval_rewards/margins": 0.17635630071163177, - "eval_rewards/rejected": -1.2549211978912354, - "eval_runtime": 484.158, - "eval_samples_per_second": 8.89, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.006425142288208, + "eval_logits/rejected": -3.0002307891845703, + "eval_logps/chosen": -182.5597381591797, + "eval_logps/rejected": -204.84671020507812, + "eval_loss": 0.6460632085800171, + "eval_rewards/accuracies": 0.6150093078613281, + "eval_rewards/chosen": -1.111497402191162, + "eval_rewards/margins": 0.18589934706687927, + "eval_rewards/rejected": -1.2973966598510742, + "eval_runtime": 483.9633, + "eval_samples_per_second": 8.893, + "eval_steps_per_second": 1.112, "step": 3300 }, { "epoch": 0.57, - "grad_norm": 2.5, + "grad_norm": 2.65625, "learning_rate": 2.3226749109761475e-06, - "logits/chosen": -2.9723334312438965, - "logits/rejected": -2.9634299278259277, - "logps/chosen": -190.951904296875, - "logps/rejected": -217.7271728515625, - "loss": 0.6006, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.264145851135254, - "rewards/margins": 0.32409703731536865, - "rewards/rejected": -1.5882428884506226, + "logits/chosen": -2.974262237548828, + "logits/rejected": -2.9650983810424805, + "logps/chosen": -196.76895141601562, + "logps/rejected": -226.045654296875, + "loss": 0.5954, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3223161697387695, + "rewards/margins": 0.3491113781929016, + "rewards/rejected": -1.6714274883270264, "step": 3310 }, { "epoch": 0.57, - "grad_norm": 2.234375, + "grad_norm": 2.3125, "learning_rate": 2.3076759129339222e-06, - "logits/chosen": -3.0012853145599365, - "logits/rejected": -2.989985704421997, - "logps/chosen": -189.26229858398438, - "logps/rejected": -215.1383056640625, - "loss": 0.5977, + "logits/chosen": -3.0030081272125244, + "logits/rejected": -2.991539478302002, + "logps/chosen": -195.56874084472656, + "logps/rejected": -222.0810089111328, + "loss": 0.5985, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2482668161392212, - "rewards/margins": 0.33457013964653015, - "rewards/rejected": -1.5828371047973633, + "rewards/chosen": -1.3113313913345337, + "rewards/margins": 0.3409323990345001, + "rewards/rejected": -1.6522636413574219, "step": 3320 }, { "epoch": 0.57, - "grad_norm": 2.640625, + "grad_norm": 2.8125, "learning_rate": 2.2926838756792668e-06, - "logits/chosen": -3.023132801055908, - "logits/rejected": -3.014843463897705, - "logps/chosen": -178.64822387695312, - "logps/rejected": -212.94601440429688, - "loss": 0.5775, + "logits/chosen": -3.024853229522705, + "logits/rejected": -3.0167689323425293, + "logps/chosen": -182.8240966796875, + "logps/rejected": -218.2233428955078, + "loss": 0.577, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1388877630233765, - "rewards/margins": 0.37283557653427124, - "rewards/rejected": -1.511723279953003, + "rewards/chosen": -1.1806464195251465, + "rewards/margins": 0.3838498890399933, + "rewards/rejected": -1.5644962787628174, "step": 3330 }, { "epoch": 0.58, - "grad_norm": 4.28125, + "grad_norm": 4.71875, "learning_rate": 2.2776993418191332e-06, - "logits/chosen": -3.0232415199279785, - "logits/rejected": -3.01299786567688, - "logps/chosen": -187.7949676513672, - "logps/rejected": -215.0971221923828, - "loss": 0.606, + "logits/chosen": -3.0294406414031982, + "logits/rejected": -3.019160509109497, + "logps/chosen": -192.11871337890625, + "logps/rejected": -219.72607421875, + "loss": 0.6074, "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2033436298370361, - "rewards/margins": 0.3320937752723694, - "rewards/rejected": -1.5354373455047607, + "rewards/chosen": -1.2465808391571045, + "rewards/margins": 0.33514589071273804, + "rewards/rejected": -1.5817267894744873, "step": 3340 }, { "epoch": 0.58, - "grad_norm": 2.21875, + "grad_norm": 2.265625, "learning_rate": 2.262722853688902e-06, - "logits/chosen": -3.015946865081787, - "logits/rejected": -3.005009889602661, - "logps/chosen": -181.58934020996094, - "logps/rejected": -211.24002075195312, - "loss": 0.5997, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1483066082000732, - "rewards/margins": 0.327314555644989, - "rewards/rejected": -1.475620985031128, + "logits/chosen": -3.0237231254577637, + "logits/rejected": -3.012474298477173, + "logps/chosen": -185.7919158935547, + "logps/rejected": -216.2095184326172, + "loss": 0.6007, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1903321743011475, + "rewards/margins": 0.3349839448928833, + "rewards/rejected": -1.5253162384033203, "step": 3350 }, { "epoch": 0.58, - "grad_norm": 3.03125, + "grad_norm": 3.375, "learning_rate": 2.247754953332754e-06, - "logits/chosen": -3.0162365436553955, - "logits/rejected": -3.0121781826019287, - "logps/chosen": -179.1669158935547, - "logps/rejected": -198.03746032714844, - "loss": 0.6294, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1394041776657104, - "rewards/margins": 0.24760189652442932, - "rewards/rejected": -1.3870060443878174, + "logits/chosen": -3.022948741912842, + "logits/rejected": -3.0198607444763184, + "logps/chosen": -184.82015991210938, + "logps/rejected": -203.89492797851562, + "loss": 0.6322, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.195936918258667, + "rewards/margins": 0.24964404106140137, + "rewards/rejected": -1.4455807209014893, "step": 3360 }, { "epoch": 0.58, - "grad_norm": 2.953125, + "grad_norm": 2.859375, "learning_rate": 2.2327961824840564e-06, - "logits/chosen": -3.0204949378967285, - "logits/rejected": -3.0136327743530273, - "logps/chosen": -171.4439239501953, - "logps/rejected": -204.19378662109375, - "loss": 0.5862, + "logits/chosen": -3.028592586517334, + "logits/rejected": -3.0219027996063232, + "logps/chosen": -175.58358764648438, + "logps/rejected": -209.2071075439453, + "loss": 0.5844, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.0856364965438843, - "rewards/margins": 0.338853657245636, - "rewards/rejected": -1.424490213394165, + "rewards/chosen": -1.1270334720611572, + "rewards/margins": 0.3475898206233978, + "rewards/rejected": -1.4746233224868774, "step": 3370 }, { "epoch": 0.58, - "grad_norm": 3.484375, + "grad_norm": 3.578125, "learning_rate": 2.2178470825457464e-06, - "logits/chosen": -3.031080484390259, - "logits/rejected": -3.026292562484741, - "logps/chosen": -176.1151580810547, - "logps/rejected": -195.78326416015625, - "loss": 0.6171, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.095861554145813, - "rewards/margins": 0.27515116333961487, - "rewards/rejected": -1.3710126876831055, + "logits/chosen": -3.043968677520752, + "logits/rejected": -3.039705753326416, + "logps/chosen": -179.0226593017578, + "logps/rejected": -198.73789978027344, + "loss": 0.6178, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1249364614486694, + "rewards/margins": 0.27562254667282104, + "rewards/rejected": -1.4005590677261353, "step": 3380 }, { "epoch": 0.58, - "grad_norm": 2.6875, + "grad_norm": 2.75, "learning_rate": 2.2029081945707473e-06, - "logits/chosen": -3.0416462421417236, - "logits/rejected": -3.0330119132995605, - "logps/chosen": -168.8120574951172, - "logps/rejected": -191.52745056152344, - "loss": 0.6212, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.056342363357544, - "rewards/margins": 0.2708873152732849, - "rewards/rejected": -1.3272297382354736, + "logits/chosen": -3.054203748703003, + "logits/rejected": -3.0460638999938965, + "logps/chosen": -171.74972534179688, + "logps/rejected": -194.30918884277344, + "loss": 0.6249, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.0857192277908325, + "rewards/margins": 0.2693277895450592, + "rewards/rejected": -1.3550468683242798, "step": 3390 }, { "epoch": 0.59, - "grad_norm": 2.625, + "grad_norm": 2.484375, "learning_rate": 2.1879800592423758e-06, - "logits/chosen": -3.0537242889404297, - "logits/rejected": -3.044168472290039, - "logps/chosen": -166.37783813476562, - "logps/rejected": -195.04409790039062, - "loss": 0.602, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.031355619430542, - "rewards/margins": 0.329393595457077, - "rewards/rejected": -1.3607490062713623, + "logits/chosen": -3.06596040725708, + "logits/rejected": -3.0564138889312744, + "logps/chosen": -168.08819580078125, + "logps/rejected": -197.4599609375, + "loss": 0.5996, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0484591722488403, + "rewards/margins": 0.33644840121269226, + "rewards/rejected": -1.3849074840545654, "step": 3400 }, { "epoch": 0.59, - "eval_logits/chosen": -3.0674045085906982, - "eval_logits/rejected": -3.0623619556427, - "eval_logps/chosen": -150.30816650390625, - "eval_logps/rejected": -168.40872192382812, - "eval_loss": 0.6503540873527527, - "eval_rewards/accuracies": 0.6136152148246765, - "eval_rewards/chosen": -0.7889814972877502, - "eval_rewards/margins": 0.14403526484966278, - "eval_rewards/rejected": -0.933016836643219, - "eval_runtime": 492.8396, - "eval_samples_per_second": 8.733, - "eval_steps_per_second": 1.092, + "eval_logits/chosen": -3.0741565227508545, + "eval_logits/rejected": -3.0691306591033936, + "eval_logps/chosen": -151.0113067626953, + "eval_logps/rejected": -169.9129180908203, + "eval_loss": 0.6485846638679504, + "eval_rewards/accuracies": 0.609897792339325, + "eval_rewards/chosen": -0.7960128784179688, + "eval_rewards/margins": 0.15204598009586334, + "eval_rewards/rejected": -0.9480588436126709, + "eval_runtime": 483.9218, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 3400 }, { "epoch": 0.59, - "grad_norm": 2.828125, + "grad_norm": 2.640625, "learning_rate": 2.1730632168547807e-06, - "logits/chosen": -3.045454263687134, - "logits/rejected": -3.0381667613983154, - "logps/chosen": -153.18984985351562, - "logps/rejected": -168.77667236328125, - "loss": 0.6296, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.931566059589386, - "rewards/margins": 0.20537595450878143, - "rewards/rejected": -1.1369420289993286, + "logits/chosen": -3.0579450130462646, + "logits/rejected": -3.050424098968506, + "logps/chosen": -154.18807983398438, + "logps/rejected": -171.04515075683594, + "loss": 0.6244, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9415484666824341, + "rewards/margins": 0.21807841956615448, + "rewards/rejected": -1.1596269607543945, "step": 3410 }, { "epoch": 0.59, - "grad_norm": 4.1875, + "grad_norm": 4.21875, "learning_rate": 2.1581582072933873e-06, - "logits/chosen": -3.0582358837127686, - "logits/rejected": -3.053211212158203, - "logps/chosen": -157.33743286132812, - "logps/rejected": -184.45919799804688, - "loss": 0.5964, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.9351167678833008, - "rewards/margins": 0.2938837707042694, - "rewards/rejected": -1.2290005683898926, + "logits/chosen": -3.071281909942627, + "logits/rejected": -3.0659172534942627, + "logps/chosen": -158.43789672851562, + "logps/rejected": -185.93618774414062, + "loss": 0.594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9461215138435364, + "rewards/margins": 0.29764896631240845, + "rewards/rejected": -1.2437703609466553, "step": 3420 }, { "epoch": 0.59, - "grad_norm": 2.640625, + "grad_norm": 2.71875, "learning_rate": 2.1432655700153496e-06, - "logits/chosen": -3.0582404136657715, - "logits/rejected": -3.050635576248169, - "logps/chosen": -165.08010864257812, - "logps/rejected": -189.5339813232422, - "loss": 0.6278, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.032097578048706, - "rewards/margins": 0.23791618645191193, - "rewards/rejected": -1.2700138092041016, + "logits/chosen": -3.0646018981933594, + "logits/rejected": -3.0569794178009033, + "logps/chosen": -168.60687255859375, + "logps/rejected": -194.6256103515625, + "loss": 0.6238, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0673654079437256, + "rewards/margins": 0.2535645067691803, + "rewards/rejected": -1.320929765701294, "step": 3430 }, { "epoch": 0.59, - "grad_norm": 2.8125, + "grad_norm": 2.953125, "learning_rate": 2.1283858440300376e-06, - "logits/chosen": -3.0382144451141357, - "logits/rejected": -3.025315761566162, - "logps/chosen": -173.4113006591797, - "logps/rejected": -204.8324432373047, - "loss": 0.5897, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.112926721572876, - "rewards/margins": 0.3371250629425049, - "rewards/rejected": -1.4500519037246704, + "logits/chosen": -3.046886920928955, + "logits/rejected": -3.034882068634033, + "logps/chosen": -177.31863403320312, + "logps/rejected": -208.75552368164062, + "loss": 0.5924, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1520001888275146, + "rewards/margins": 0.33728262782096863, + "rewards/rejected": -1.4892828464508057, "step": 3440 }, { "epoch": 0.59, - "grad_norm": 2.609375, + "grad_norm": 2.625, "learning_rate": 2.113519567879517e-06, - "logits/chosen": -3.0533881187438965, - "logits/rejected": -3.050487995147705, - "logps/chosen": -183.76519775390625, - "logps/rejected": -200.22569274902344, - "loss": 0.6285, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1319319009780884, - "rewards/margins": 0.24390754103660583, - "rewards/rejected": -1.375839352607727, + "logits/chosen": -3.0611815452575684, + "logits/rejected": -3.0584826469421387, + "logps/chosen": -188.48440551757812, + "logps/rejected": -204.76821899414062, + "loss": 0.6298, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.179123878479004, + "rewards/margins": 0.24214068055152893, + "rewards/rejected": -1.4212645292282104, "step": 3450 }, { "epoch": 0.6, - "grad_norm": 2.671875, + "grad_norm": 2.4375, "learning_rate": 2.098667279619069e-06, - "logits/chosen": -3.0273451805114746, - "logits/rejected": -3.0172743797302246, - "logps/chosen": -165.3519744873047, - "logps/rejected": -193.7327117919922, - "loss": 0.6054, + "logits/chosen": -3.0351967811584473, + "logits/rejected": -3.0250942707061768, + "logps/chosen": -169.76300048828125, + "logps/rejected": -197.97808837890625, + "loss": 0.6074, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0217342376708984, - "rewards/margins": 0.29740262031555176, - "rewards/rejected": -1.3191369771957397, + "rewards/chosen": -1.0658444166183472, + "rewards/margins": 0.2957460880279541, + "rewards/rejected": -1.3615906238555908, "step": 3460 }, { "epoch": 0.6, - "grad_norm": 2.8125, + "grad_norm": 2.421875, "learning_rate": 2.0838295167977066e-06, - "logits/chosen": -3.0574910640716553, - "logits/rejected": -3.0506603717803955, - "logps/chosen": -173.7638397216797, - "logps/rejected": -199.7215118408203, - "loss": 0.6039, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0668748617172241, - "rewards/margins": 0.31231045722961426, - "rewards/rejected": -1.379185438156128, + "logits/chosen": -3.0645134449005127, + "logits/rejected": -3.057884454727173, + "logps/chosen": -178.63186645507812, + "logps/rejected": -204.76443481445312, + "loss": 0.6068, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1155550479888916, + "rewards/margins": 0.3140595555305481, + "rewards/rejected": -1.4296146631240845, "step": 3470 }, { "epoch": 0.6, - "grad_norm": 2.96875, + "grad_norm": 2.859375, "learning_rate": 2.069006816438725e-06, - "logits/chosen": -3.0340256690979004, - "logits/rejected": -3.024568796157837, - "logps/chosen": -173.4439697265625, - "logps/rejected": -197.1393280029297, - "loss": 0.6132, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0758858919143677, - "rewards/margins": 0.2928495705127716, - "rewards/rejected": -1.368735432624817, + "logits/chosen": -3.0422568321228027, + "logits/rejected": -3.0331058502197266, + "logps/chosen": -178.1131134033203, + "logps/rejected": -202.82235717773438, + "loss": 0.6116, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.122577428817749, + "rewards/margins": 0.30298852920532227, + "rewards/rejected": -1.4255659580230713, "step": 3480 }, { "epoch": 0.6, - "grad_norm": 3.40625, + "grad_norm": 3.34375, "learning_rate": 2.054199715020266e-06, - "logits/chosen": -3.0460727214813232, - "logits/rejected": -3.0427908897399902, - "logps/chosen": -176.40223693847656, - "logps/rejected": -196.90025329589844, - "loss": 0.629, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1191596984863281, - "rewards/margins": 0.22387664020061493, - "rewards/rejected": -1.343036413192749, + "logits/chosen": -3.0578956604003906, + "logits/rejected": -3.0546748638153076, + "logps/chosen": -181.18801879882812, + "logps/rejected": -203.0323944091797, + "loss": 0.6258, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1670176982879639, + "rewards/margins": 0.23733997344970703, + "rewards/rejected": -1.404357671737671, "step": 3490 }, { "epoch": 0.6, - "grad_norm": 2.640625, + "grad_norm": 2.6875, "learning_rate": 2.039408748455894e-06, - "logits/chosen": -3.0397393703460693, - "logits/rejected": -3.0337138175964355, - "logps/chosen": -166.6004638671875, - "logps/rejected": -191.49612426757812, - "loss": 0.605, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0314760208129883, - "rewards/margins": 0.26015573740005493, - "rewards/rejected": -1.2916316986083984, + "logits/chosen": -3.053438186645508, + "logits/rejected": -3.047560453414917, + "logps/chosen": -170.10067749023438, + "logps/rejected": -194.56643676757812, + "loss": 0.6081, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0664782524108887, + "rewards/margins": 0.2558566927909851, + "rewards/rejected": -1.3223350048065186, "step": 3500 }, { "epoch": 0.6, - "eval_logits/chosen": -3.058988332748413, - "eval_logits/rejected": -3.0537822246551514, - "eval_logps/chosen": -154.17901611328125, - "eval_logps/rejected": -172.9109344482422, - "eval_loss": 0.6496783494949341, - "eval_rewards/accuracies": 0.6122211813926697, - "eval_rewards/chosen": -0.8276901245117188, - "eval_rewards/margins": 0.15034890174865723, - "eval_rewards/rejected": -0.9780389070510864, - "eval_runtime": 497.1419, - "eval_samples_per_second": 8.657, - "eval_steps_per_second": 1.082, + "eval_logits/chosen": -3.068061590194702, + "eval_logits/rejected": -3.06298828125, + "eval_logps/chosen": -154.95416259765625, + "eval_logps/rejected": -174.41162109375, + "eval_loss": 0.6478354334831238, + "eval_rewards/accuracies": 0.6157063245773315, + "eval_rewards/chosen": -0.8354412913322449, + "eval_rewards/margins": 0.15760457515716553, + "eval_rewards/rejected": -0.9930458664894104, + "eval_runtime": 483.9379, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 3500 }, { "epoch": 0.6, - "grad_norm": 2.25, + "grad_norm": 2.28125, "learning_rate": 2.024634452075209e-06, - "logits/chosen": -3.0433077812194824, - "logits/rejected": -3.0365288257598877, - "logps/chosen": -169.48248291015625, - "logps/rejected": -189.098388671875, - "loss": 0.6227, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.0600249767303467, - "rewards/margins": 0.2405003011226654, - "rewards/rejected": -1.3005253076553345, + "logits/chosen": -3.0592658519744873, + "logits/rejected": -3.0531296730041504, + "logps/chosen": -170.28494262695312, + "logps/rejected": -189.07864379882812, + "loss": 0.627, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.068049430847168, + "rewards/margins": 0.23227830231189728, + "rewards/rejected": -1.3003276586532593, "step": 3510 }, { "epoch": 0.61, - "grad_norm": 2.671875, + "grad_norm": 2.625, "learning_rate": 2.0098773606044627e-06, - "logits/chosen": -3.042524814605713, - "logits/rejected": -3.03288197517395, - "logps/chosen": -166.3798370361328, - "logps/rejected": -187.0223846435547, - "loss": 0.6148, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.0103702545166016, - "rewards/margins": 0.2540992498397827, - "rewards/rejected": -1.2644695043563843, + "logits/chosen": -3.0587456226348877, + "logits/rejected": -3.0494563579559326, + "logps/chosen": -165.07681274414062, + "logps/rejected": -185.6932373046875, + "loss": 0.6146, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9973398447036743, + "rewards/margins": 0.2538382112979889, + "rewards/rejected": -1.2511780261993408, "step": 3520 }, { "epoch": 0.61, - "grad_norm": 4.6875, + "grad_norm": 4.3125, "learning_rate": 1.9951380081472135e-06, - "logits/chosen": -3.0522756576538086, - "logits/rejected": -3.0436208248138428, - "logps/chosen": -171.316650390625, - "logps/rejected": -194.4859619140625, - "loss": 0.6068, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0492935180664062, - "rewards/margins": 0.3023799955844879, - "rewards/rejected": -1.3516733646392822, + "logits/chosen": -3.0682685375213623, + "logits/rejected": -3.059941530227661, + "logps/chosen": -169.71578979492188, + "logps/rejected": -192.5686492919922, + "loss": 0.6077, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0332850217819214, + "rewards/margins": 0.29921552538871765, + "rewards/rejected": -1.332500696182251, "step": 3530 }, { "epoch": 0.61, - "grad_norm": 2.890625, + "grad_norm": 2.84375, "learning_rate": 1.9804169281649873e-06, - "logits/chosen": -3.038957118988037, - "logits/rejected": -3.0326712131500244, - "logps/chosen": -171.6193389892578, - "logps/rejected": -189.50172424316406, - "loss": 0.6394, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.062083125114441, - "rewards/margins": 0.2282875031232834, - "rewards/rejected": -1.2903707027435303, + "logits/chosen": -3.0553269386291504, + "logits/rejected": -3.0493812561035156, + "logps/chosen": -169.69444274902344, + "logps/rejected": -186.82571411132812, + "loss": 0.6423, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0428342819213867, + "rewards/margins": 0.22077639400959015, + "rewards/rejected": -1.2636107206344604, "step": 3540 }, { "epoch": 0.61, - "grad_norm": 2.75, + "grad_norm": 2.640625, "learning_rate": 1.965714653457979e-06, - "logits/chosen": -3.0525827407836914, - "logits/rejected": -3.0477380752563477, - "logps/chosen": -173.34603881835938, - "logps/rejected": -188.1285858154297, - "loss": 0.6458, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.0868390798568726, - "rewards/margins": 0.18998117744922638, - "rewards/rejected": -1.276820421218872, + "logits/chosen": -3.0672309398651123, + "logits/rejected": -3.0627758502960205, + "logps/chosen": -171.85525512695312, + "logps/rejected": -186.51995849609375, + "loss": 0.6456, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0719311237335205, + "rewards/margins": 0.1888027787208557, + "rewards/rejected": -1.260733962059021, "step": 3550 }, { "epoch": 0.61, - "grad_norm": 3.484375, + "grad_norm": 3.21875, "learning_rate": 1.9510317161457586e-06, - "logits/chosen": -3.055346965789795, - "logits/rejected": -3.047273874282837, - "logps/chosen": -164.53245544433594, - "logps/rejected": -185.46621704101562, - "loss": 0.6174, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.9918788075447083, - "rewards/margins": 0.25856080651283264, - "rewards/rejected": -1.2504394054412842, + "logits/chosen": -3.0688438415527344, + "logits/rejected": -3.061432361602783, + "logps/chosen": -163.94273376464844, + "logps/rejected": -184.4805908203125, + "loss": 0.6189, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9859815835952759, + "rewards/margins": 0.2546016573905945, + "rewards/rejected": -1.2405831813812256, "step": 3560 }, { "epoch": 0.62, - "grad_norm": 2.703125, + "grad_norm": 2.890625, "learning_rate": 1.936368647648022e-06, - "logits/chosen": -3.0525732040405273, - "logits/rejected": -3.0441994667053223, - "logps/chosen": -179.19534301757812, - "logps/rejected": -193.872314453125, - "loss": 0.6613, + "logits/chosen": -3.0663957595825195, + "logits/rejected": -3.058706760406494, + "logps/chosen": -178.1262969970703, + "logps/rejected": -191.65078735351562, + "loss": 0.6647, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1191743612289429, - "rewards/margins": 0.19655336439609528, - "rewards/rejected": -1.315727710723877, + "rewards/chosen": -1.108483910560608, + "rewards/margins": 0.18502870202064514, + "rewards/rejected": -1.2935125827789307, "step": 3570 }, { "epoch": 0.62, - "grad_norm": 2.046875, + "grad_norm": 2.0625, "learning_rate": 1.9217259786653513e-06, - "logits/chosen": -3.0564985275268555, - "logits/rejected": -3.0520143508911133, - "logps/chosen": -175.3017120361328, - "logps/rejected": -195.2724151611328, - "loss": 0.6279, + "logits/chosen": -3.0707554817199707, + "logits/rejected": -3.0660297870635986, + "logps/chosen": -173.64324951171875, + "logps/rejected": -193.48947143554688, + "loss": 0.6291, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0685102939605713, - "rewards/margins": 0.2402305155992508, - "rewards/rejected": -1.3087408542633057, + "rewards/chosen": -1.0519256591796875, + "rewards/margins": 0.2389855682849884, + "rewards/rejected": -1.290911316871643, "step": 3580 }, { "epoch": 0.62, - "grad_norm": 2.796875, + "grad_norm": 2.671875, "learning_rate": 1.9071042391600074e-06, - "logits/chosen": -3.072998046875, - "logits/rejected": -3.0672171115875244, - "logps/chosen": -173.44651794433594, - "logps/rejected": -193.4931640625, - "loss": 0.6401, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.0806306600570679, - "rewards/margins": 0.24041788280010223, - "rewards/rejected": -1.321048617362976, + "logits/chosen": -3.084831714630127, + "logits/rejected": -3.0786590576171875, + "logps/chosen": -172.4977569580078, + "logps/rejected": -193.0021514892578, + "loss": 0.6374, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.0711432695388794, + "rewards/margins": 0.24499531090259552, + "rewards/rejected": -1.316138505935669, "step": 3590 }, { "epoch": 0.62, - "grad_norm": 3.28125, + "grad_norm": 3.25, "learning_rate": 1.8925039583367535e-06, - "logits/chosen": -3.05549955368042, - "logits/rejected": -3.0492753982543945, - "logps/chosen": -166.55067443847656, - "logps/rejected": -186.08181762695312, - "loss": 0.6263, + "logits/chosen": -3.0663247108459473, + "logits/rejected": -3.060382127761841, + "logps/chosen": -166.39263916015625, + "logps/rejected": -185.80763244628906, + "loss": 0.6256, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0093283653259277, - "rewards/margins": 0.24818992614746094, - "rewards/rejected": -1.2575181722640991, + "rewards/chosen": -1.007748007774353, + "rewards/margins": 0.24702855944633484, + "rewards/rejected": -1.2547765970230103, "step": 3600 }, { "epoch": 0.62, - "eval_logits/chosen": -3.0720903873443604, - "eval_logits/rejected": -3.067195177078247, - "eval_logps/chosen": -149.97569274902344, - "eval_logps/rejected": -168.07350158691406, - "eval_loss": 0.650830090045929, - "eval_rewards/accuracies": 0.6043215394020081, - "eval_rewards/chosen": -0.785656750202179, - "eval_rewards/margins": 0.14400769770145416, - "eval_rewards/rejected": -0.9296644330024719, - "eval_runtime": 497.1837, - "eval_samples_per_second": 8.657, - "eval_steps_per_second": 1.082, + "eval_logits/chosen": -3.076892375946045, + "eval_logits/rejected": -3.072154998779297, + "eval_logps/chosen": -148.85464477539062, + "eval_logps/rejected": -167.44215393066406, + "eval_loss": 0.6490924954414368, + "eval_rewards/accuracies": 0.6145446300506592, + "eval_rewards/chosen": -0.7744462490081787, + "eval_rewards/margins": 0.14890483021736145, + "eval_rewards/rejected": -0.923350989818573, + "eval_runtime": 483.8552, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.112, "step": 3600 }, { "epoch": 0.62, - "grad_norm": 2.65625, + "grad_norm": 2.6875, "learning_rate": 1.8779256646236945e-06, - "logits/chosen": -3.050173044204712, - "logits/rejected": -3.0390946865081787, - "logps/chosen": -176.19290161132812, - "logps/rejected": -194.0640411376953, - "loss": 0.6329, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.14492928981781, - "rewards/margins": 0.2223505675792694, - "rewards/rejected": -1.3672797679901123, + "logits/chosen": -3.0635766983032227, + "logits/rejected": -3.0530872344970703, + "logps/chosen": -175.24679565429688, + "logps/rejected": -192.53961181640625, + "loss": 0.6352, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1354683637619019, + "rewards/margins": 0.21656735241413116, + "rewards/rejected": -1.3520357608795166, "step": 3610 }, { "epoch": 0.62, - "grad_norm": 2.609375, + "grad_norm": 2.484375, "learning_rate": 1.8633698856531602e-06, - "logits/chosen": -3.0332961082458496, - "logits/rejected": -3.0208582878112793, - "logps/chosen": -166.79562377929688, - "logps/rejected": -198.54063415527344, - "loss": 0.5671, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9951320886611938, - "rewards/margins": 0.36578112840652466, - "rewards/rejected": -1.3609130382537842, + "logits/chosen": -3.0502521991729736, + "logits/rejected": -3.0385677814483643, + "logps/chosen": -164.17025756835938, + "logps/rejected": -196.63186645507812, + "loss": 0.5629, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9688783884048462, + "rewards/margins": 0.37294721603393555, + "rewards/rejected": -1.3418257236480713, "step": 3620 }, { "epoch": 0.63, - "grad_norm": 3.3125, + "grad_norm": 2.984375, "learning_rate": 1.8488371482425988e-06, - "logits/chosen": -3.019685983657837, - "logits/rejected": -3.0057766437530518, - "logps/chosen": -181.3262176513672, - "logps/rejected": -221.469970703125, - "loss": 0.5866, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1980793476104736, - "rewards/margins": 0.3820146918296814, - "rewards/rejected": -1.5800940990447998, + "logits/chosen": -3.033907413482666, + "logits/rejected": -3.0198476314544678, + "logps/chosen": -180.27218627929688, + "logps/rejected": -220.55386352539062, + "loss": 0.5883, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.187538981437683, + "rewards/margins": 0.38339418172836304, + "rewards/rejected": -1.5709333419799805, "step": 3630 }, { "epoch": 0.63, - "grad_norm": 2.6875, + "grad_norm": 2.796875, "learning_rate": 1.8343279783755208e-06, - "logits/chosen": -2.9965124130249023, - "logits/rejected": -2.990307569503784, - "logps/chosen": -184.87025451660156, - "logps/rejected": -211.8139190673828, - "loss": 0.6045, + "logits/chosen": -3.0072665214538574, + "logits/rejected": -3.0015065670013428, + "logps/chosen": -185.86297607421875, + "logps/rejected": -213.43020629882812, + "loss": 0.6018, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.1814584732055664, - "rewards/margins": 0.29887276887893677, - "rewards/rejected": -1.4803311824798584, + "rewards/chosen": -1.1913856267929077, + "rewards/margins": 0.3051080107688904, + "rewards/rejected": -1.4964938163757324, "step": 3640 }, { "epoch": 0.63, "grad_norm": 3.28125, "learning_rate": 1.8198429011824515e-06, - "logits/chosen": -2.991365671157837, - "logits/rejected": -2.983931064605713, - "logps/chosen": -182.23580932617188, - "logps/rejected": -213.08407592773438, - "loss": 0.6193, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2039625644683838, - "rewards/margins": 0.291355162858963, - "rewards/rejected": -1.4953176975250244, + "logits/chosen": -3.001743793487549, + "logits/rejected": -2.9939351081848145, + "logps/chosen": -183.04818725585938, + "logps/rejected": -214.5342254638672, + "loss": 0.6168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2120859622955322, + "rewards/margins": 0.2977331876754761, + "rewards/rejected": -1.5098191499710083, "step": 3650 }, { "epoch": 0.63, - "grad_norm": 3.0, + "grad_norm": 2.78125, "learning_rate": 1.8053824409219322e-06, - "logits/chosen": -2.9847915172576904, - "logits/rejected": -2.9697489738464355, - "logps/chosen": -196.5591583251953, - "logps/rejected": -231.37966918945312, - "loss": 0.5736, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3088271617889404, - "rewards/margins": 0.39994674921035767, - "rewards/rejected": -1.7087738513946533, + "logits/chosen": -2.991367816925049, + "logits/rejected": -2.9764809608459473, + "logps/chosen": -198.4747772216797, + "logps/rejected": -234.4219970703125, + "loss": 0.5701, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3279832601547241, + "rewards/margins": 0.4112142026424408, + "rewards/rejected": -1.7391973733901978, "step": 3660 }, { "epoch": 0.63, - "grad_norm": 2.625, + "grad_norm": 2.59375, "learning_rate": 1.7909471209615447e-06, - "logits/chosen": -2.9805190563201904, - "logits/rejected": -2.973362445831299, - "logps/chosen": -197.0777587890625, - "logps/rejected": -217.090576171875, - "loss": 0.6475, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.328054666519165, - "rewards/margins": 0.23754820227622986, - "rewards/rejected": -1.5656030178070068, + "logits/chosen": -2.9868359565734863, + "logits/rejected": -2.980062484741211, + "logps/chosen": -199.0198516845703, + "logps/rejected": -218.91775512695312, + "loss": 0.6493, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3474757671356201, + "rewards/margins": 0.23639878630638123, + "rewards/rejected": -1.5838743448257446, "step": 3670 }, { "epoch": 0.63, - "grad_norm": 2.890625, + "grad_norm": 2.953125, "learning_rate": 1.7765374637589632e-06, - "logits/chosen": -3.008643627166748, - "logits/rejected": -3.003309965133667, - "logps/chosen": -195.97483825683594, - "logps/rejected": -215.29763793945312, - "loss": 0.6228, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2987301349639893, - "rewards/margins": 0.2805440127849579, - "rewards/rejected": -1.5792741775512695, + "logits/chosen": -3.0147016048431396, + "logits/rejected": -3.0103507041931152, + "logps/chosen": -197.39666748046875, + "logps/rejected": -217.50534057617188, + "loss": 0.621, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.312948226928711, + "rewards/margins": 0.28840309381484985, + "rewards/rejected": -1.6013513803482056, "step": 3680 }, { "epoch": 0.64, - "grad_norm": 3.078125, + "grad_norm": 2.921875, "learning_rate": 1.7621539908430555e-06, - "logits/chosen": -3.013218879699707, - "logits/rejected": -3.00187087059021, - "logps/chosen": -180.8542938232422, - "logps/rejected": -219.6073455810547, - "loss": 0.5985, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1830642223358154, - "rewards/margins": 0.3425402045249939, - "rewards/rejected": -1.525604486465454, + "logits/chosen": -3.0218026638031006, + "logits/rejected": -3.0105462074279785, + "logps/chosen": -180.84689331054688, + "logps/rejected": -219.6415557861328, + "loss": 0.5988, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1829901933670044, + "rewards/margins": 0.34295639395713806, + "rewards/rejected": -1.5259464979171753, "step": 3690 }, { "epoch": 0.64, - "grad_norm": 2.546875, + "grad_norm": 2.578125, "learning_rate": 1.7477972227949947e-06, - "logits/chosen": -2.990821361541748, - "logits/rejected": -2.9791314601898193, - "logps/chosen": -187.114013671875, - "logps/rejected": -219.70413208007812, - "loss": 0.5961, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1857783794403076, - "rewards/margins": 0.34895533323287964, - "rewards/rejected": -1.534733772277832, + "logits/chosen": -2.998730182647705, + "logits/rejected": -2.9870681762695312, + "logps/chosen": -187.25515747070312, + "logps/rejected": -220.0942840576172, + "loss": 0.5969, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1871895790100098, + "rewards/margins": 0.35144585371017456, + "rewards/rejected": -1.538635492324829, "step": 3700 }, { "epoch": 0.64, - "eval_logits/chosen": -3.0151216983795166, - "eval_logits/rejected": -3.0089986324310303, - "eval_logps/chosen": -169.4567108154297, - "eval_logps/rejected": -189.36888122558594, - "eval_loss": 0.6491547226905823, - "eval_rewards/accuracies": 0.6136152148246765, - "eval_rewards/chosen": -0.9804668426513672, - "eval_rewards/margins": 0.162151500582695, - "eval_rewards/rejected": -1.1426185369491577, - "eval_runtime": 497.6263, - "eval_samples_per_second": 8.649, - "eval_steps_per_second": 1.081, + "eval_logits/chosen": -3.023085594177246, + "eval_logits/rejected": -3.0171217918395996, + "eval_logps/chosen": -168.72821044921875, + "eval_logps/rejected": -189.29779052734375, + "eval_loss": 0.6469103693962097, + "eval_rewards/accuracies": 0.6150093078613281, + "eval_rewards/chosen": -0.9731818437576294, + "eval_rewards/margins": 0.16872557997703552, + "eval_rewards/rejected": -1.1419075727462769, + "eval_runtime": 483.9353, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 3700 }, { "epoch": 0.64, - "grad_norm": 3.171875, + "grad_norm": 2.78125, "learning_rate": 1.7334676792294303e-06, - "logits/chosen": -3.0052781105041504, - "logits/rejected": -2.9978158473968506, - "logps/chosen": -183.4770050048828, - "logps/rejected": -205.83377075195312, - "loss": 0.6272, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.192054271697998, - "rewards/margins": 0.2757667601108551, - "rewards/rejected": -1.4678208827972412, + "logits/chosen": -3.0105113983154297, + "logits/rejected": -3.0030391216278076, + "logps/chosen": -183.92034912109375, + "logps/rejected": -206.87088012695312, + "loss": 0.6257, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1964876651763916, + "rewards/margins": 0.28170451521873474, + "rewards/rejected": -1.4781922101974487, "step": 3710 }, { "epoch": 0.64, - "grad_norm": 2.640625, + "grad_norm": 2.890625, "learning_rate": 1.7191658787756705e-06, - "logits/chosen": -3.0063445568084717, - "logits/rejected": -2.99770450592041, - "logps/chosen": -177.88133239746094, - "logps/rejected": -215.02456665039062, - "loss": 0.5729, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.14735746383667, - "rewards/margins": 0.38209640979766846, - "rewards/rejected": -1.5294538736343384, + "logits/chosen": -3.0147390365600586, + "logits/rejected": -3.0061700344085693, + "logps/chosen": -177.37066650390625, + "logps/rejected": -215.53512573242188, + "loss": 0.5694, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1422507762908936, + "rewards/margins": 0.3923090100288391, + "rewards/rejected": -1.5345598459243774, "step": 3720 }, { "epoch": 0.64, - "grad_norm": 2.765625, + "grad_norm": 2.71875, "learning_rate": 1.7048923390589211e-06, - "logits/chosen": -2.99979829788208, - "logits/rejected": -2.982865571975708, - "logps/chosen": -190.33706665039062, - "logps/rejected": -222.1634521484375, - "loss": 0.5925, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2473335266113281, - "rewards/margins": 0.38343560695648193, - "rewards/rejected": -1.63076913356781, + "logits/chosen": -3.007803440093994, + "logits/rejected": -2.991123676300049, + "logps/chosen": -190.38050842285156, + "logps/rejected": -221.82601928710938, + "loss": 0.5958, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2477678060531616, + "rewards/margins": 0.3796270489692688, + "rewards/rejected": -1.6273949146270752, "step": 3730 }, { "epoch": 0.64, - "grad_norm": 2.65625, + "grad_norm": 2.8125, "learning_rate": 1.6906475766815455e-06, - "logits/chosen": -3.015282392501831, - "logits/rejected": -3.006476402282715, - "logps/chosen": -177.47451782226562, - "logps/rejected": -208.50765991210938, - "loss": 0.6071, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1354906558990479, - "rewards/margins": 0.3008057475090027, - "rewards/rejected": -1.4362964630126953, + "logits/chosen": -3.019498348236084, + "logits/rejected": -3.0102386474609375, + "logps/chosen": -177.45591735839844, + "logps/rejected": -209.06979370117188, + "loss": 0.6059, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1353048086166382, + "rewards/margins": 0.306613028049469, + "rewards/rejected": -1.4419176578521729, "step": 3740 }, { "epoch": 0.65, - "grad_norm": 2.375, + "grad_norm": 2.359375, "learning_rate": 1.676432107204367e-06, - "logits/chosen": -3.021735191345215, - "logits/rejected": -3.017714500427246, - "logps/chosen": -183.16357421875, - "logps/rejected": -203.0084991455078, - "loss": 0.6401, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1851723194122314, - "rewards/margins": 0.2241462767124176, - "rewards/rejected": -1.4093185663223267, + "logits/chosen": -3.0263257026672363, + "logits/rejected": -3.022390604019165, + "logps/chosen": -183.7809295654297, + "logps/rejected": -203.02212524414062, + "loss": 0.6448, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1913459300994873, + "rewards/margins": 0.21810908615589142, + "rewards/rejected": -1.4094550609588623, "step": 3750 }, { "epoch": 0.65, - "grad_norm": 2.953125, + "grad_norm": 2.84375, "learning_rate": 1.6622464451280131e-06, - "logits/chosen": -3.032392978668213, - "logits/rejected": -3.025547504425049, - "logps/chosen": -190.98033142089844, - "logps/rejected": -205.5364227294922, - "loss": 0.6542, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2406162023544312, - "rewards/margins": 0.17366747558116913, - "rewards/rejected": -1.4142837524414062, + "logits/chosen": -3.0362114906311035, + "logits/rejected": -3.029592514038086, + "logps/chosen": -190.61134338378906, + "logps/rejected": -205.67208862304688, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2369264364242554, + "rewards/margins": 0.17871399223804474, + "rewards/rejected": -1.4156402349472046, "step": 3760 }, { "epoch": 0.65, - "grad_norm": 2.640625, + "grad_norm": 2.59375, "learning_rate": 1.6480911038742892e-06, - "logits/chosen": -3.0329129695892334, - "logits/rejected": -3.022871255874634, - "logps/chosen": -177.671630859375, - "logps/rejected": -199.72903442382812, - "loss": 0.6289, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1310114860534668, - "rewards/margins": 0.24106483161449432, - "rewards/rejected": -1.3720762729644775, + "logits/chosen": -3.0372822284698486, + "logits/rejected": -3.0271191596984863, + "logps/chosen": -177.426513671875, + "logps/rejected": -200.05203247070312, + "loss": 0.6261, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1285603046417236, + "rewards/margins": 0.24674637615680695, + "rewards/rejected": -1.375306487083435, "step": 3770 }, { "epoch": 0.65, - "grad_norm": 4.03125, + "grad_norm": 4.1875, "learning_rate": 1.6339665957676012e-06, - "logits/chosen": -3.0127625465393066, - "logits/rejected": -3.005437135696411, - "logps/chosen": -182.5919189453125, - "logps/rejected": -203.1020050048828, - "loss": 0.6167, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.182477593421936, - "rewards/margins": 0.27642253041267395, - "rewards/rejected": -1.4589000940322876, + "logits/chosen": -3.011491537094116, + "logits/rejected": -3.0045924186706543, + "logps/chosen": -183.3855743408203, + "logps/rejected": -204.1756591796875, + "loss": 0.6169, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1904141902923584, + "rewards/margins": 0.2792222797870636, + "rewards/rejected": -1.4696365594863892, "step": 3780 }, { "epoch": 0.65, - "grad_norm": 3.328125, + "grad_norm": 3.125, "learning_rate": 1.6198734320164084e-06, - "logits/chosen": -3.0002925395965576, - "logits/rejected": -2.9951531887054443, - "logps/chosen": -184.12808227539062, - "logps/rejected": -204.37026977539062, - "loss": 0.62, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.200259804725647, - "rewards/margins": 0.2649845778942108, - "rewards/rejected": -1.4652442932128906, + "logits/chosen": -3.002366542816162, + "logits/rejected": -2.9970383644104004, + "logps/chosen": -184.1189422607422, + "logps/rejected": -204.14517211914062, + "loss": 0.6207, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.200168490409851, + "rewards/margins": 0.26282474398612976, + "rewards/rejected": -1.4629931449890137, "step": 3790 }, { "epoch": 0.65, - "grad_norm": 4.8125, + "grad_norm": 4.78125, "learning_rate": 1.6058121226947265e-06, - "logits/chosen": -2.9992926120758057, - "logits/rejected": -2.9876251220703125, - "logps/chosen": -187.41111755371094, - "logps/rejected": -206.1591796875, - "loss": 0.6273, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1964071989059448, - "rewards/margins": 0.2528632581233978, - "rewards/rejected": -1.449270486831665, + "logits/chosen": -2.999645948410034, + "logits/rejected": -2.988257646560669, + "logps/chosen": -186.4853057861328, + "logps/rejected": -205.41824340820312, + "loss": 0.6272, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.187149167060852, + "rewards/margins": 0.25471192598342896, + "rewards/rejected": -1.4418610334396362, "step": 3800 }, { "epoch": 0.65, - "eval_logits/chosen": -3.011683702468872, - "eval_logits/rejected": -3.005713701248169, - "eval_logps/chosen": -167.98045349121094, - "eval_logps/rejected": -187.6573028564453, - "eval_loss": 0.6493790149688721, - "eval_rewards/accuracies": 0.6140799522399902, - "eval_rewards/chosen": -0.9657043814659119, - "eval_rewards/margins": 0.1597982496023178, - "eval_rewards/rejected": -1.1255027055740356, - "eval_runtime": 496.5286, - "eval_samples_per_second": 8.668, - "eval_steps_per_second": 1.084, + "eval_logits/chosen": -3.014477014541626, + "eval_logits/rejected": -3.008650779724121, + "eval_logps/chosen": -166.1768035888672, + "eval_logps/rejected": -186.34889221191406, + "eval_loss": 0.6471571326255798, + "eval_rewards/accuracies": 0.6175650358200073, + "eval_rewards/chosen": -0.9476678967475891, + "eval_rewards/margins": 0.16475053131580353, + "eval_rewards/rejected": -1.1124184131622314, + "eval_runtime": 483.7099, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 1.112, "step": 3800 }, { "epoch": 0.66, - "grad_norm": 3.375, + "grad_norm": 3.234375, "learning_rate": 1.5917831767236597e-06, - "logits/chosen": -3.01556134223938, - "logits/rejected": -3.0062661170959473, - "logps/chosen": -193.03201293945312, - "logps/rejected": -212.33935546875, - "loss": 0.6215, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2390611171722412, - "rewards/margins": 0.2815569341182709, - "rewards/rejected": -1.5206180810928345, + "logits/chosen": -3.016317844390869, + "logits/rejected": -3.0069072246551514, + "logps/chosen": -192.1353759765625, + "logps/rejected": -211.64712524414062, + "loss": 0.6218, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2300946712493896, + "rewards/margins": 0.28360113501548767, + "rewards/rejected": -1.5136957168579102, "step": 3810 }, { "epoch": 0.66, "grad_norm": 2.671875, "learning_rate": 1.577787101852988e-06, - "logits/chosen": -3.0076615810394287, - "logits/rejected": -3.0015645027160645, - "logps/chosen": -181.38314819335938, - "logps/rejected": -206.2713623046875, + "logits/chosen": -3.009542942047119, + "logits/rejected": -3.003291606903076, + "logps/chosen": -180.47171020507812, + "logps/rejected": -205.54873657226562, "loss": 0.6031, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1591265201568604, - "rewards/margins": 0.2929556965827942, - "rewards/rejected": -1.4520821571350098, + "rewards/chosen": -1.1500122547149658, + "rewards/margins": 0.2948438823223114, + "rewards/rejected": -1.4448561668395996, "step": 3820 }, { "epoch": 0.66, - "grad_norm": 4.3125, + "grad_norm": 4.21875, "learning_rate": 1.5638244046427879e-06, - "logits/chosen": -3.0184571743011475, - "logits/rejected": -3.006317615509033, - "logps/chosen": -186.52359008789062, - "logps/rejected": -200.72528076171875, - "loss": 0.6279, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.177769422531128, - "rewards/margins": 0.23591813445091248, - "rewards/rejected": -1.4136877059936523, + "logits/chosen": -3.0196218490600586, + "logits/rejected": -3.0076098442077637, + "logps/chosen": -185.17117309570312, + "logps/rejected": -199.9987335205078, + "loss": 0.626, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1642451286315918, + "rewards/margins": 0.24217692017555237, + "rewards/rejected": -1.4064220190048218, "step": 3830 }, { "epoch": 0.66, - "grad_norm": 2.09375, + "grad_norm": 1.9921875, "learning_rate": 1.549895590445094e-06, - "logits/chosen": -3.0161845684051514, - "logits/rejected": -3.0071911811828613, - "logps/chosen": -181.43736267089844, - "logps/rejected": -220.2910614013672, - "loss": 0.5827, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1619912385940552, - "rewards/margins": 0.3711855411529541, - "rewards/rejected": -1.5331767797470093, + "logits/chosen": -3.017007827758789, + "logits/rejected": -3.0075902938842773, + "logps/chosen": -180.5908966064453, + "logps/rejected": -221.23373413085938, + "loss": 0.5758, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1535264253616333, + "rewards/margins": 0.3890773355960846, + "rewards/rejected": -1.5426037311553955, "step": 3840 }, { "epoch": 0.66, - "grad_norm": 3.28125, + "grad_norm": 3.375, "learning_rate": 1.5360011633856175e-06, - "logits/chosen": -3.034837007522583, - "logits/rejected": -3.0271098613739014, - "logps/chosen": -180.33920288085938, - "logps/rejected": -202.33938598632812, - "loss": 0.6091, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1167991161346436, - "rewards/margins": 0.28126758337020874, - "rewards/rejected": -1.398066759109497, + "logits/chosen": -3.0341978073120117, + "logits/rejected": -3.026261329650879, + "logps/chosen": -181.00332641601562, + "logps/rejected": -202.96026611328125, + "loss": 0.61, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1234402656555176, + "rewards/margins": 0.28083527088165283, + "rewards/rejected": -1.4042755365371704, "step": 3850 }, { "epoch": 0.67, - "grad_norm": 2.515625, + "grad_norm": 2.546875, "learning_rate": 1.5221416263454914e-06, - "logits/chosen": -3.0175633430480957, - "logits/rejected": -3.0098071098327637, - "logps/chosen": -184.2635955810547, - "logps/rejected": -209.46450805664062, - "loss": 0.613, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1878941059112549, - "rewards/margins": 0.2898927927017212, - "rewards/rejected": -1.4777867794036865, + "logits/chosen": -3.0151686668395996, + "logits/rejected": -3.0070672035217285, + "logps/chosen": -185.99903869628906, + "logps/rejected": -212.03543090820312, + "loss": 0.6109, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.205248475074768, + "rewards/margins": 0.29824763536453247, + "rewards/rejected": -1.5034960508346558, "step": 3860 }, { "epoch": 0.67, - "grad_norm": 3.140625, + "grad_norm": 3.015625, "learning_rate": 1.5083174809430773e-06, - "logits/chosen": -3.014516592025757, - "logits/rejected": -3.0030410289764404, - "logps/chosen": -188.04342651367188, - "logps/rejected": -216.844482421875, - "loss": 0.5909, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.179465889930725, - "rewards/margins": 0.35519683361053467, - "rewards/rejected": -1.5346627235412598, + "logits/chosen": -3.012986660003662, + "logits/rejected": -3.0011372566223145, + "logps/chosen": -189.08023071289062, + "logps/rejected": -219.11672973632812, + "loss": 0.5882, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1898341178894043, + "rewards/margins": 0.36755114793777466, + "rewards/rejected": -1.5573852062225342, "step": 3870 }, { "epoch": 0.67, - "grad_norm": 3.40625, + "grad_norm": 3.390625, "learning_rate": 1.4945292275158044e-06, - "logits/chosen": -2.9874320030212402, - "logits/rejected": -2.9876856803894043, - "logps/chosen": -191.178955078125, - "logps/rejected": -210.03970336914062, - "loss": 0.653, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.294626235961914, - "rewards/margins": 0.18726655840873718, - "rewards/rejected": -1.4818929433822632, + "logits/chosen": -2.9859509468078613, + "logits/rejected": -2.9861233234405518, + "logps/chosen": -192.88870239257812, + "logps/rejected": -211.9856414794922, + "loss": 0.6548, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3117234706878662, + "rewards/margins": 0.18962886929512024, + "rewards/rejected": -1.5013524293899536, "step": 3880 }, { "epoch": 0.67, - "grad_norm": 2.90625, + "grad_norm": 2.859375, "learning_rate": 1.4807773651020645e-06, - "logits/chosen": -3.010956048965454, - "logits/rejected": -3.004424571990967, - "logps/chosen": -183.68826293945312, - "logps/rejected": -208.5262451171875, - "loss": 0.6147, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1942976713180542, - "rewards/margins": 0.2909819781780243, - "rewards/rejected": -1.4852796792984009, + "logits/chosen": -3.0090255737304688, + "logits/rejected": -3.002739191055298, + "logps/chosen": -185.8389892578125, + "logps/rejected": -210.50222778320312, + "loss": 0.6162, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2158050537109375, + "rewards/margins": 0.289234459400177, + "rewards/rejected": -1.5050393342971802, "step": 3890 }, { "epoch": 0.67, - "grad_norm": 2.609375, + "grad_norm": 2.703125, "learning_rate": 1.467062391423149e-06, - "logits/chosen": -3.0182414054870605, - "logits/rejected": -3.0148303508758545, - "logps/chosen": -188.4974822998047, - "logps/rejected": -207.0945281982422, - "loss": 0.6183, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2032827138900757, - "rewards/margins": 0.2851700186729431, - "rewards/rejected": -1.488452672958374, + "logits/chosen": -3.013881206512451, + "logits/rejected": -3.011373281478882, + "logps/chosen": -191.22622680664062, + "logps/rejected": -209.2890167236328, + "loss": 0.6222, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2305704355239868, + "rewards/margins": 0.27982720732688904, + "rewards/rejected": -1.5103976726531982, "step": 3900 }, { "epoch": 0.67, - "eval_logits/chosen": -3.0137338638305664, - "eval_logits/rejected": -3.0076637268066406, - "eval_logps/chosen": -167.441650390625, - "eval_logps/rejected": -187.2733917236328, - "eval_loss": 0.6487549543380737, + "eval_logits/chosen": -3.0100162029266357, + "eval_logits/rejected": -3.004049301147461, + "eval_logps/chosen": -168.60433959960938, + "eval_logps/rejected": -189.1106719970703, + "eval_loss": 0.6467403173446655, "eval_rewards/accuracies": 0.6166356801986694, - "eval_rewards/chosen": -0.9603161215782166, - "eval_rewards/margins": 0.16134725511074066, - "eval_rewards/rejected": -1.1216634511947632, - "eval_runtime": 493.6855, - "eval_samples_per_second": 8.718, - "eval_steps_per_second": 1.09, + "eval_rewards/chosen": -0.9719431400299072, + "eval_rewards/margins": 0.16809284687042236, + "eval_rewards/rejected": -1.1400359869003296, + "eval_runtime": 483.8368, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 1.112, "step": 3900 }, { "epoch": 0.67, - "grad_norm": 2.65625, + "grad_norm": 2.5625, "learning_rate": 1.4533848028652347e-06, - "logits/chosen": -3.0164458751678467, - "logits/rejected": -3.0086934566497803, - "logps/chosen": -185.79513549804688, - "logps/rejected": -216.59213256835938, - "loss": 0.5912, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.194959282875061, - "rewards/margins": 0.36031073331832886, - "rewards/rejected": -1.5552700757980347, + "logits/chosen": -3.0135159492492676, + "logits/rejected": -3.005495071411133, + "logps/chosen": -189.1254425048828, + "logps/rejected": -219.94100952148438, + "loss": 0.5947, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.228262186050415, + "rewards/margins": 0.3604966104030609, + "rewards/rejected": -1.5887585878372192, "step": 3910 }, { "epoch": 0.68, - "grad_norm": 2.546875, + "grad_norm": 2.578125, "learning_rate": 1.4397450944614185e-06, - "logits/chosen": -3.0261919498443604, - "logits/rejected": -3.018491744995117, - "logps/chosen": -183.15394592285156, - "logps/rejected": -201.19017028808594, - "loss": 0.608, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1281392574310303, - "rewards/margins": 0.296304315328598, - "rewards/rejected": -1.4244437217712402, + "logits/chosen": -3.022209882736206, + "logits/rejected": -3.014324188232422, + "logps/chosen": -186.09042358398438, + "logps/rejected": -205.3507843017578, + "loss": 0.6055, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1575043201446533, + "rewards/margins": 0.30854544043540955, + "rewards/rejected": -1.4660497903823853, "step": 3920 }, { "epoch": 0.68, - "grad_norm": 2.703125, + "grad_norm": 2.84375, "learning_rate": 1.426143759873801e-06, - "logits/chosen": -3.005645751953125, - "logits/rejected": -2.9987285137176514, - "logps/chosen": -181.1148223876953, - "logps/rejected": -209.9329071044922, - "loss": 0.5978, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.180918574333191, - "rewards/margins": 0.3122202157974243, - "rewards/rejected": -1.4931389093399048, + "logits/chosen": -3.0014710426330566, + "logits/rejected": -2.9942893981933594, + "logps/chosen": -182.577880859375, + "logps/rejected": -212.81295776367188, + "loss": 0.5951, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1955492496490479, + "rewards/margins": 0.32639041543006897, + "rewards/rejected": -1.5219395160675049, "step": 3930 }, { "epoch": 0.68, - "grad_norm": 3.234375, + "grad_norm": 3.125, "learning_rate": 1.4125812913756174e-06, - "logits/chosen": -2.989655017852783, - "logits/rejected": -2.9861814975738525, - "logps/chosen": -181.75909423828125, - "logps/rejected": -210.435302734375, - "loss": 0.5962, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.17508065700531, - "rewards/margins": 0.3153669238090515, - "rewards/rejected": -1.4904476404190063, + "logits/chosen": -2.984102725982666, + "logits/rejected": -2.9806723594665527, + "logps/chosen": -185.0265350341797, + "logps/rejected": -214.3390350341797, + "loss": 0.5953, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2077550888061523, + "rewards/margins": 0.32172971963882446, + "rewards/rejected": -1.5294848680496216, "step": 3940 }, { "epoch": 0.68, "grad_norm": 4.0, "learning_rate": 1.3990581798334236e-06, - "logits/chosen": -2.9854207038879395, - "logits/rejected": -2.971961498260498, - "logps/chosen": -188.04066467285156, - "logps/rejected": -217.80105590820312, - "loss": 0.5735, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2220876216888428, - "rewards/margins": 0.3745902180671692, - "rewards/rejected": -1.5966777801513672, + "logits/chosen": -2.9794344902038574, + "logits/rejected": -2.9656434059143066, + "logps/chosen": -190.2133331298828, + "logps/rejected": -221.944580078125, + "loss": 0.5668, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2438141107559204, + "rewards/margins": 0.3942989706993103, + "rewards/rejected": -1.638113260269165, "step": 3950 }, { "epoch": 0.68, - "grad_norm": 3.53125, + "grad_norm": 3.625, "learning_rate": 1.3855749146893285e-06, - "logits/chosen": -3.0040407180786133, - "logits/rejected": -2.998903512954712, - "logps/chosen": -191.17031860351562, - "logps/rejected": -228.12728881835938, - "loss": 0.5992, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.273728370666504, - "rewards/margins": 0.33710581064224243, - "rewards/rejected": -1.6108341217041016, + "logits/chosen": -2.9964189529418945, + "logits/rejected": -2.9914658069610596, + "logps/chosen": -194.50753784179688, + "logps/rejected": -232.042724609375, + "loss": 0.6, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3071004152297974, + "rewards/margins": 0.3428882956504822, + "rewards/rejected": -1.6499887704849243, "step": 3960 }, { "epoch": 0.68, - "grad_norm": 2.8125, + "grad_norm": 2.84375, "learning_rate": 1.3721319839432794e-06, - "logits/chosen": -2.9864563941955566, - "logits/rejected": -2.977466344833374, - "logps/chosen": -199.20652770996094, - "logps/rejected": -229.17971801757812, - "loss": 0.593, + "logits/chosen": -2.97955584526062, + "logits/rejected": -2.970545530319214, + "logps/chosen": -201.5922088623047, + "logps/rejected": -231.54232788085938, + "loss": 0.5959, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.334026575088501, - "rewards/margins": 0.3332452178001404, - "rewards/rejected": -1.667271614074707, + "rewards/chosen": -1.3578835725784302, + "rewards/margins": 0.33301469683647156, + "rewards/rejected": -1.6908981800079346, "step": 3970 }, { "epoch": 0.69, - "grad_norm": 2.6875, + "grad_norm": 2.796875, "learning_rate": 1.3587298741353999e-06, - "logits/chosen": -2.967489719390869, - "logits/rejected": -2.9538886547088623, - "logps/chosen": -192.2974853515625, - "logps/rejected": -227.893310546875, - "loss": 0.5772, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2904869318008423, - "rewards/margins": 0.3781260848045349, - "rewards/rejected": -1.668613076210022, + "logits/chosen": -2.9589290618896484, + "logits/rejected": -2.944995403289795, + "logps/chosen": -195.64895629882812, + "logps/rejected": -232.37344360351562, + "loss": 0.5752, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3240019083023071, + "rewards/margins": 0.38941264152526855, + "rewards/rejected": -1.7134145498275757, "step": 3980 }, { "epoch": 0.69, - "grad_norm": 2.875, + "grad_norm": 2.921875, "learning_rate": 1.3453690703283848e-06, - "logits/chosen": -2.962489604949951, - "logits/rejected": -2.96486234664917, - "logps/chosen": -197.5770721435547, - "logps/rejected": -218.3431854248047, - "loss": 0.6503, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3310292959213257, - "rewards/margins": 0.22468487918376923, - "rewards/rejected": -1.5557141304016113, + "logits/chosen": -2.9528799057006836, + "logits/rejected": -2.9546871185302734, + "logps/chosen": -201.53150939941406, + "logps/rejected": -223.7783660888672, + "loss": 0.6462, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3705735206604004, + "rewards/margins": 0.23949268460273743, + "rewards/rejected": -1.6100661754608154, "step": 3990 }, { "epoch": 0.69, - "grad_norm": 4.15625, + "grad_norm": 4.375, "learning_rate": 1.3320500560899329e-06, - "logits/chosen": -2.985219717025757, - "logits/rejected": -2.9798381328582764, - "logps/chosen": -200.13027954101562, - "logps/rejected": -224.08981323242188, - "loss": 0.6051, + "logits/chosen": -2.9793593883514404, + "logits/rejected": -2.9736618995666504, + "logps/chosen": -204.895751953125, + "logps/rejected": -229.46194458007812, + "loss": 0.605, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3275507688522339, - "rewards/margins": 0.3009306490421295, - "rewards/rejected": -1.6284815073013306, + "rewards/chosen": -1.3752055168151855, + "rewards/margins": 0.3069971203804016, + "rewards/rejected": -1.6822025775909424, "step": 4000 }, { "epoch": 0.69, - "eval_logits/chosen": -2.9974019527435303, - "eval_logits/rejected": -2.9908156394958496, - "eval_logps/chosen": -176.37387084960938, - "eval_logps/rejected": -197.12547302246094, - "eval_loss": 0.6481823325157166, - "eval_rewards/accuracies": 0.6177973747253418, - "eval_rewards/chosen": -1.0496385097503662, - "eval_rewards/margins": 0.17054608464241028, - "eval_rewards/rejected": -1.220184564590454, - "eval_runtime": 494.8129, - "eval_samples_per_second": 8.698, - "eval_steps_per_second": 1.087, + "eval_logits/chosen": -2.9848952293395996, + "eval_logits/rejected": -2.978325366973877, + "eval_logps/chosen": -179.137939453125, + "eval_logps/rejected": -200.68565368652344, + "eval_loss": 0.6460844874382019, + "eval_rewards/accuracies": 0.6203531622886658, + "eval_rewards/chosen": -1.0772794485092163, + "eval_rewards/margins": 0.178506538271904, + "eval_rewards/rejected": -1.2557858228683472, + "eval_runtime": 483.7273, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 1.112, "step": 4000 }, { "epoch": 0.69, - "grad_norm": 3.453125, + "grad_norm": 3.359375, "learning_rate": 1.3187733134752622e-06, - "logits/chosen": -2.9619107246398926, - "logits/rejected": -2.9526774883270264, - "logps/chosen": -188.21420288085938, - "logps/rejected": -226.4281463623047, - "loss": 0.5841, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2888858318328857, - "rewards/margins": 0.3644692599773407, - "rewards/rejected": -1.6533548831939697, + "logits/chosen": -2.9539637565612793, + "logits/rejected": -2.9446120262145996, + "logps/chosen": -191.04747009277344, + "logps/rejected": -230.6028289794922, + "loss": 0.58, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3172180652618408, + "rewards/margins": 0.3778838515281677, + "rewards/rejected": -1.6951020956039429, "step": 4010 }, { "epoch": 0.69, - "grad_norm": 3.109375, + "grad_norm": 2.984375, "learning_rate": 1.3055393230096433e-06, - "logits/chosen": -2.975512981414795, - "logits/rejected": -2.971684217453003, - "logps/chosen": -195.65884399414062, - "logps/rejected": -222.35800170898438, - "loss": 0.6214, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3342936038970947, - "rewards/margins": 0.2772584557533264, - "rewards/rejected": -1.6115522384643555, + "logits/chosen": -2.9659435749053955, + "logits/rejected": -2.9620676040649414, + "logps/chosen": -199.49159240722656, + "logps/rejected": -227.734130859375, + "loss": 0.6173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3726211786270142, + "rewards/margins": 0.29269200563430786, + "rewards/rejected": -1.6653131246566772, "step": 4020 }, { "epoch": 0.69, - "grad_norm": 2.5625, + "grad_norm": 2.453125, "learning_rate": 1.2923485636710275e-06, - "logits/chosen": -2.986161947250366, - "logits/rejected": -2.9778332710266113, - "logps/chosen": -188.88314819335938, - "logps/rejected": -211.6816864013672, - "loss": 0.6201, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2178932428359985, - "rewards/margins": 0.25717395544052124, - "rewards/rejected": -1.475067377090454, + "logits/chosen": -2.9760024547576904, + "logits/rejected": -2.9680752754211426, + "logps/chosen": -193.3185272216797, + "logps/rejected": -217.10757446289062, + "loss": 0.6181, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2622469663619995, + "rewards/margins": 0.26707887649536133, + "rewards/rejected": -1.52932608127594, "step": 4030 }, { "epoch": 0.7, - "grad_norm": 2.9375, + "grad_norm": 2.953125, "learning_rate": 1.279201512872693e-06, - "logits/chosen": -3.0042479038238525, - "logits/rejected": -2.9908108711242676, - "logps/chosen": -192.40591430664062, - "logps/rejected": -224.39590454101562, - "loss": 0.5853, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2539700269699097, - "rewards/margins": 0.35769122838974, - "rewards/rejected": -1.6116611957550049, + "logits/chosen": -2.995952844619751, + "logits/rejected": -2.982109308242798, + "logps/chosen": -197.069580078125, + "logps/rejected": -229.3331298828125, + "loss": 0.5877, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3006064891815186, + "rewards/margins": 0.36042696237564087, + "rewards/rejected": -1.6610333919525146, "step": 4040 }, { "epoch": 0.7, - "grad_norm": 2.71875, + "grad_norm": 2.765625, "learning_rate": 1.2660986464459817e-06, - "logits/chosen": -2.980405807495117, - "logits/rejected": -2.97395920753479, - "logps/chosen": -186.85433959960938, - "logps/rejected": -212.6918487548828, - "loss": 0.6253, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.247534990310669, - "rewards/margins": 0.2799908518791199, - "rewards/rejected": -1.5275259017944336, + "logits/chosen": -2.974198579788208, + "logits/rejected": -2.967679500579834, + "logps/chosen": -188.9154052734375, + "logps/rejected": -215.5602569580078, + "loss": 0.625, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2681456804275513, + "rewards/margins": 0.2880643904209137, + "rewards/rejected": -1.556209921836853, "step": 4050 }, { "epoch": 0.7, - "grad_norm": 2.546875, + "grad_norm": 2.578125, "learning_rate": 1.2530404386230637e-06, - "logits/chosen": -2.9891881942749023, - "logits/rejected": -2.9860939979553223, - "logps/chosen": -202.81207275390625, - "logps/rejected": -219.61532592773438, - "loss": 0.6415, + "logits/chosen": -2.9840919971466064, + "logits/rejected": -2.9815354347229004, + "logps/chosen": -205.19137573242188, + "logps/rejected": -222.2770538330078, + "loss": 0.645, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3588091135025024, - "rewards/margins": 0.24134831130504608, - "rewards/rejected": -1.6001571416854858, + "rewards/chosen": -1.3826020956039429, + "rewards/margins": 0.24417224526405334, + "rewards/rejected": -1.6267744302749634, "step": 4060 }, { "epoch": 0.7, - "grad_norm": 2.828125, + "grad_norm": 2.703125, "learning_rate": 1.2400273620197856e-06, - "logits/chosen": -2.985337257385254, - "logits/rejected": -2.975621223449707, - "logps/chosen": -193.52359008789062, - "logps/rejected": -226.47958374023438, - "loss": 0.5742, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2964990139007568, - "rewards/margins": 0.3701520264148712, - "rewards/rejected": -1.6666511297225952, + "logits/chosen": -2.982635498046875, + "logits/rejected": -2.973203182220459, + "logps/chosen": -194.95956420898438, + "logps/rejected": -227.84765625, + "loss": 0.5765, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.310858964920044, + "rewards/margins": 0.36947301030158997, + "rewards/rejected": -1.680331826210022, "step": 4070 }, { "epoch": 0.7, - "grad_norm": 2.578125, + "grad_norm": 2.5625, "learning_rate": 1.2270598876185553e-06, - "logits/chosen": -2.9908981323242188, - "logits/rejected": -2.982999563217163, - "logps/chosen": -187.68170166015625, - "logps/rejected": -217.3961944580078, - "loss": 0.6105, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2406961917877197, - "rewards/margins": 0.3032611310482025, - "rewards/rejected": -1.5439573526382446, + "logits/chosen": -2.9874231815338135, + "logits/rejected": -2.979541778564453, + "logps/chosen": -189.1808319091797, + "logps/rejected": -219.6368408203125, + "loss": 0.6067, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2556878328323364, + "rewards/margins": 0.31067654490470886, + "rewards/rejected": -1.5663644075393677, "step": 4080 }, { "epoch": 0.7, - "grad_norm": 2.796875, + "grad_norm": 2.78125, "learning_rate": 1.2141384847513006e-06, - "logits/chosen": -3.018590211868286, - "logits/rejected": -3.004499912261963, - "logps/chosen": -179.45327758789062, - "logps/rejected": -208.90170288085938, - "loss": 0.5894, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1329796314239502, - "rewards/margins": 0.34134823083877563, - "rewards/rejected": -1.474327802658081, + "logits/chosen": -3.0166218280792236, + "logits/rejected": -3.002739429473877, + "logps/chosen": -180.31333923339844, + "logps/rejected": -209.5823211669922, + "loss": 0.5903, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1415802240371704, + "rewards/margins": 0.3395538628101349, + "rewards/rejected": -1.4811339378356934, "step": 4090 }, { "epoch": 0.71, - "grad_norm": 4.0625, + "grad_norm": 4.09375, "learning_rate": 1.2012636210824833e-06, - "logits/chosen": -2.995112180709839, - "logits/rejected": -2.990323543548584, - "logps/chosen": -178.38339233398438, - "logps/rejected": -205.15158081054688, - "loss": 0.5867, + "logits/chosen": -2.993722915649414, + "logits/rejected": -2.988193988800049, + "logps/chosen": -179.03883361816406, + "logps/rejected": -206.8237762451172, + "loss": 0.585, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1436684131622314, - "rewards/margins": 0.3371172547340393, - "rewards/rejected": -1.480785608291626, + "rewards/chosen": -1.1502227783203125, + "rewards/margins": 0.347284734249115, + "rewards/rejected": -1.4975074529647827, "step": 4100 }, { "epoch": 0.71, - "eval_logits/chosen": -3.015127420425415, - "eval_logits/rejected": -3.008767604827881, - "eval_logps/chosen": -169.1083984375, - "eval_logps/rejected": -189.3998260498047, - "eval_loss": 0.6484485864639282, - "eval_rewards/accuracies": 0.6124535202980042, - "eval_rewards/chosen": -0.9769837260246277, - "eval_rewards/margins": 0.1659441590309143, - "eval_rewards/rejected": -1.1429280042648315, - "eval_runtime": 486.6806, - "eval_samples_per_second": 8.844, - "eval_steps_per_second": 1.105, + "eval_logits/chosen": -3.008620500564575, + "eval_logits/rejected": -3.0023536682128906, + "eval_logps/chosen": -169.76588439941406, + "eval_logps/rejected": -190.66702270507812, + "eval_loss": 0.6464406251907349, + "eval_rewards/accuracies": 0.616403341293335, + "eval_rewards/chosen": -0.9835586547851562, + "eval_rewards/margins": 0.17204123735427856, + "eval_rewards/rejected": -1.1556000709533691, + "eval_runtime": 483.817, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 1.112, "step": 4100 }, { "epoch": 0.71, - "grad_norm": 3.296875, + "grad_norm": 3.453125, "learning_rate": 1.1884357625921695e-06, - "logits/chosen": -2.9936797618865967, - "logits/rejected": -2.9857420921325684, - "logps/chosen": -192.5975341796875, - "logps/rejected": -207.32955932617188, - "loss": 0.6497, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2625000476837158, - "rewards/margins": 0.2128407210111618, - "rewards/rejected": -1.475340723991394, + "logits/chosen": -2.9908840656280518, + "logits/rejected": -2.983269214630127, + "logps/chosen": -194.69482421875, + "logps/rejected": -209.41250610351562, + "loss": 0.6519, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2834727764129639, + "rewards/margins": 0.21269741654396057, + "rewards/rejected": -1.4961702823638916, "step": 4110 }, { "epoch": 0.71, - "grad_norm": 3.140625, + "grad_norm": 3.046875, "learning_rate": 1.175655373559168e-06, - "logits/chosen": -3.006986618041992, - "logits/rejected": -2.996345043182373, - "logps/chosen": -184.16152954101562, - "logps/rejected": -209.470947265625, - "loss": 0.6272, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2307326793670654, - "rewards/margins": 0.23652946949005127, - "rewards/rejected": -1.4672620296478271, + "logits/chosen": -2.999894618988037, + "logits/rejected": -2.9891955852508545, + "logps/chosen": -186.88348388671875, + "logps/rejected": -212.41897583007812, + "loss": 0.6279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.257952094078064, + "rewards/margins": 0.23879018425941467, + "rewards/rejected": -1.4967423677444458, "step": 4120 }, { "epoch": 0.71, - "grad_norm": 3.0625, + "grad_norm": 3.09375, "learning_rate": 1.162922916544224e-06, - "logits/chosen": -3.007054090499878, - "logits/rejected": -2.995990037918091, - "logps/chosen": -181.14195251464844, - "logps/rejected": -208.498291015625, - "loss": 0.5865, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1549180746078491, - "rewards/margins": 0.32630079984664917, - "rewards/rejected": -1.481218934059143, + "logits/chosen": -2.9989144802093506, + "logits/rejected": -2.9878134727478027, + "logps/chosen": -184.68089294433594, + "logps/rejected": -212.1684112548828, + "loss": 0.5881, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1903077363967896, + "rewards/margins": 0.3276127576828003, + "rewards/rejected": -1.5179203748703003, "step": 4130 }, { "epoch": 0.71, - "grad_norm": 3.71875, + "grad_norm": 3.703125, "learning_rate": 1.15023885237328e-06, - "logits/chosen": -2.9959869384765625, - "logits/rejected": -2.9884610176086426, - "logps/chosen": -195.40237426757812, - "logps/rejected": -207.7620086669922, - "loss": 0.6527, + "logits/chosen": -2.988089084625244, + "logits/rejected": -2.980468273162842, + "logps/chosen": -198.49673461914062, + "logps/rejected": -212.5007781982422, + "loss": 0.6467, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2849228382110596, - "rewards/margins": 0.20186881721019745, - "rewards/rejected": -1.4867916107177734, + "rewards/chosen": -1.315866231918335, + "rewards/margins": 0.2183130979537964, + "rewards/rejected": -1.534179449081421, "step": 4140 }, { "epoch": 0.72, - "grad_norm": 2.609375, + "grad_norm": 2.734375, "learning_rate": 1.1376036401207939e-06, - "logits/chosen": -3.0100998878479004, - "logits/rejected": -3.0050740242004395, - "logps/chosen": -189.6447296142578, - "logps/rejected": -204.48178100585938, - "loss": 0.6342, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.214764952659607, - "rewards/margins": 0.23950621485710144, - "rewards/rejected": -1.4542710781097412, + "logits/chosen": -3.0006988048553467, + "logits/rejected": -2.99579119682312, + "logps/chosen": -192.1051788330078, + "logps/rejected": -208.1251983642578, + "loss": 0.632, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.23936927318573, + "rewards/margins": 0.2513357996940613, + "rewards/rejected": -1.4907052516937256, "step": 4150 }, { "epoch": 0.72, - "grad_norm": 2.4375, + "grad_norm": 2.375, "learning_rate": 1.1250177370931265e-06, - "logits/chosen": -3.004896640777588, - "logits/rejected": -2.9942173957824707, - "logps/chosen": -178.70838928222656, - "logps/rejected": -208.43161010742188, - "loss": 0.5788, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1545559167861938, - "rewards/margins": 0.3637978136539459, - "rewards/rejected": -1.518353819847107, + "logits/chosen": -2.993255376815796, + "logits/rejected": -2.9826114177703857, + "logps/chosen": -182.51060485839844, + "logps/rejected": -212.94644165039062, + "loss": 0.5801, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1925780773162842, + "rewards/margins": 0.37092381715774536, + "rewards/rejected": -1.5635017156600952, "step": 4160 }, { "epoch": 0.72, - "grad_norm": 3.296875, + "grad_norm": 3.53125, "learning_rate": 1.112481598811992e-06, - "logits/chosen": -3.0165929794311523, - "logits/rejected": -3.010983943939209, - "logps/chosen": -174.68006896972656, - "logps/rejected": -200.9302215576172, - "loss": 0.6214, + "logits/chosen": -3.0047919750213623, + "logits/rejected": -2.9989700317382812, + "logps/chosen": -179.00527954101562, + "logps/rejected": -205.2192840576172, + "loss": 0.6232, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.1269192695617676, - "rewards/margins": 0.2599869966506958, - "rewards/rejected": -1.3869061470031738, + "rewards/chosen": -1.1701711416244507, + "rewards/margins": 0.25962555408477783, + "rewards/rejected": -1.4297969341278076, "step": 4170 }, { "epoch": 0.72, - "grad_norm": 2.84375, + "grad_norm": 2.890625, "learning_rate": 1.0999956789979626e-06, - "logits/chosen": -3.0153422355651855, - "logits/rejected": -3.0049679279327393, - "logps/chosen": -180.34591674804688, - "logps/rejected": -207.4092254638672, - "loss": 0.6085, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1334244012832642, - "rewards/margins": 0.2850673496723175, - "rewards/rejected": -1.4184919595718384, + "logits/chosen": -3.00277042388916, + "logits/rejected": -2.9926650524139404, + "logps/chosen": -184.0215301513672, + "logps/rejected": -211.02987670898438, + "loss": 0.6108, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1701806783676147, + "rewards/margins": 0.2845175862312317, + "rewards/rejected": -1.4546983242034912, "step": 4180 }, { "epoch": 0.72, - "grad_norm": 3.53125, + "grad_norm": 3.296875, "learning_rate": 1.0875604295540607e-06, - "logits/chosen": -3.009065628051758, - "logits/rejected": -3.0044589042663574, - "logps/chosen": -181.29495239257812, - "logps/rejected": -211.8234100341797, - "loss": 0.6012, + "logits/chosen": -2.9975829124450684, + "logits/rejected": -2.9934194087982178, + "logps/chosen": -185.83523559570312, + "logps/rejected": -216.00106811523438, + "loss": 0.6053, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1944143772125244, - "rewards/margins": 0.3176630735397339, - "rewards/rejected": -1.5120774507522583, + "rewards/chosen": -1.2398172616958618, + "rewards/margins": 0.314037024974823, + "rewards/rejected": -1.55385422706604, "step": 4190 }, { "epoch": 0.72, - "grad_norm": 2.578125, + "grad_norm": 2.8125, "learning_rate": 1.075176300549387e-06, - "logits/chosen": -3.0220468044281006, - "logits/rejected": -3.01804780960083, - "logps/chosen": -181.4476776123047, - "logps/rejected": -192.96243286132812, - "loss": 0.6554, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.1441035270690918, - "rewards/margins": 0.17912748456001282, - "rewards/rejected": -1.3232312202453613, + "logits/chosen": -3.012768030166626, + "logits/rejected": -3.0091800689697266, + "logps/chosen": -185.5307159423828, + "logps/rejected": -196.47280883789062, + "loss": 0.6602, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1849339008331299, + "rewards/margins": 0.17340119183063507, + "rewards/rejected": -1.3583351373672485, "step": 4200 }, { "epoch": 0.72, - "eval_logits/chosen": -3.0269603729248047, - "eval_logits/rejected": -3.0208635330200195, - "eval_logps/chosen": -164.2754669189453, - "eval_logps/rejected": -184.01255798339844, - "eval_loss": 0.6489173769950867, - "eval_rewards/accuracies": 0.6175650358200073, - "eval_rewards/chosen": -0.928654670715332, - "eval_rewards/margins": 0.16040048003196716, - "eval_rewards/rejected": -1.089055061340332, - "eval_runtime": 484.1713, - "eval_samples_per_second": 8.889, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.015038013458252, + "eval_logits/rejected": -3.0089128017425537, + "eval_logps/chosen": -166.36691284179688, + "eval_logps/rejected": -186.9268341064453, + "eval_loss": 0.6464580297470093, + "eval_rewards/accuracies": 0.6177973747253418, + "eval_rewards/chosen": -0.9495689272880554, + "eval_rewards/margins": 0.16862896084785461, + "eval_rewards/rejected": -1.1181979179382324, + "eval_runtime": 483.9621, + "eval_samples_per_second": 8.893, + "eval_steps_per_second": 1.112, "step": 4200 }, { "epoch": 0.73, - "grad_norm": 3.453125, + "grad_norm": 3.28125, "learning_rate": 1.0628437402028475e-06, - "logits/chosen": -3.0186755657196045, - "logits/rejected": -3.0084311962127686, - "logps/chosen": -184.8686981201172, - "logps/rejected": -200.7421112060547, - "loss": 0.641, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.191839337348938, - "rewards/margins": 0.21558180451393127, - "rewards/rejected": -1.4074211120605469, + "logits/chosen": -3.0106093883514404, + "logits/rejected": -3.0004124641418457, + "logps/chosen": -186.6129913330078, + "logps/rejected": -202.49771118164062, + "loss": 0.6427, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2092821598052979, + "rewards/margins": 0.21569499373435974, + "rewards/rejected": -1.4249770641326904, "step": 4210 }, { "epoch": 0.73, - "grad_norm": 2.390625, + "grad_norm": 2.34375, "learning_rate": 1.0505631948669184e-06, - "logits/chosen": -3.0035483837127686, - "logits/rejected": -2.9989542961120605, - "logps/chosen": -178.74893188476562, - "logps/rejected": -201.6924591064453, - "loss": 0.6239, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1593618392944336, - "rewards/margins": 0.24961857497692108, - "rewards/rejected": -1.408980369567871, + "logits/chosen": -2.998325824737549, + "logits/rejected": -2.994420289993286, + "logps/chosen": -180.66578674316406, + "logps/rejected": -202.93563842773438, + "loss": 0.6281, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1785303354263306, + "rewards/margins": 0.24288196861743927, + "rewards/rejected": -1.4214122295379639, "step": 4220 }, { "epoch": 0.73, - "grad_norm": 3.4375, + "grad_norm": 3.296875, "learning_rate": 1.038335109011498e-06, - "logits/chosen": -3.0282905101776123, - "logits/rejected": -3.024932861328125, - "logps/chosen": -183.4084930419922, - "logps/rejected": -201.07754516601562, - "loss": 0.6335, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1595426797866821, - "rewards/margins": 0.234393909573555, - "rewards/rejected": -1.3939363956451416, + "logits/chosen": -3.022573947906494, + "logits/rejected": -3.018914222717285, + "logps/chosen": -184.13241577148438, + "logps/rejected": -202.57627868652344, + "loss": 0.6306, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1667819023132324, + "rewards/margins": 0.24214200675487518, + "rewards/rejected": -1.408923864364624, "step": 4230 }, { "epoch": 0.73, - "grad_norm": 2.390625, + "grad_norm": 2.296875, "learning_rate": 1.026159925207817e-06, - "logits/chosen": -3.0245633125305176, - "logits/rejected": -3.0180656909942627, - "logps/chosen": -175.83587646484375, - "logps/rejected": -205.62216186523438, - "loss": 0.596, + "logits/chosen": -3.017408609390259, + "logits/rejected": -3.0109002590179443, + "logps/chosen": -178.22518920898438, + "logps/rejected": -208.6528778076172, + "loss": 0.5954, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1032450199127197, - "rewards/margins": 0.32235080003738403, - "rewards/rejected": -1.4255958795547485, + "rewards/chosen": -1.1271382570266724, + "rewards/margins": 0.3287648558616638, + "rewards/rejected": -1.4559029340744019, "step": 4240 }, { "epoch": 0.73, - "grad_norm": 2.890625, + "grad_norm": 2.96875, "learning_rate": 1.014038084112423e-06, - "logits/chosen": -3.018775224685669, - "logits/rejected": -3.015329360961914, - "logps/chosen": -172.9668426513672, - "logps/rejected": -193.0954132080078, - "loss": 0.6107, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.0623455047607422, - "rewards/margins": 0.2572886645793915, - "rewards/rejected": -1.319634199142456, + "logits/chosen": -3.0114645957946777, + "logits/rejected": -3.0079257488250732, + "logps/chosen": -174.28128051757812, + "logps/rejected": -194.43869018554688, + "loss": 0.6103, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0754897594451904, + "rewards/margins": 0.2575770914554596, + "rewards/rejected": -1.3330668210983276, "step": 4250 }, { "epoch": 0.73, - "grad_norm": 2.953125, + "grad_norm": 3.015625, "learning_rate": 1.001970024451229e-06, - "logits/chosen": -3.0206713676452637, - "logits/rejected": -3.011845588684082, - "logps/chosen": -172.6252899169922, - "logps/rejected": -201.23965454101562, - "loss": 0.6096, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0669132471084595, - "rewards/margins": 0.3177719712257385, - "rewards/rejected": -1.3846852779388428, + "logits/chosen": -3.0140953063964844, + "logits/rejected": -3.005549907684326, + "logps/chosen": -174.39950561523438, + "logps/rejected": -202.94818115234375, + "loss": 0.6097, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0846554040908813, + "rewards/margins": 0.31711500883102417, + "rewards/rejected": -1.4017703533172607, "step": 4260 }, { "epoch": 0.74, - "grad_norm": 3.296875, + "grad_norm": 3.125, "learning_rate": 9.899561830036372e-07, - "logits/chosen": -3.020733118057251, - "logits/rejected": -3.012129306793213, - "logps/chosen": -167.78994750976562, - "logps/rejected": -192.33631896972656, - "loss": 0.6085, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.049797773361206, - "rewards/margins": 0.2823103070259094, - "rewards/rejected": -1.3321080207824707, + "logits/chosen": -3.0136845111846924, + "logits/rejected": -3.0040946006774902, + "logps/chosen": -169.96408081054688, + "logps/rejected": -195.17189025878906, + "loss": 0.6073, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.071539044380188, + "rewards/margins": 0.28892484307289124, + "rewards/rejected": -1.3604638576507568, "step": 4270 }, { "epoch": 0.74, - "grad_norm": 2.625, + "grad_norm": 2.5625, "learning_rate": 9.779969945867288e-07, - "logits/chosen": -3.0113422870635986, - "logits/rejected": -3.0036492347717285, - "logps/chosen": -171.27249145507812, - "logps/rejected": -196.91317749023438, - "loss": 0.6161, + "logits/chosen": -3.003387928009033, + "logits/rejected": -2.9958813190460205, + "logps/chosen": -173.567626953125, + "logps/rejected": -199.57489013671875, + "loss": 0.6171, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0888071060180664, - "rewards/margins": 0.2934654653072357, - "rewards/rejected": -1.3822726011276245, + "rewards/chosen": -1.1117585897445679, + "rewards/margins": 0.297131210565567, + "rewards/rejected": -1.4088897705078125, "step": 4280 }, { "epoch": 0.74, - "grad_norm": 3.265625, + "grad_norm": 3.453125, "learning_rate": 9.660928920395274e-07, - "logits/chosen": -3.0022969245910645, - "logits/rejected": -2.994992971420288, - "logps/chosen": -184.11524963378906, - "logps/rejected": -204.52259826660156, - "loss": 0.6256, + "logits/chosen": -2.9939751625061035, + "logits/rejected": -2.9857096672058105, + "logps/chosen": -186.2943115234375, + "logps/rejected": -207.37026977539062, + "loss": 0.6242, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1857540607452393, - "rewards/margins": 0.27106592059135437, - "rewards/rejected": -1.4568201303482056, + "rewards/chosen": -1.2075446844100952, + "rewards/margins": 0.27775177359580994, + "rewards/rejected": -1.4852964878082275, "step": 4290 }, { "epoch": 0.74, - "grad_norm": 2.671875, + "grad_norm": 2.40625, "learning_rate": 9.542443062073337e-07, - "logits/chosen": -3.0421223640441895, - "logits/rejected": -3.0338714122772217, - "logps/chosen": -171.01522827148438, - "logps/rejected": -196.96121215820312, - "loss": 0.6053, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.0753799676895142, - "rewards/margins": 0.2902859151363373, - "rewards/rejected": -1.3656659126281738, + "logits/chosen": -3.03216552734375, + "logits/rejected": -3.0238797664642334, + "logps/chosen": -174.03836059570312, + "logps/rejected": -200.1016387939453, + "loss": 0.6074, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1056114435195923, + "rewards/margins": 0.29145902395248413, + "rewards/rejected": -1.3970704078674316, "step": 4300 }, { "epoch": 0.74, - "eval_logits/chosen": -3.0361945629119873, - "eval_logits/rejected": -3.0303401947021484, - "eval_logps/chosen": -159.97738647460938, - "eval_logps/rejected": -179.44456481933594, - "eval_loss": 0.6488844752311707, - "eval_rewards/accuracies": 0.6096654534339905, - "eval_rewards/chosen": -0.8856736421585083, - "eval_rewards/margins": 0.15770147740840912, - "eval_rewards/rejected": -1.0433752536773682, - "eval_runtime": 484.3352, - "eval_samples_per_second": 8.886, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.030604362487793, + "eval_logits/rejected": -3.0247747898101807, + "eval_logps/chosen": -160.95040893554688, + "eval_logps/rejected": -181.0815887451172, + "eval_loss": 0.6467998027801514, + "eval_rewards/accuracies": 0.6182620525360107, + "eval_rewards/chosen": -0.8954039216041565, + "eval_rewards/margins": 0.16434147953987122, + "eval_rewards/rejected": -1.0597453117370605, + "eval_runtime": 483.7233, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 1.112, "step": 4300 }, { "epoch": 0.74, - "grad_norm": 2.40625, + "grad_norm": 2.390625, "learning_rate": 9.424516659261304e-07, - "logits/chosen": -3.014176368713379, - "logits/rejected": -3.002622127532959, - "logps/chosen": -178.19253540039062, - "logps/rejected": -199.57150268554688, - "loss": 0.6146, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1301661729812622, - "rewards/margins": 0.26318198442459106, - "rewards/rejected": -1.393347978591919, + "logits/chosen": -3.006664276123047, + "logits/rejected": -2.9954018592834473, + "logps/chosen": -180.44845581054688, + "logps/rejected": -201.24253845214844, + "loss": 0.6192, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.152725338935852, + "rewards/margins": 0.2573332190513611, + "rewards/rejected": -1.4100584983825684, "step": 4310 }, { "epoch": 0.74, - "grad_norm": 2.65625, + "grad_norm": 2.8125, "learning_rate": 9.307153980070624e-07, - "logits/chosen": -3.02925443649292, - "logits/rejected": -3.017390727996826, - "logps/chosen": -180.28123474121094, - "logps/rejected": -212.3802947998047, - "loss": 0.5645, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1029917001724243, - "rewards/margins": 0.3897990584373474, - "rewards/rejected": -1.492790699005127, + "logits/chosen": -3.0227952003479004, + "logits/rejected": -3.010831832885742, + "logps/chosen": -182.61170959472656, + "logps/rejected": -214.8519744873047, + "loss": 0.5658, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1262962818145752, + "rewards/margins": 0.39121127128601074, + "rewards/rejected": -1.517507553100586, "step": 4320 }, { "epoch": 0.75, - "grad_norm": 2.671875, + "grad_norm": 2.765625, "learning_rate": 9.190359272209912e-07, - "logits/chosen": -3.015522003173828, - "logits/rejected": -3.007598400115967, - "logps/chosen": -178.08860778808594, - "logps/rejected": -195.5576171875, - "loss": 0.6269, + "logits/chosen": -3.0103302001953125, + "logits/rejected": -3.002786159515381, + "logps/chosen": -179.65603637695312, + "logps/rejected": -197.30091857910156, + "loss": 0.6279, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1294702291488647, - "rewards/margins": 0.24931836128234863, - "rewards/rejected": -1.3787885904312134, + "rewards/chosen": -1.1451447010040283, + "rewards/margins": 0.2510768473148346, + "rewards/rejected": -1.3962215185165405, "step": 4330 }, { "epoch": 0.75, - "grad_norm": 3.046875, + "grad_norm": 3.140625, "learning_rate": 9.074136762831168e-07, - "logits/chosen": -3.0052382946014404, - "logits/rejected": -3.0017242431640625, - "logps/chosen": -173.1341552734375, - "logps/rejected": -204.6593780517578, - "loss": 0.5954, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1217544078826904, - "rewards/margins": 0.33835938572883606, - "rewards/rejected": -1.4601138830184937, + "logits/chosen": -2.9997153282165527, + "logits/rejected": -2.996535062789917, + "logps/chosen": -174.4058380126953, + "logps/rejected": -205.3519287109375, + "loss": 0.5987, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1344711780548096, + "rewards/margins": 0.33256837725639343, + "rewards/rejected": -1.4670393466949463, "step": 4340 }, { "epoch": 0.75, - "grad_norm": 3.03125, + "grad_norm": 3.109375, "learning_rate": 8.958490658376815e-07, - "logits/chosen": -3.0085527896881104, - "logits/rejected": -3.002901554107666, - "logps/chosen": -170.144287109375, - "logps/rejected": -197.52249145507812, - "loss": 0.6131, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.076792597770691, - "rewards/margins": 0.27325382828712463, - "rewards/rejected": -1.350046157836914, + "logits/chosen": -3.005241632461548, + "logits/rejected": -2.999251365661621, + "logps/chosen": -170.48207092285156, + "logps/rejected": -198.5691375732422, + "loss": 0.6109, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0801702737808228, + "rewards/margins": 0.2803425192832947, + "rewards/rejected": -1.3605127334594727, "step": 4350 }, { "epoch": 0.75, - "grad_norm": 2.6875, + "grad_norm": 2.828125, "learning_rate": 8.843425144427442e-07, - "logits/chosen": -3.0132718086242676, - "logits/rejected": -3.00311017036438, - "logps/chosen": -186.8450469970703, - "logps/rejected": -204.13070678710938, - "loss": 0.6485, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.223015546798706, - "rewards/margins": 0.22089993953704834, - "rewards/rejected": -1.4439154863357544, + "logits/chosen": -3.0085055828094482, + "logits/rejected": -2.9987130165100098, + "logps/chosen": -188.10092163085938, + "logps/rejected": -205.174560546875, + "loss": 0.6524, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2355743646621704, + "rewards/margins": 0.21877996623516083, + "rewards/rejected": -1.454354166984558, "step": 4360 }, { "epoch": 0.75, - "grad_norm": 3.453125, + "grad_norm": 3.6875, "learning_rate": 8.728944385550328e-07, - "logits/chosen": -3.0162041187286377, - "logits/rejected": -3.0061447620391846, - "logps/chosen": -179.4248046875, - "logps/rejected": -203.92308044433594, - "loss": 0.6156, + "logits/chosen": -3.0124564170837402, + "logits/rejected": -3.0019426345825195, + "logps/chosen": -180.61471557617188, + "logps/rejected": -204.51443481445312, + "loss": 0.6199, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1377297639846802, - "rewards/margins": 0.26933321356773376, - "rewards/rejected": -1.4070630073547363, + "rewards/chosen": -1.1496288776397705, + "rewards/margins": 0.26334765553474426, + "rewards/rejected": -1.4129765033721924, "step": 4370 }, { "epoch": 0.75, - "grad_norm": 2.546875, + "grad_norm": 2.671875, "learning_rate": 8.615052525148701e-07, - "logits/chosen": -3.0335440635681152, - "logits/rejected": -3.0290024280548096, - "logps/chosen": -178.56539916992188, - "logps/rejected": -197.3820343017578, - "loss": 0.6345, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.1294496059417725, - "rewards/margins": 0.23765477538108826, - "rewards/rejected": -1.3671042919158936, + "logits/chosen": -3.0287554264068604, + "logits/rejected": -3.024423837661743, + "logps/chosen": -179.65756225585938, + "logps/rejected": -197.85726928710938, + "loss": 0.6391, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1403712034225464, + "rewards/margins": 0.2314852774143219, + "rewards/rejected": -1.371856451034546, "step": 4380 }, { "epoch": 0.76, - "grad_norm": 3.328125, + "grad_norm": 3.40625, "learning_rate": 8.501753685311784e-07, - "logits/chosen": -3.0319645404815674, - "logits/rejected": -3.0271618366241455, - "logps/chosen": -176.71009826660156, - "logps/rejected": -203.8400115966797, - "loss": 0.6112, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1303045749664307, - "rewards/margins": 0.28727442026138306, - "rewards/rejected": -1.4175790548324585, + "logits/chosen": -3.0272891521453857, + "logits/rejected": -3.0223135948181152, + "logps/chosen": -176.3140411376953, + "logps/rejected": -203.53335571289062, + "loss": 0.6115, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1263439655303955, + "rewards/margins": 0.28816819190979004, + "rewards/rejected": -1.4145123958587646, "step": 4390 }, { "epoch": 0.76, - "grad_norm": 3.28125, + "grad_norm": 3.3125, "learning_rate": 8.389051966665596e-07, - "logits/chosen": -3.027657985687256, - "logits/rejected": -3.0215001106262207, - "logps/chosen": -182.5607452392578, - "logps/rejected": -203.29652404785156, - "loss": 0.6153, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1382032632827759, - "rewards/margins": 0.2578727602958679, - "rewards/rejected": -1.396075963973999, + "logits/chosen": -3.024461269378662, + "logits/rejected": -3.0175955295562744, + "logps/chosen": -181.77696228027344, + "logps/rejected": -203.70669555664062, + "loss": 0.6105, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1303656101226807, + "rewards/margins": 0.2698122560977936, + "rewards/rejected": -1.4001778364181519, "step": 4400 }, { "epoch": 0.76, - "eval_logits/chosen": -3.0351431369781494, - "eval_logits/rejected": -3.0291762351989746, - "eval_logps/chosen": -160.54696655273438, - "eval_logps/rejected": -180.12347412109375, - "eval_loss": 0.64886873960495, - "eval_rewards/accuracies": 0.6119888424873352, - "eval_rewards/chosen": -0.8913692831993103, - "eval_rewards/margins": 0.1587950885295868, - "eval_rewards/rejected": -1.0501643419265747, - "eval_runtime": 484.4593, - "eval_samples_per_second": 8.884, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.0364904403686523, + "eval_logits/rejected": -3.0306472778320312, + "eval_logps/chosen": -160.46258544921875, + "eval_logps/rejected": -180.574462890625, + "eval_loss": 0.6469578146934509, + "eval_rewards/accuracies": 0.6150093078613281, + "eval_rewards/chosen": -0.8905255794525146, + "eval_rewards/margins": 0.1641487181186676, + "eval_rewards/rejected": -1.0546742677688599, + "eval_runtime": 483.9156, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 4400 }, { "epoch": 0.76, - "grad_norm": 2.734375, + "grad_norm": 2.71875, "learning_rate": 8.276951448224546e-07, - "logits/chosen": -3.0109057426452637, - "logits/rejected": -3.001814126968384, - "logps/chosen": -185.36212158203125, - "logps/rejected": -205.12521362304688, - "loss": 0.6408, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1937808990478516, - "rewards/margins": 0.22498579323291779, - "rewards/rejected": -1.4187666177749634, + "logits/chosen": -3.004551410675049, + "logits/rejected": -2.995788097381592, + "logps/chosen": -186.7025604248047, + "logps/rejected": -206.0946502685547, + "loss": 0.6432, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.20718514919281, + "rewards/margins": 0.22127576172351837, + "rewards/rejected": -1.4284608364105225, "step": 4410 }, { "epoch": 0.76, - "grad_norm": 3.421875, + "grad_norm": 3.59375, "learning_rate": 8.165456187243797e-07, - "logits/chosen": -3.02956485748291, - "logits/rejected": -3.024728536605835, - "logps/chosen": -181.58412170410156, - "logps/rejected": -200.54795837402344, - "loss": 0.6143, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1534773111343384, - "rewards/margins": 0.2757090926170349, - "rewards/rejected": -1.4291863441467285, + "logits/chosen": -3.0231869220733643, + "logits/rejected": -3.0181660652160645, + "logps/chosen": -182.51235961914062, + "logps/rejected": -201.51275634765625, + "loss": 0.6154, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.16275954246521, + "rewards/margins": 0.2760746479034424, + "rewards/rejected": -1.4388344287872314, "step": 4420 }, { "epoch": 0.76, - "grad_norm": 3.65625, + "grad_norm": 3.984375, "learning_rate": 8.054570219072419e-07, - "logits/chosen": -3.0097193717956543, - "logits/rejected": -3.0025272369384766, - "logps/chosen": -175.24148559570312, - "logps/rejected": -196.47402954101562, - "loss": 0.6314, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1144134998321533, - "rewards/margins": 0.2582167387008667, - "rewards/rejected": -1.372630000114441, + "logits/chosen": -3.004575729370117, + "logits/rejected": -2.997720241546631, + "logps/chosen": -176.9400634765625, + "logps/rejected": -197.44618225097656, + "loss": 0.6364, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.131399393081665, + "rewards/margins": 0.2509526312351227, + "rewards/rejected": -1.3823518753051758, "step": 4430 }, { "epoch": 0.77, - "grad_norm": 2.9375, + "grad_norm": 2.71875, "learning_rate": 7.944297557007366e-07, - "logits/chosen": -3.033053398132324, - "logits/rejected": -3.0255160331726074, - "logps/chosen": -187.86953735351562, - "logps/rejected": -211.07113647460938, - "loss": 0.6031, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1804062128067017, - "rewards/margins": 0.3040129244327545, - "rewards/rejected": -1.4844191074371338, + "logits/chosen": -3.0259780883789062, + "logits/rejected": -3.019028663635254, + "logps/chosen": -189.46664428710938, + "logps/rejected": -213.1741485595703, + "loss": 0.6029, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1963773965835571, + "rewards/margins": 0.309071809053421, + "rewards/rejected": -1.5054491758346558, "step": 4440 }, { "epoch": 0.77, - "grad_norm": 2.34375, + "grad_norm": 2.234375, "learning_rate": 7.834642192148151e-07, - "logits/chosen": -3.0207505226135254, - "logits/rejected": -3.013399600982666, - "logps/chosen": -172.2884063720703, - "logps/rejected": -196.8644256591797, - "loss": 0.602, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0546090602874756, - "rewards/margins": 0.29372638463974, - "rewards/rejected": -1.3483353853225708, + "logits/chosen": -3.0177788734436035, + "logits/rejected": -3.010500431060791, + "logps/chosen": -172.21006774902344, + "logps/rejected": -196.91000366210938, + "loss": 0.6027, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0538257360458374, + "rewards/margins": 0.2949651777744293, + "rewards/rejected": -1.3487910032272339, "step": 4450 }, { "epoch": 0.77, - "grad_norm": 2.5625, + "grad_norm": 2.640625, "learning_rate": 7.725608093252496e-07, - "logits/chosen": -3.0331952571868896, - "logits/rejected": -3.024142026901245, - "logps/chosen": -167.3407440185547, - "logps/rejected": -199.12606811523438, - "loss": 0.5877, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0478832721710205, - "rewards/margins": 0.3440885841846466, - "rewards/rejected": -1.3919718265533447, + "logits/chosen": -3.030818223953247, + "logits/rejected": -3.021237850189209, + "logps/chosen": -167.2049102783203, + "logps/rejected": -199.4949188232422, + "loss": 0.5868, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0465247631072998, + "rewards/margins": 0.3491355776786804, + "rewards/rejected": -1.395660161972046, "step": 4460 }, { "epoch": 0.77, "grad_norm": 2.71875, "learning_rate": 7.617199206592584e-07, - "logits/chosen": -3.0438404083251953, - "logits/rejected": -3.0357441902160645, - "logps/chosen": -175.3656768798828, - "logps/rejected": -192.6163787841797, - "loss": 0.6263, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.0873674154281616, - "rewards/margins": 0.25625208020210266, - "rewards/rejected": -1.3436195850372314, + "logits/chosen": -3.0402872562408447, + "logits/rejected": -3.032195568084717, + "logps/chosen": -176.73138427734375, + "logps/rejected": -195.28189086914062, + "loss": 0.6216, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1010245084762573, + "rewards/margins": 0.2692503035068512, + "rewards/rejected": -1.3702747821807861, "step": 4470 }, { "epoch": 0.77, "grad_norm": 4.0625, "learning_rate": 7.509419455812336e-07, - "logits/chosen": -3.050670862197876, - "logits/rejected": -3.042977809906006, - "logps/chosen": -173.09136962890625, - "logps/rejected": -199.6421661376953, - "loss": 0.615, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.115952730178833, - "rewards/margins": 0.2790243625640869, - "rewards/rejected": -1.3949769735336304, + "logits/chosen": -3.045377254486084, + "logits/rejected": -3.0374984741210938, + "logps/chosen": -174.28468322753906, + "logps/rejected": -200.76377868652344, + "loss": 0.6152, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1278858184814453, + "rewards/margins": 0.27830731868743896, + "rewards/rejected": -1.4061931371688843, "step": 4480 }, { "epoch": 0.77, - "grad_norm": 3.0, + "grad_norm": 2.921875, "learning_rate": 7.402272741785322e-07, - "logits/chosen": -3.0218803882598877, - "logits/rejected": -3.0118536949157715, - "logps/chosen": -170.727294921875, - "logps/rejected": -197.43954467773438, - "loss": 0.5931, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0870743989944458, - "rewards/margins": 0.30827030539512634, - "rewards/rejected": -1.3953447341918945, + "logits/chosen": -3.0193753242492676, + "logits/rejected": -3.0095601081848145, + "logps/chosen": -171.40904235839844, + "logps/rejected": -198.89688110351562, + "loss": 0.591, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0938918590545654, + "rewards/margins": 0.3160261809825897, + "rewards/rejected": -1.4099183082580566, "step": 4490 }, { "epoch": 0.78, - "grad_norm": 2.65625, + "grad_norm": 2.546875, "learning_rate": 7.295762942473614e-07, - "logits/chosen": -3.020730972290039, - "logits/rejected": -3.009143352508545, - "logps/chosen": -177.8622589111328, - "logps/rejected": -201.65280151367188, - "loss": 0.6145, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.102785587310791, - "rewards/margins": 0.30618494749069214, - "rewards/rejected": -1.408970594406128, + "logits/chosen": -3.0159997940063477, + "logits/rejected": -3.003957748413086, + "logps/chosen": -178.99685668945312, + "logps/rejected": -203.6125030517578, + "loss": 0.6127, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1141316890716553, + "rewards/margins": 0.31443601846694946, + "rewards/rejected": -1.42856764793396, "step": 4500 }, { "epoch": 0.78, - "eval_logits/chosen": -3.037830114364624, - "eval_logits/rejected": -3.0319135189056396, - "eval_logps/chosen": -160.1719512939453, - "eval_logps/rejected": -179.67282104492188, - "eval_loss": 0.6489555239677429, - "eval_rewards/accuracies": 0.6112918257713318, - "eval_rewards/chosen": -0.8876191973686218, - "eval_rewards/margins": 0.1580386459827423, - "eval_rewards/rejected": -1.0456578731536865, - "eval_runtime": 484.2849, - "eval_samples_per_second": 8.887, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.0338096618652344, + "eval_logits/rejected": -3.027985095977783, + "eval_logps/chosen": -160.40371704101562, + "eval_logps/rejected": -180.48422241210938, + "eval_loss": 0.6470324993133545, + "eval_rewards/accuracies": 0.6182620525360107, + "eval_rewards/chosen": -0.8899369239807129, + "eval_rewards/margins": 0.16383494436740875, + "eval_rewards/rejected": -1.0537718534469604, + "eval_runtime": 483.655, + "eval_samples_per_second": 8.899, + "eval_steps_per_second": 1.112, "step": 4500 }, { "epoch": 0.78, - "grad_norm": 2.734375, + "grad_norm": 2.625, "learning_rate": 7.189893912787424e-07, - "logits/chosen": -3.0372560024261475, - "logits/rejected": -3.023256778717041, - "logps/chosen": -174.68624877929688, - "logps/rejected": -209.28921508789062, - "loss": 0.5836, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.0918270349502563, - "rewards/margins": 0.357189804315567, - "rewards/rejected": -1.449016809463501, + "logits/chosen": -3.0327839851379395, + "logits/rejected": -3.0189476013183594, + "logps/chosen": -175.37973022460938, + "logps/rejected": -210.47427368164062, + "loss": 0.5837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0987615585327148, + "rewards/margins": 0.3621058762073517, + "rewards/rejected": -1.4608676433563232, "step": 4510 }, { "epoch": 0.78, - "grad_norm": 2.671875, + "grad_norm": 2.84375, "learning_rate": 7.084669484445581e-07, - "logits/chosen": -3.0338385105133057, - "logits/rejected": -3.0261270999908447, - "logps/chosen": -177.9006805419922, - "logps/rejected": -195.68893432617188, - "loss": 0.6342, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1469968557357788, - "rewards/margins": 0.22922606766223907, - "rewards/rejected": -1.3762229681015015, + "logits/chosen": -3.0290794372558594, + "logits/rejected": -3.021406412124634, + "logps/chosen": -179.26211547851562, + "logps/rejected": -196.85629272460938, + "loss": 0.6376, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1606110334396362, + "rewards/margins": 0.22728554904460907, + "rewards/rejected": -1.3878967761993408, "step": 4520 }, { "epoch": 0.78, - "grad_norm": 5.59375, + "grad_norm": 5.65625, "learning_rate": 6.980093465836852e-07, - "logits/chosen": -3.0291950702667236, - "logits/rejected": -3.0212788581848145, - "logps/chosen": -176.28085327148438, - "logps/rejected": -194.02845764160156, - "loss": 0.6486, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.120300531387329, - "rewards/margins": 0.21059612929821014, - "rewards/rejected": -1.3308966159820557, + "logits/chosen": -3.0249483585357666, + "logits/rejected": -3.0177531242370605, + "logps/chosen": -177.0812530517578, + "logps/rejected": -194.39430236816406, + "loss": 0.6518, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.128304123878479, + "rewards/margins": 0.20625057816505432, + "rewards/rejected": -1.3345547914505005, "step": 4530 }, { "epoch": 0.78, - "grad_norm": 4.125, + "grad_norm": 4.09375, "learning_rate": 6.876169641882105e-07, - "logits/chosen": -3.0162062644958496, - "logits/rejected": -3.005119800567627, - "logps/chosen": -172.72186279296875, - "logps/rejected": -192.6821746826172, - "loss": 0.6345, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.0865572690963745, - "rewards/margins": 0.2528809905052185, - "rewards/rejected": -1.3394381999969482, + "logits/chosen": -3.0121078491210938, + "logits/rejected": -3.000190019607544, + "logps/chosen": -173.17886352539062, + "logps/rejected": -194.7333526611328, + "loss": 0.6287, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0911271572113037, + "rewards/margins": 0.26882293820381165, + "rewards/rejected": -1.3599501848220825, "step": 4540 }, { "epoch": 0.78, - "grad_norm": 2.859375, + "grad_norm": 2.734375, "learning_rate": 6.772901773897319e-07, - "logits/chosen": -3.0391769409179688, - "logits/rejected": -3.0284054279327393, - "logps/chosen": -175.7732391357422, - "logps/rejected": -200.0652618408203, - "loss": 0.6028, + "logits/chosen": -3.034879207611084, + "logits/rejected": -3.024160385131836, + "logps/chosen": -177.19334411621094, + "logps/rejected": -202.35165405273438, + "loss": 0.5996, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.075699806213379, - "rewards/margins": 0.3025535047054291, - "rewards/rejected": -1.37825345993042, + "rewards/chosen": -1.0899009704589844, + "rewards/margins": 0.31121626496315, + "rewards/rejected": -1.4011173248291016, "step": 4550 }, { "epoch": 0.79, - "grad_norm": 2.5, + "grad_norm": 2.484375, "learning_rate": 6.670293599457459e-07, - "logits/chosen": -3.0181899070739746, - "logits/rejected": -3.0071330070495605, - "logps/chosen": -173.14572143554688, - "logps/rejected": -197.73306274414062, - "loss": 0.6003, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.064906358718872, - "rewards/margins": 0.30180588364601135, - "rewards/rejected": -1.366712212562561, + "logits/chosen": -3.0138959884643555, + "logits/rejected": -3.0035240650177, + "logps/chosen": -173.7094268798828, + "logps/rejected": -199.3247833251953, + "loss": 0.5977, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0705434083938599, + "rewards/margins": 0.31208616495132446, + "rewards/rejected": -1.382629632949829, "step": 4560 }, { "epoch": 0.79, - "grad_norm": 3.890625, + "grad_norm": 3.71875, "learning_rate": 6.568348832261174e-07, - "logits/chosen": -3.027233839035034, - "logits/rejected": -3.020333766937256, - "logps/chosen": -179.43515014648438, - "logps/rejected": -210.864013671875, - "loss": 0.6053, + "logits/chosen": -3.0246381759643555, + "logits/rejected": -3.0175058841705322, + "logps/chosen": -179.7490997314453, + "logps/rejected": -212.00320434570312, + "loss": 0.6024, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1779333353042603, - "rewards/margins": 0.3318426311016083, - "rewards/rejected": -1.5097758769989014, + "rewards/chosen": -1.1810725927352905, + "rewards/margins": 0.3400949537754059, + "rewards/rejected": -1.521167516708374, "step": 4570 }, { "epoch": 0.79, - "grad_norm": 2.34375, + "grad_norm": 2.4375, "learning_rate": 6.467071161996447e-07, - "logits/chosen": -3.0118486881256104, - "logits/rejected": -3.0026967525482178, - "logps/chosen": -167.81613159179688, - "logps/rejected": -190.34109497070312, - "loss": 0.6067, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0197126865386963, - "rewards/margins": 0.27357515692710876, - "rewards/rejected": -1.293287992477417, + "logits/chosen": -3.007577657699585, + "logits/rejected": -2.9982829093933105, + "logps/chosen": -169.00662231445312, + "logps/rejected": -191.3984832763672, + "loss": 0.6086, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0316174030303955, + "rewards/margins": 0.272244393825531, + "rewards/rejected": -1.3038618564605713, "step": 4580 }, { "epoch": 0.79, - "grad_norm": 3.453125, + "grad_norm": 3.53125, "learning_rate": 6.366464254206966e-07, - "logits/chosen": -3.031846523284912, - "logits/rejected": -3.024355411529541, - "logps/chosen": -179.59120178222656, - "logps/rejected": -197.9866180419922, - "loss": 0.6412, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.1390419006347656, - "rewards/margins": 0.23965242505073547, - "rewards/rejected": -1.3786942958831787, + "logits/chosen": -3.0268683433532715, + "logits/rejected": -3.019697904586792, + "logps/chosen": -181.07965087890625, + "logps/rejected": -199.53695678710938, + "loss": 0.6443, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1539264917373657, + "rewards/margins": 0.2402711808681488, + "rewards/rejected": -1.394197702407837, "step": 4590 }, { "epoch": 0.79, - "grad_norm": 3.046875, + "grad_norm": 3.0, "learning_rate": 6.266531750159557e-07, - "logits/chosen": -3.0279648303985596, - "logits/rejected": -3.0112688541412354, - "logps/chosen": -174.84622192382812, - "logps/rejected": -210.6271514892578, + "logits/chosen": -3.022550106048584, + "logits/rejected": -3.0063700675964355, + "logps/chosen": -175.89822387695312, + "logps/rejected": -211.6148223876953, "loss": 0.5798, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1091092824935913, - "rewards/margins": 0.3708891272544861, - "rewards/rejected": -1.4799985885620117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1196292638778687, + "rewards/margins": 0.37024620175361633, + "rewards/rejected": -1.4898754358291626, "step": 4600 }, { "epoch": 0.79, - "eval_logits/chosen": -3.0308055877685547, - "eval_logits/rejected": -3.0247178077697754, - "eval_logps/chosen": -162.68133544921875, - "eval_logps/rejected": -182.4701385498047, - "eval_loss": 0.6488083004951477, - "eval_rewards/accuracies": 0.6147769689559937, - "eval_rewards/chosen": -0.9127131104469299, - "eval_rewards/margins": 0.16091784834861755, - "eval_rewards/rejected": -1.0736308097839355, - "eval_runtime": 483.7515, + "eval_logits/chosen": -3.0254907608032227, + "eval_logits/rejected": -3.019495964050293, + "eval_logps/chosen": -162.6863555908203, + "eval_logps/rejected": -183.03439331054688, + "eval_loss": 0.6467684507369995, + "eval_rewards/accuracies": 0.6208178400993347, + "eval_rewards/chosen": -0.9127631783485413, + "eval_rewards/margins": 0.16651012003421783, + "eval_rewards/rejected": -1.0792733430862427, + "eval_runtime": 483.7395, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 4600 }, { "epoch": 0.79, - "grad_norm": 3.15625, + "grad_norm": 3.296875, "learning_rate": 6.167277266712293e-07, - "logits/chosen": -3.006178379058838, - "logits/rejected": -3.000431776046753, - "logps/chosen": -184.13197326660156, - "logps/rejected": -201.56419372558594, - "loss": 0.6495, + "logits/chosen": -3.0018093585968018, + "logits/rejected": -2.995985269546509, + "logps/chosen": -184.42770385742188, + "logps/rejected": -202.32168579101562, + "loss": 0.6484, "rewards/accuracies": 0.625, - "rewards/chosen": -1.224673867225647, - "rewards/margins": 0.20185346901416779, - "rewards/rejected": -1.4265271425247192, + "rewards/chosen": -1.227630615234375, + "rewards/margins": 0.2064712941646576, + "rewards/rejected": -1.4341020584106445, "step": 4610 }, { "epoch": 0.8, - "grad_norm": 3.375, + "grad_norm": 3.109375, "learning_rate": 6.068704396183694e-07, - "logits/chosen": -3.0257067680358887, - "logits/rejected": -3.0185368061065674, - "logps/chosen": -173.94790649414062, - "logps/rejected": -199.21034240722656, - "loss": 0.5994, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.074373722076416, - "rewards/margins": 0.2779634892940521, - "rewards/rejected": -1.3523372411727905, + "logits/chosen": -3.023332118988037, + "logits/rejected": -3.015925168991089, + "logps/chosen": -174.80226135253906, + "logps/rejected": -200.43743896484375, + "loss": 0.5993, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.082917332649231, + "rewards/margins": 0.28169089555740356, + "rewards/rejected": -1.3646082878112793, "step": 4620 }, { "epoch": 0.8, - "grad_norm": 2.859375, + "grad_norm": 2.84375, "learning_rate": 5.970816706222604e-07, - "logits/chosen": -3.021345853805542, - "logits/rejected": -3.0148777961730957, - "logps/chosen": -186.1139678955078, - "logps/rejected": -213.1942138671875, - "loss": 0.5979, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1828062534332275, - "rewards/margins": 0.3105699419975281, - "rewards/rejected": -1.4933760166168213, + "logits/chosen": -3.019421100616455, + "logits/rejected": -3.0130579471588135, + "logps/chosen": -186.56312561035156, + "logps/rejected": -214.2733917236328, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.187298059463501, + "rewards/margins": 0.31686994433403015, + "rewards/rejected": -1.504167914390564, "step": 4630 }, { "epoch": 0.8, - "grad_norm": 2.734375, + "grad_norm": 2.640625, "learning_rate": 5.873617739679172e-07, - "logits/chosen": -3.010087251663208, - "logits/rejected": -3.0054357051849365, - "logps/chosen": -188.48785400390625, - "logps/rejected": -210.8941650390625, - "loss": 0.6287, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2335959672927856, - "rewards/margins": 0.24029798805713654, - "rewards/rejected": -1.4738938808441162, + "logits/chosen": -3.006319522857666, + "logits/rejected": -3.0017735958099365, + "logps/chosen": -191.32012939453125, + "logps/rejected": -214.06747436523438, + "loss": 0.6296, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2619186639785767, + "rewards/margins": 0.24370841681957245, + "rewards/rejected": -1.5056270360946655, "step": 4640 }, { "epoch": 0.8, "grad_norm": 2.734375, "learning_rate": 5.77711101447652e-07, - "logits/chosen": -3.0194153785705566, - "logits/rejected": -3.0138816833496094, - "logps/chosen": -182.27523803710938, - "logps/rejected": -202.95693969726562, - "loss": 0.6223, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1761658191680908, - "rewards/margins": 0.2618311941623688, - "rewards/rejected": -1.4379971027374268, + "logits/chosen": -3.0137367248535156, + "logits/rejected": -3.00858998298645, + "logps/chosen": -183.54359436035156, + "logps/rejected": -204.957275390625, + "loss": 0.6188, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1888493299484253, + "rewards/margins": 0.2691509425640106, + "rewards/rejected": -1.4580004215240479, "step": 4650 }, { "epoch": 0.8, - "grad_norm": 2.890625, + "grad_norm": 2.6875, "learning_rate": 5.681300023483521e-07, - "logits/chosen": -3.02057147026062, - "logits/rejected": -3.013674020767212, - "logps/chosen": -180.13131713867188, - "logps/rejected": -201.14138793945312, - "loss": 0.6135, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1474506855010986, - "rewards/margins": 0.2440837174654007, - "rewards/rejected": -1.391534447669983, + "logits/chosen": -3.017210006713867, + "logits/rejected": -3.0096325874328613, + "logps/chosen": -180.97731018066406, + "logps/rejected": -202.33935546875, + "loss": 0.613, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1559107303619385, + "rewards/margins": 0.2476036101579666, + "rewards/rejected": -1.4035141468048096, "step": 4660 }, { "epoch": 0.8, - "grad_norm": 3.140625, + "grad_norm": 2.90625, "learning_rate": 5.586188234388306e-07, - "logits/chosen": -3.022001028060913, - "logits/rejected": -3.0123629570007324, - "logps/chosen": -172.04263305664062, - "logps/rejected": -200.15243530273438, - "loss": 0.5797, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0928536653518677, - "rewards/margins": 0.3391774594783783, - "rewards/rejected": -1.4320310354232788, + "logits/chosen": -3.0169410705566406, + "logits/rejected": -3.0076098442077637, + "logps/chosen": -173.56065368652344, + "logps/rejected": -200.5047607421875, + "loss": 0.5863, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1080338954925537, + "rewards/margins": 0.3275201916694641, + "rewards/rejected": -1.4355541467666626, "step": 4670 }, { "epoch": 0.81, - "grad_norm": 2.96875, + "grad_norm": 2.765625, "learning_rate": 5.491779089572793e-07, - "logits/chosen": -3.0283255577087402, - "logits/rejected": -3.0246620178222656, - "logps/chosen": -179.98556518554688, - "logps/rejected": -201.14598083496094, - "loss": 0.6366, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1617043018341064, - "rewards/margins": 0.24819931387901306, - "rewards/rejected": -1.409903645515442, + "logits/chosen": -3.0221850872039795, + "logits/rejected": -3.019211769104004, + "logps/chosen": -181.35293579101562, + "logps/rejected": -202.7132568359375, + "loss": 0.6375, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1753777265548706, + "rewards/margins": 0.2501986026763916, + "rewards/rejected": -1.4255764484405518, "step": 4680 }, { "epoch": 0.81, - "grad_norm": 2.59375, + "grad_norm": 2.359375, "learning_rate": 5.398076005988082e-07, - "logits/chosen": -3.0354971885681152, - "logits/rejected": -3.0243539810180664, - "logps/chosen": -184.61776733398438, - "logps/rejected": -217.5264892578125, - "loss": 0.592, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1839282512664795, - "rewards/margins": 0.3616673946380615, - "rewards/rejected": -1.5455955266952515, + "logits/chosen": -3.0308825969696045, + "logits/rejected": -3.0206403732299805, + "logps/chosen": -186.6603240966797, + "logps/rejected": -218.62936401367188, + "loss": 0.5978, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.204353928565979, + "rewards/margins": 0.3522704541683197, + "rewards/rejected": -1.556624412536621, "step": 4690 }, { "epoch": 0.81, - "grad_norm": 3.671875, + "grad_norm": 3.71875, "learning_rate": 5.305082375030798e-07, - "logits/chosen": -3.0186095237731934, - "logits/rejected": -3.010040044784546, - "logps/chosen": -179.74301147460938, - "logps/rejected": -208.00949096679688, - "loss": 0.6218, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.171088457107544, - "rewards/margins": 0.2796909809112549, - "rewards/rejected": -1.4507795572280884, + "logits/chosen": -3.0150041580200195, + "logits/rejected": -3.006856679916382, + "logps/chosen": -181.75979614257812, + "logps/rejected": -209.6063232421875, + "loss": 0.6228, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1912561655044556, + "rewards/margins": 0.2754915654659271, + "rewards/rejected": -1.466747760772705, "step": 4700 }, { "epoch": 0.81, - "eval_logits/chosen": -3.0306708812713623, - "eval_logits/rejected": -3.02455735206604, - "eval_logps/chosen": -163.04933166503906, - "eval_logps/rejected": -182.94818115234375, - "eval_loss": 0.6485710740089417, - "eval_rewards/accuracies": 0.6152416467666626, - "eval_rewards/chosen": -0.916392982006073, - "eval_rewards/margins": 0.1620185673236847, - "eval_rewards/rejected": -1.07841157913208, - "eval_runtime": 484.3564, - "eval_samples_per_second": 8.886, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.0290753841400146, + "eval_logits/rejected": -3.0230636596679688, + "eval_logps/chosen": -163.55618286132812, + "eval_logps/rejected": -184.06402587890625, + "eval_loss": 0.6466771960258484, + "eval_rewards/accuracies": 0.6191914677619934, + "eval_rewards/chosen": -0.9214615821838379, + "eval_rewards/margins": 0.1681082397699356, + "eval_rewards/rejected": -1.0895699262619019, + "eval_runtime": 483.8875, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.112, "step": 4700 }, { "epoch": 0.81, - "grad_norm": 3.1875, + "grad_norm": 3.203125, "learning_rate": 5.212801562420342e-07, - "logits/chosen": -3.0316972732543945, - "logits/rejected": -3.0217230319976807, - "logps/chosen": -181.59439086914062, - "logps/rejected": -206.296875, - "loss": 0.5887, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.130401372909546, - "rewards/margins": 0.3333713412284851, - "rewards/rejected": -1.4637725353240967, + "logits/chosen": -3.027268409729004, + "logits/rejected": -3.017737627029419, + "logps/chosen": -182.80491638183594, + "logps/rejected": -208.4292449951172, + "loss": 0.5857, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1425063610076904, + "rewards/margins": 0.3425898849964142, + "rewards/rejected": -1.4850962162017822, "step": 4710 }, { "epoch": 0.81, - "grad_norm": 2.9375, + "grad_norm": 3.15625, "learning_rate": 5.121236908077063e-07, - "logits/chosen": -3.0099377632141113, - "logits/rejected": -3.002049207687378, - "logps/chosen": -181.88710021972656, - "logps/rejected": -211.2221221923828, - "loss": 0.5857, + "logits/chosen": -3.0060229301452637, + "logits/rejected": -2.9983930587768555, + "logps/chosen": -183.74058532714844, + "logps/rejected": -213.54690551757812, + "loss": 0.5862, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1480029821395874, - "rewards/margins": 0.3564620912075043, - "rewards/rejected": -1.5044652223587036, + "rewards/chosen": -1.1665380001068115, + "rewards/margins": 0.361175000667572, + "rewards/rejected": -1.5277130603790283, "step": 4720 }, { "epoch": 0.81, - "grad_norm": 3.25, + "grad_norm": 3.328125, "learning_rate": 5.030391726001394e-07, - "logits/chosen": -3.00630521774292, - "logits/rejected": -2.998715877532959, - "logps/chosen": -180.12782287597656, - "logps/rejected": -203.79656982421875, - "loss": 0.6242, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1553746461868286, - "rewards/margins": 0.2904283106327057, - "rewards/rejected": -1.445802927017212, + "logits/chosen": -3.003763198852539, + "logits/rejected": -2.9963369369506836, + "logps/chosen": -181.78604125976562, + "logps/rejected": -205.4948272705078, + "loss": 0.6263, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.171957015991211, + "rewards/margins": 0.2908283770084381, + "rewards/rejected": -1.462785243988037, "step": 4730 }, { "epoch": 0.82, - "grad_norm": 3.125, + "grad_norm": 3.21875, "learning_rate": 4.940269304153919e-07, - "logits/chosen": -3.0037386417388916, - "logits/rejected": -2.993199586868286, - "logps/chosen": -173.32028198242188, - "logps/rejected": -207.9495391845703, - "loss": 0.5761, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.065929651260376, - "rewards/margins": 0.38310036063194275, - "rewards/rejected": -1.4490301609039307, + "logits/chosen": -2.9984312057495117, + "logits/rejected": -2.9876062870025635, + "logps/chosen": -174.63938903808594, + "logps/rejected": -210.2099609375, + "loss": 0.5725, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0791208744049072, + "rewards/margins": 0.3925134241580963, + "rewards/rejected": -1.4716343879699707, "step": 4740 }, { "epoch": 0.82, - "grad_norm": 2.859375, + "grad_norm": 3.078125, "learning_rate": 4.850872904336307e-07, - "logits/chosen": -3.0079355239868164, - "logits/rejected": -3.0077691078186035, - "logps/chosen": -180.9278564453125, - "logps/rejected": -201.39138793945312, - "loss": 0.6212, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1507413387298584, - "rewards/margins": 0.23052909970283508, - "rewards/rejected": -1.3812705278396606, + "logits/chosen": -3.005293607711792, + "logits/rejected": -3.0054850578308105, + "logps/chosen": -181.31361389160156, + "logps/rejected": -202.7025604248047, + "loss": 0.6171, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1545989513397217, + "rewards/margins": 0.23978309333324432, + "rewards/rejected": -1.394382119178772, "step": 4750 }, { "epoch": 0.82, - "grad_norm": 2.890625, + "grad_norm": 2.84375, "learning_rate": 4.762205762073363e-07, - "logits/chosen": -3.006566286087036, - "logits/rejected": -2.997593402862549, - "logps/chosen": -180.80429077148438, - "logps/rejected": -209.4434051513672, - "loss": 0.599, + "logits/chosen": -3.003577947616577, + "logits/rejected": -2.994811534881592, + "logps/chosen": -182.7504425048828, + "logps/rejected": -210.9694366455078, + "loss": 0.6039, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1853374242782593, - "rewards/margins": 0.31632545590400696, - "rewards/rejected": -1.5016629695892334, + "rewards/chosen": -1.204798698425293, + "rewards/margins": 0.3121243119239807, + "rewards/rejected": -1.516923189163208, "step": 4760 }, { "epoch": 0.82, - "grad_norm": 2.578125, + "grad_norm": 2.640625, "learning_rate": 4.6742710864958103e-07, - "logits/chosen": -3.026768684387207, - "logits/rejected": -3.0133731365203857, - "logps/chosen": -187.4033660888672, - "logps/rejected": -210.99179077148438, - "loss": 0.6107, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1747559309005737, - "rewards/margins": 0.31086036562919617, - "rewards/rejected": -1.4856163263320923, + "logits/chosen": -3.023016929626465, + "logits/rejected": -3.009099006652832, + "logps/chosen": -189.5552215576172, + "logps/rejected": -213.70492553710938, + "loss": 0.6092, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1962745189666748, + "rewards/margins": 0.3164735436439514, + "rewards/rejected": -1.5127480030059814, "step": 4770 }, { "epoch": 0.82, - "grad_norm": 2.984375, + "grad_norm": 3.25, "learning_rate": 4.5870720602242513e-07, - "logits/chosen": -3.0101070404052734, - "logits/rejected": -2.9985129833221436, - "logps/chosen": -177.30874633789062, - "logps/rejected": -208.9322509765625, - "loss": 0.589, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1518343687057495, - "rewards/margins": 0.321411669254303, - "rewards/rejected": -1.4732460975646973, + "logits/chosen": -3.0070056915283203, + "logits/rejected": -2.9952831268310547, + "logps/chosen": -178.74525451660156, + "logps/rejected": -210.75643920898438, + "loss": 0.5898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1661994457244873, + "rewards/margins": 0.3252885341644287, + "rewards/rejected": -1.4914880990982056, "step": 4780 }, { "epoch": 0.83, - "grad_norm": 2.6875, + "grad_norm": 2.75, "learning_rate": 4.500611839253871e-07, - "logits/chosen": -3.0094149112701416, - "logits/rejected": -3.0022799968719482, - "logps/chosen": -188.61123657226562, - "logps/rejected": -201.9243621826172, - "loss": 0.6452, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2073025703430176, - "rewards/margins": 0.20692972838878632, - "rewards/rejected": -1.4142323732376099, + "logits/chosen": -3.0066330432891846, + "logits/rejected": -2.999293327331543, + "logps/chosen": -189.28778076171875, + "logps/rejected": -203.16702270507812, + "loss": 0.6432, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.214068055152893, + "rewards/margins": 0.21259085834026337, + "rewards/rejected": -1.4266588687896729, "step": 4790 }, { "epoch": 0.83, - "grad_norm": 2.5625, + "grad_norm": 2.546875, "learning_rate": 4.4148935528403244e-07, - "logits/chosen": -2.9978463649749756, - "logits/rejected": -2.9894039630889893, - "logps/chosen": -180.93643188476562, - "logps/rejected": -207.8934326171875, - "loss": 0.6102, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1843605041503906, - "rewards/margins": 0.2998817563056946, - "rewards/rejected": -1.4842422008514404, + "logits/chosen": -2.9953420162200928, + "logits/rejected": -2.9865942001342773, + "logps/chosen": -182.12466430664062, + "logps/rejected": -209.14828491210938, + "loss": 0.6131, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1962429285049438, + "rewards/margins": 0.30054768919944763, + "rewards/rejected": -1.4967906475067139, "step": 4800 }, { "epoch": 0.83, - "eval_logits/chosen": -3.025871753692627, - "eval_logits/rejected": -3.0196568965911865, - "eval_logps/chosen": -164.8939208984375, - "eval_logps/rejected": -184.97686767578125, - "eval_loss": 0.6484309434890747, - "eval_rewards/accuracies": 0.6150093078613281, - "eval_rewards/chosen": -0.934839129447937, - "eval_rewards/margins": 0.16385912895202637, - "eval_rewards/rejected": -1.0986981391906738, - "eval_runtime": 484.0715, - "eval_samples_per_second": 8.891, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.020195960998535, + "eval_logits/rejected": -3.014082670211792, + "eval_logps/chosen": -165.31649780273438, + "eval_logps/rejected": -186.017578125, + "eval_loss": 0.6466081738471985, + "eval_rewards/accuracies": 0.6198884844779968, + "eval_rewards/chosen": -0.9390648603439331, + "eval_rewards/margins": 0.1700403094291687, + "eval_rewards/rejected": -1.1091052293777466, + "eval_runtime": 483.8577, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.112, "step": 4800 }, { "epoch": 0.83, - "grad_norm": 3.09375, + "grad_norm": 2.765625, "learning_rate": 4.3299203033863643e-07, - "logits/chosen": -3.003702163696289, - "logits/rejected": -2.996516704559326, - "logps/chosen": -183.0110321044922, - "logps/rejected": -206.063232421875, - "loss": 0.5994, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.151745319366455, - "rewards/margins": 0.31077447533607483, - "rewards/rejected": -1.462519645690918, + "logits/chosen": -3.0014617443084717, + "logits/rejected": -2.9942636489868164, + "logps/chosen": -184.14466857910156, + "logps/rejected": -207.19735717773438, + "loss": 0.601, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1630815267562866, + "rewards/margins": 0.31077930331230164, + "rewards/rejected": -1.4738609790802002, "step": 4810 }, { "epoch": 0.83, - "grad_norm": 3.671875, + "grad_norm": 3.234375, "learning_rate": 4.245695166329661e-07, - "logits/chosen": -3.0186033248901367, - "logits/rejected": -3.013110399246216, - "logps/chosen": -175.73336791992188, - "logps/rejected": -201.29922485351562, - "loss": 0.6165, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1273748874664307, - "rewards/margins": 0.2871752381324768, - "rewards/rejected": -1.4145500659942627, + "logits/chosen": -3.0155398845672607, + "logits/rejected": -3.00993013381958, + "logps/chosen": -176.3240966796875, + "logps/rejected": -202.77798461914062, + "loss": 0.6134, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1332825422286987, + "rewards/margins": 0.29605549573898315, + "rewards/rejected": -1.429337978363037, "step": 4820 }, { "epoch": 0.83, - "grad_norm": 2.859375, + "grad_norm": 3.015625, "learning_rate": 4.1622211900314235e-07, - "logits/chosen": -3.0206007957458496, - "logits/rejected": -3.0085816383361816, - "logps/chosen": -178.76016235351562, - "logps/rejected": -201.3628692626953, - "loss": 0.6258, + "logits/chosen": -3.0176029205322266, + "logits/rejected": -3.0059218406677246, + "logps/chosen": -178.7200927734375, + "logps/rejected": -202.0586395263672, + "loss": 0.6233, "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1388037204742432, - "rewards/margins": 0.26333537697792053, - "rewards/rejected": -1.4021390676498413, + "rewards/chosen": -1.1384029388427734, + "rewards/margins": 0.27069368958473206, + "rewards/rejected": -1.4090964794158936, "step": 4830 }, { "epoch": 0.83, - "grad_norm": 2.40625, + "grad_norm": 2.328125, "learning_rate": 4.0795013956660884e-07, - "logits/chosen": -3.0116143226623535, - "logits/rejected": -2.9952967166900635, - "logps/chosen": -187.82388305664062, - "logps/rejected": -217.6949005126953, - "loss": 0.5905, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2078577280044556, - "rewards/margins": 0.3612230718135834, - "rewards/rejected": -1.5690808296203613, + "logits/chosen": -3.0080854892730713, + "logits/rejected": -2.9921693801879883, + "logps/chosen": -189.64425659179688, + "logps/rejected": -220.44540405273438, + "loss": 0.5898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.226061463356018, + "rewards/margins": 0.37052440643310547, + "rewards/rejected": -1.5965858697891235, "step": 4840 }, { "epoch": 0.84, - "grad_norm": 2.59375, + "grad_norm": 2.546875, "learning_rate": 3.9975387771119925e-07, - "logits/chosen": -3.0145621299743652, - "logits/rejected": -3.007615327835083, - "logps/chosen": -176.22604370117188, - "logps/rejected": -202.64767456054688, - "loss": 0.6038, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1093838214874268, - "rewards/margins": 0.2972642481327057, - "rewards/rejected": -1.4066479206085205, + "logits/chosen": -3.0115370750427246, + "logits/rejected": -3.0042665004730225, + "logps/chosen": -176.58824157714844, + "logps/rejected": -204.17892456054688, + "loss": 0.6, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1130057573318481, + "rewards/margins": 0.30895477533340454, + "rewards/rejected": -1.421960473060608, "step": 4850 }, { "epoch": 0.84, - "grad_norm": 3.59375, + "grad_norm": 3.640625, "learning_rate": 3.916336300842988e-07, - "logits/chosen": -3.0211424827575684, - "logits/rejected": -3.013709545135498, - "logps/chosen": -187.14520263671875, - "logps/rejected": -200.6242218017578, - "loss": 0.6361, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1789805889129639, - "rewards/margins": 0.23755235970020294, - "rewards/rejected": -1.41653311252594, + "logits/chosen": -3.016387701034546, + "logits/rejected": -3.008925199508667, + "logps/chosen": -188.9263916015625, + "logps/rejected": -202.9337158203125, + "loss": 0.6344, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1967926025390625, + "rewards/margins": 0.24283523857593536, + "rewards/rejected": -1.4396278858184814, "step": 4860 }, { "epoch": 0.84, - "grad_norm": 3.125, + "grad_norm": 3.140625, "learning_rate": 3.8358969058210957e-07, - "logits/chosen": -3.0137581825256348, - "logits/rejected": -3.005734920501709, - "logps/chosen": -189.39662170410156, - "logps/rejected": -210.9857940673828, - "loss": 0.61, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2036226987838745, - "rewards/margins": 0.3072102665901184, - "rewards/rejected": -1.5108331441879272, + "logits/chosen": -3.0090579986572266, + "logits/rejected": -3.0010299682617188, + "logps/chosen": -190.43182373046875, + "logps/rejected": -213.4534912109375, + "loss": 0.6068, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2139747142791748, + "rewards/margins": 0.32153528928756714, + "rewards/rejected": -1.5355100631713867, "step": 4870 }, { "epoch": 0.84, - "grad_norm": 2.71875, + "grad_norm": 2.875, "learning_rate": 3.7562235033901273e-07, - "logits/chosen": -3.0142416954040527, - "logits/rejected": -3.006392240524292, - "logps/chosen": -176.6653289794922, - "logps/rejected": -199.0810089111328, - "loss": 0.6117, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1300135850906372, - "rewards/margins": 0.28717657923698425, - "rewards/rejected": -1.4171901941299438, + "logits/chosen": -3.0088894367218018, + "logits/rejected": -3.0016369819641113, + "logps/chosen": -179.21157836914062, + "logps/rejected": -201.3288116455078, + "loss": 0.6141, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1554762125015259, + "rewards/margins": 0.2841919958591461, + "rewards/rejected": -1.4396681785583496, "step": 4880 }, { "epoch": 0.84, "grad_norm": 3.328125, "learning_rate": 3.677318977170324e-07, - "logits/chosen": -3.0293221473693848, - "logits/rejected": -3.0215888023376465, - "logps/chosen": -181.97988891601562, - "logps/rejected": -212.58010864257812, - "loss": 0.5876, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1641104221343994, - "rewards/margins": 0.34244081377983093, - "rewards/rejected": -1.5065511465072632, + "logits/chosen": -3.0278687477111816, + "logits/rejected": -3.020611047744751, + "logps/chosen": -182.8571319580078, + "logps/rejected": -213.39736938476562, + "loss": 0.5899, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1728826761245728, + "rewards/margins": 0.3418412506580353, + "rewards/rejected": -1.514723777770996, "step": 4890 }, { "epoch": 0.84, "grad_norm": 2.765625, "learning_rate": 3.599186182953973e-07, - "logits/chosen": -3.0197079181671143, - "logits/rejected": -3.009781837463379, - "logps/chosen": -179.6743621826172, - "logps/rejected": -203.55433654785156, - "loss": 0.6176, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1642951965332031, - "rewards/margins": 0.29456597566604614, - "rewards/rejected": -1.4588611125946045, + "logits/chosen": -3.0165491104125977, + "logits/rejected": -3.0070672035217285, + "logps/chosen": -181.3474578857422, + "logps/rejected": -204.9975128173828, + "loss": 0.6215, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1810262203216553, + "rewards/margins": 0.29226669669151306, + "rewards/rejected": -1.4732929468154907, "step": 4900 }, { "epoch": 0.84, - "eval_logits/chosen": -3.0273115634918213, - "eval_logits/rejected": -3.021085739135742, - "eval_logps/chosen": -165.75540161132812, - "eval_logps/rejected": -185.9427947998047, - "eval_loss": 0.6482663154602051, - "eval_rewards/accuracies": 0.6157063245773315, - "eval_rewards/chosen": -0.9434537291526794, - "eval_rewards/margins": 0.16490375995635986, - "eval_rewards/rejected": -1.1083574295043945, - "eval_runtime": 484.1991, - "eval_samples_per_second": 8.889, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.024120807647705, + "eval_logits/rejected": -3.0179731845855713, + "eval_logps/chosen": -166.19190979003906, + "eval_logps/rejected": -186.99465942382812, + "eval_loss": 0.6464829444885254, + "eval_rewards/accuracies": 0.6196561455726624, + "eval_rewards/chosen": -0.94781893491745, + "eval_rewards/margins": 0.1710570603609085, + "eval_rewards/rejected": -1.1188760995864868, + "eval_runtime": 483.8313, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 1.112, "step": 4900 }, { "epoch": 0.85, - "grad_norm": 3.1875, + "grad_norm": 3.40625, "learning_rate": 3.5218279486020605e-07, - "logits/chosen": -3.0414328575134277, - "logits/rejected": -3.034557342529297, - "logps/chosen": -179.6194305419922, - "logps/rejected": -204.5117950439453, - "loss": 0.6038, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1279804706573486, - "rewards/margins": 0.3157906234264374, - "rewards/rejected": -1.4437711238861084, + "logits/chosen": -3.0371718406677246, + "logits/rejected": -3.0303807258605957, + "logps/chosen": -181.00103759765625, + "logps/rejected": -205.81491088867188, + "loss": 0.6047, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.141796350479126, + "rewards/margins": 0.31500619649887085, + "rewards/rejected": -1.4568026065826416, "step": 4910 }, { "epoch": 0.85, - "grad_norm": 2.453125, + "grad_norm": 2.546875, "learning_rate": 3.445247073941932e-07, - "logits/chosen": -2.9959917068481445, - "logits/rejected": -2.981940746307373, - "logps/chosen": -184.15304565429688, - "logps/rejected": -220.98568725585938, - "loss": 0.5778, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.197116494178772, - "rewards/margins": 0.39637279510498047, - "rewards/rejected": -1.593489408493042, + "logits/chosen": -2.993680477142334, + "logits/rejected": -2.9794511795043945, + "logps/chosen": -185.45579528808594, + "logps/rejected": -223.18002319335938, + "loss": 0.5763, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2101439237594604, + "rewards/margins": 0.40528878569602966, + "rewards/rejected": -1.6154325008392334, "step": 4920 }, { "epoch": 0.85, - "grad_norm": 2.65625, + "grad_norm": 2.859375, "learning_rate": 3.369446330665918e-07, - "logits/chosen": -3.0306859016418457, - "logits/rejected": -3.021571397781372, - "logps/chosen": -184.3625946044922, - "logps/rejected": -214.49563598632812, - "loss": 0.6277, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2076117992401123, - "rewards/margins": 0.2767212390899658, - "rewards/rejected": -1.4843331575393677, + "logits/chosen": -3.025158405303955, + "logits/rejected": -3.0162906646728516, + "logps/chosen": -186.8469696044922, + "logps/rejected": -216.64041137695312, + "loss": 0.6315, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2324554920196533, + "rewards/margins": 0.2733253836631775, + "rewards/rejected": -1.505780816078186, "step": 4930 }, { "epoch": 0.85, - "grad_norm": 3.3125, + "grad_norm": 3.578125, "learning_rate": 3.2944284622310834e-07, - "logits/chosen": -3.0334010124206543, - "logits/rejected": -3.02193546295166, - "logps/chosen": -185.15042114257812, - "logps/rejected": -213.9960479736328, - "loss": 0.5907, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1906287670135498, - "rewards/margins": 0.35692232847213745, - "rewards/rejected": -1.5475513935089111, + "logits/chosen": -3.0310635566711426, + "logits/rejected": -3.019331216812134, + "logps/chosen": -186.7605438232422, + "logps/rejected": -215.85299682617188, + "loss": 0.5915, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2067301273345947, + "rewards/margins": 0.35939091444015503, + "rewards/rejected": -1.5661208629608154, "step": 4940 }, { "epoch": 0.85, - "grad_norm": 3.078125, + "grad_norm": 3.03125, "learning_rate": 3.220196183759855e-07, - "logits/chosen": -3.0085220336914062, - "logits/rejected": -2.9998950958251953, - "logps/chosen": -179.29806518554688, - "logps/rejected": -208.2992706298828, - "loss": 0.6052, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1302188634872437, - "rewards/margins": 0.317536324262619, - "rewards/rejected": -1.447755217552185, + "logits/chosen": -3.004424810409546, + "logits/rejected": -2.995706558227539, + "logps/chosen": -180.77694702148438, + "logps/rejected": -209.75, + "loss": 0.6066, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.145007848739624, + "rewards/margins": 0.31725460290908813, + "rewards/rejected": -1.4622623920440674, "step": 4950 }, { "epoch": 0.85, - "grad_norm": 3.125, + "grad_norm": 3.21875, "learning_rate": 3.146752181941834e-07, - "logits/chosen": -3.0223827362060547, - "logits/rejected": -3.0077877044677734, - "logps/chosen": -177.91940307617188, - "logps/rejected": -215.8114013671875, - "loss": 0.5797, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1578664779663086, - "rewards/margins": 0.36045825481414795, - "rewards/rejected": -1.518324613571167, + "logits/chosen": -3.0184290409088135, + "logits/rejected": -3.003715991973877, + "logps/chosen": -179.36331176757812, + "logps/rejected": -217.3169403076172, + "loss": 0.5828, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1723054647445679, + "rewards/margins": 0.3610745966434479, + "rewards/rejected": -1.5333800315856934, "step": 4960 }, { "epoch": 0.86, - "grad_norm": 3.25, + "grad_norm": 3.3125, "learning_rate": 3.074099114936491e-07, - "logits/chosen": -3.0086965560913086, - "logits/rejected": -2.997326374053955, - "logps/chosen": -178.9662628173828, - "logps/rejected": -213.8463897705078, - "loss": 0.5768, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.168075680732727, - "rewards/margins": 0.37543433904647827, - "rewards/rejected": -1.54351007938385, + "logits/chosen": -3.006310224533081, + "logits/rejected": -2.994600534439087, + "logps/chosen": -179.55746459960938, + "logps/rejected": -214.8990478515625, + "loss": 0.5762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.173987865447998, + "rewards/margins": 0.38004904985427856, + "rewards/rejected": -1.5540368556976318, "step": 4970 }, { "epoch": 0.86, - "grad_norm": 2.6875, + "grad_norm": 2.53125, "learning_rate": 3.002239612276991e-07, - "logits/chosen": -3.013916492462158, - "logits/rejected": -3.0011227130889893, - "logps/chosen": -181.9564208984375, - "logps/rejected": -211.95346069335938, - "loss": 0.5983, + "logits/chosen": -3.0120511054992676, + "logits/rejected": -2.998878002166748, + "logps/chosen": -181.58895874023438, + "logps/rejected": -212.9563446044922, + "loss": 0.594, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.177166223526001, - "rewards/margins": 0.3119356036186218, - "rewards/rejected": -1.489101767539978, + "rewards/chosen": -1.1734919548034668, + "rewards/margins": 0.32563871145248413, + "rewards/rejected": -1.4991306066513062, "step": 4980 }, { "epoch": 0.86, - "grad_norm": 2.78125, + "grad_norm": 2.90625, "learning_rate": 2.931176274775024e-07, - "logits/chosen": -3.015437126159668, - "logits/rejected": -3.0049312114715576, - "logps/chosen": -181.30780029296875, - "logps/rejected": -219.16586303710938, - "loss": 0.5788, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.192025899887085, - "rewards/margins": 0.36687594652175903, - "rewards/rejected": -1.5589020252227783, + "logits/chosen": -3.0127527713775635, + "logits/rejected": -3.002488374710083, + "logps/chosen": -182.4782257080078, + "logps/rejected": -219.88577270507812, + "loss": 0.5831, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2037298679351807, + "rewards/margins": 0.362371027469635, + "rewards/rejected": -1.5661009550094604, "step": 4990 }, { "epoch": 0.86, - "grad_norm": 3.265625, + "grad_norm": 3.03125, "learning_rate": 2.8609116744266586e-07, - "logits/chosen": -3.024904727935791, - "logits/rejected": -3.011610507965088, - "logps/chosen": -180.6025390625, - "logps/rejected": -209.94070434570312, - "loss": 0.5907, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1356312036514282, - "rewards/margins": 0.35062500834465027, - "rewards/rejected": -1.4862562417984009, + "logits/chosen": -3.022179126739502, + "logits/rejected": -3.0086216926574707, + "logps/chosen": -181.14414978027344, + "logps/rejected": -212.1620635986328, + "loss": 0.585, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1410472393035889, + "rewards/margins": 0.36742258071899414, + "rewards/rejected": -1.5084699392318726, "step": 5000 }, { "epoch": 0.86, - "eval_logits/chosen": -3.0259499549865723, - "eval_logits/rejected": -3.019639253616333, - "eval_logps/chosen": -167.130126953125, - "eval_logps/rejected": -187.4627227783203, - "eval_loss": 0.6482229828834534, - "eval_rewards/accuracies": 0.616403341293335, - "eval_rewards/chosen": -0.9572010636329651, - "eval_rewards/margins": 0.1663556545972824, - "eval_rewards/rejected": -1.1235567331314087, - "eval_runtime": 484.132, - "eval_samples_per_second": 8.89, + "eval_logits/chosen": -3.0225670337677, + "eval_logits/rejected": -3.01635479927063, + "eval_logps/chosen": -167.32522583007812, + "eval_logps/rejected": -188.31539916992188, + "eval_loss": 0.6460159420967102, + "eval_rewards/accuracies": 0.6201208233833313, + "eval_rewards/chosen": -0.9591519236564636, + "eval_rewards/margins": 0.17293164134025574, + "eval_rewards/rejected": -1.132083535194397, + "eval_runtime": 484.0695, + "eval_samples_per_second": 8.891, "eval_steps_per_second": 1.111, "step": 5000 }, { "epoch": 0.86, - "grad_norm": 2.78125, + "grad_norm": 2.8125, "learning_rate": 2.791448354319265e-07, - "logits/chosen": -3.0054192543029785, - "logits/rejected": -2.994859218597412, - "logps/chosen": -187.4678497314453, - "logps/rejected": -218.24655151367188, - "loss": 0.5867, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2418291568756104, - "rewards/margins": 0.3632369637489319, - "rewards/rejected": -1.6050662994384766, + "logits/chosen": -3.002469301223755, + "logits/rejected": -2.992302417755127, + "logps/chosen": -188.56805419921875, + "logps/rejected": -220.1179656982422, + "loss": 0.5861, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.252831220626831, + "rewards/margins": 0.3709490895271301, + "rewards/rejected": -1.6237804889678955, "step": 5010 }, { "epoch": 0.86, - "grad_norm": 2.90625, + "grad_norm": 2.84375, "learning_rate": 2.722788828539469e-07, - "logits/chosen": -2.994016170501709, - "logits/rejected": -2.982360363006592, - "logps/chosen": -179.8846893310547, - "logps/rejected": -211.01028442382812, - "loss": 0.5976, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.158063530921936, - "rewards/margins": 0.34929779171943665, - "rewards/rejected": -1.5073611736297607, + "logits/chosen": -2.9895589351654053, + "logits/rejected": -2.978393077850342, + "logps/chosen": -181.16273498535156, + "logps/rejected": -212.7776336669922, + "loss": 0.5968, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1708439588546753, + "rewards/margins": 0.3541907072067261, + "rewards/rejected": -1.5250345468521118, "step": 5020 }, { "epoch": 0.87, - "grad_norm": 3.96875, + "grad_norm": 4.21875, "learning_rate": 2.65493558208216e-07, - "logits/chosen": -3.016785144805908, - "logits/rejected": -3.006324052810669, - "logps/chosen": -185.47933959960938, - "logps/rejected": -212.0589141845703, - "loss": 0.6153, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2257567644119263, - "rewards/margins": 0.2794381380081177, - "rewards/rejected": -1.5051950216293335, + "logits/chosen": -3.0141069889068604, + "logits/rejected": -3.003230333328247, + "logps/chosen": -187.78970336914062, + "logps/rejected": -213.2385711669922, + "loss": 0.6226, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2488605976104736, + "rewards/margins": 0.2681311070919037, + "rewards/rejected": -1.5169916152954102, "step": 5030 }, { "epoch": 0.87, - "grad_norm": 2.875, + "grad_norm": 3.0, "learning_rate": 2.5878910707605535e-07, - "logits/chosen": -3.027489185333252, - "logits/rejected": -3.0190629959106445, - "logps/chosen": -192.54586791992188, - "logps/rejected": -210.1737823486328, - "loss": 0.619, + "logits/chosen": -3.023101568222046, + "logits/rejected": -3.015597105026245, + "logps/chosen": -193.99081420898438, + "logps/rejected": -211.8629608154297, + "loss": 0.6181, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2427799701690674, - "rewards/margins": 0.2671624720096588, - "rewards/rejected": -1.5099425315856934, + "rewards/chosen": -1.257229208946228, + "rewards/margins": 0.2696048617362976, + "rewards/rejected": -1.52683424949646, "step": 5040 }, { "epoch": 0.87, - "grad_norm": 2.203125, + "grad_norm": 2.359375, "learning_rate": 2.5216577211173045e-07, - "logits/chosen": -3.0162785053253174, - "logits/rejected": -3.010364532470703, - "logps/chosen": -186.3660125732422, - "logps/rejected": -210.4250030517578, - "loss": 0.6196, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1881041526794434, - "rewards/margins": 0.2793116867542267, - "rewards/rejected": -1.4674158096313477, + "logits/chosen": -3.0130486488342285, + "logits/rejected": -3.0070443153381348, + "logps/chosen": -187.1128692626953, + "logps/rejected": -210.8717041015625, + "loss": 0.6204, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1955726146697998, + "rewards/margins": 0.27631038427352905, + "rewards/rejected": -1.4718830585479736, "step": 5050 }, { "epoch": 0.87, - "grad_norm": 3.171875, + "grad_norm": 3.140625, "learning_rate": 2.4562379303366855e-07, - "logits/chosen": -2.9955532550811768, - "logits/rejected": -2.9888062477111816, - "logps/chosen": -182.958251953125, - "logps/rejected": -207.3423614501953, - "loss": 0.6383, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2245664596557617, - "rewards/margins": 0.2448711395263672, - "rewards/rejected": -1.469437599182129, + "logits/chosen": -2.9913368225097656, + "logits/rejected": -2.9847044944763184, + "logps/chosen": -184.39361572265625, + "logps/rejected": -209.1636962890625, + "loss": 0.6384, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2389198541641235, + "rewards/margins": 0.24873094260692596, + "rewards/rejected": -1.4876508712768555, "step": 5060 }, { "epoch": 0.87, - "grad_norm": 3.296875, + "grad_norm": 3.28125, "learning_rate": 2.39163406615783e-07, - "logits/chosen": -2.996875762939453, - "logits/rejected": -2.98545241355896, - "logps/chosen": -189.33526611328125, - "logps/rejected": -207.96871948242188, - "loss": 0.6364, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2511800527572632, - "rewards/margins": 0.2708195149898529, - "rewards/rejected": -1.5219995975494385, + "logits/chosen": -2.9937796592712402, + "logits/rejected": -2.9822850227355957, + "logps/chosen": -190.34483337402344, + "logps/rejected": -209.4861602783203, + "loss": 0.6359, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2612758874893188, + "rewards/margins": 0.2758980095386505, + "rewards/rejected": -1.5371739864349365, "step": 5070 }, { "epoch": 0.88, - "grad_norm": 2.703125, + "grad_norm": 2.671875, "learning_rate": 2.327848466789029e-07, - "logits/chosen": -3.036180257797241, - "logits/rejected": -3.02483868598938, - "logps/chosen": -182.4312744140625, - "logps/rejected": -206.91598510742188, - "loss": 0.5997, + "logits/chosen": -3.0329487323760986, + "logits/rejected": -3.0216927528381348, + "logps/chosen": -183.50765991210938, + "logps/rejected": -208.6122283935547, + "loss": 0.5977, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1447256803512573, - "rewards/margins": 0.3263944387435913, - "rewards/rejected": -1.4711202383041382, + "rewards/chosen": -1.1554895639419556, + "rewards/margins": 0.3325926661491394, + "rewards/rejected": -1.4880822896957397, "step": 5080 }, { "epoch": 0.88, - "grad_norm": 2.96875, + "grad_norm": 3.140625, "learning_rate": 2.2648834408231012e-07, - "logits/chosen": -3.031940221786499, - "logits/rejected": -3.02018404006958, - "logps/chosen": -178.73605346679688, - "logps/rejected": -205.97500610351562, - "loss": 0.5943, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1420974731445312, - "rewards/margins": 0.3264358937740326, - "rewards/rejected": -1.4685331583023071, + "logits/chosen": -3.027855157852173, + "logits/rejected": -3.01641845703125, + "logps/chosen": -180.16256713867188, + "logps/rejected": -208.2643585205078, + "loss": 0.5918, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.156362771987915, + "rewards/margins": 0.3350638747215271, + "rewards/rejected": -1.491426706314087, "step": 5090 }, { "epoch": 0.88, - "grad_norm": 4.0, + "grad_norm": 4.1875, "learning_rate": 2.2027412671538517e-07, - "logits/chosen": -3.007051944732666, - "logits/rejected": -3.0003175735473633, - "logps/chosen": -186.65670776367188, - "logps/rejected": -200.91976928710938, - "loss": 0.6534, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2080416679382324, - "rewards/margins": 0.1921565681695938, - "rewards/rejected": -1.400198221206665, + "logits/chosen": -3.003014326095581, + "logits/rejected": -2.9966347217559814, + "logps/chosen": -187.82479858398438, + "logps/rejected": -203.16989135742188, + "loss": 0.6478, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2197223901748657, + "rewards/margins": 0.20297710597515106, + "rewards/rejected": -1.4226996898651123, "step": 5100 }, { "epoch": 0.88, - "eval_logits/chosen": -3.0211427211761475, - "eval_logits/rejected": -3.014779806137085, - "eval_logps/chosen": -167.22410583496094, - "eval_logps/rejected": -187.57122802734375, - "eval_loss": 0.6480957865715027, - "eval_rewards/accuracies": 0.6154739856719971, - "eval_rewards/chosen": -0.9581407904624939, - "eval_rewards/margins": 0.1665009707212448, - "eval_rewards/rejected": -1.124642014503479, - "eval_runtime": 484.5249, - "eval_samples_per_second": 8.883, - "eval_steps_per_second": 1.11, + "eval_logits/chosen": -3.0212771892547607, + "eval_logits/rejected": -3.015050172805786, + "eval_logps/chosen": -167.4737091064453, + "eval_logps/rejected": -188.46954345703125, + "eval_loss": 0.6460275650024414, + "eval_rewards/accuracies": 0.6194238066673279, + "eval_rewards/chosen": -0.9606368541717529, + "eval_rewards/margins": 0.172988161444664, + "eval_rewards/rejected": -1.1336250305175781, + "eval_runtime": 483.893, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.112, "step": 5100 }, { "epoch": 0.88, - "grad_norm": 3.046875, + "grad_norm": 3.0, "learning_rate": 2.1414241948935822e-07, - "logits/chosen": -3.0215511322021484, - "logits/rejected": -3.0139107704162598, - "logps/chosen": -198.515625, - "logps/rejected": -212.1075897216797, - "loss": 0.6693, + "logits/chosen": -3.0174098014831543, + "logits/rejected": -3.010067939758301, + "logps/chosen": -200.5458526611328, + "logps/rejected": -214.78872680664062, + "loss": 0.6678, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2991710901260376, - "rewards/margins": 0.2032029926776886, - "rewards/rejected": -1.5023739337921143, + "rewards/chosen": -1.3194735050201416, + "rewards/margins": 0.20971199870109558, + "rewards/rejected": -1.5291855335235596, "step": 5110 }, { "epoch": 0.88, - "grad_norm": 3.5, + "grad_norm": 3.390625, "learning_rate": 2.0809344432916905e-07, - "logits/chosen": -3.020785093307495, - "logits/rejected": -3.013937473297119, - "logps/chosen": -184.99778747558594, - "logps/rejected": -204.39956665039062, - "loss": 0.6251, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2011207342147827, - "rewards/margins": 0.25702404975891113, - "rewards/rejected": -1.4581449031829834, + "logits/chosen": -3.0166664123535156, + "logits/rejected": -3.0095877647399902, + "logps/chosen": -186.00674438476562, + "logps/rejected": -206.6129150390625, + "loss": 0.622, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2112102508544922, + "rewards/margins": 0.26906818151474, + "rewards/rejected": -1.480278491973877, "step": 5120 }, { "epoch": 0.88, - "grad_norm": 2.640625, + "grad_norm": 2.484375, "learning_rate": 2.0212742016543468e-07, - "logits/chosen": -3.0052871704101562, - "logits/rejected": -3.000296115875244, - "logps/chosen": -186.18795776367188, - "logps/rejected": -210.3855743408203, - "loss": 0.5988, + "logits/chosen": -3.002864360809326, + "logits/rejected": -2.9982025623321533, + "logps/chosen": -187.5155487060547, + "logps/rejected": -212.5284423828125, + "loss": 0.5957, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2200263738632202, - "rewards/margins": 0.30521923303604126, - "rewards/rejected": -1.5252454280853271, + "rewards/chosen": -1.233302354812622, + "rewards/margins": 0.3133721947669983, + "rewards/rejected": -1.5466746091842651, "step": 5130 }, { "epoch": 0.89, - "grad_norm": 2.984375, + "grad_norm": 2.90625, "learning_rate": 1.9624456292652667e-07, - "logits/chosen": -3.0031652450561523, - "logits/rejected": -3.0017037391662598, - "logps/chosen": -193.90243530273438, - "logps/rejected": -201.4404754638672, - "loss": 0.6729, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2456893920898438, - "rewards/margins": 0.16400747001171112, - "rewards/rejected": -1.4096968173980713, + "logits/chosen": -3.001178026199341, + "logits/rejected": -2.999664545059204, + "logps/chosen": -194.84710693359375, + "logps/rejected": -202.6844482421875, + "loss": 0.6722, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2551358938217163, + "rewards/margins": 0.16700053215026855, + "rewards/rejected": -1.4221365451812744, "step": 5140 }, { "epoch": 0.89, - "grad_norm": 3.25, + "grad_norm": 3.359375, "learning_rate": 1.9044508553075436e-07, - "logits/chosen": -3.016469955444336, - "logits/rejected": -3.0100483894348145, - "logps/chosen": -185.59703063964844, - "logps/rejected": -211.82931518554688, - "loss": 0.6145, + "logits/chosen": -3.0168654918670654, + "logits/rejected": -3.0097479820251465, + "logps/chosen": -186.14413452148438, + "logps/rejected": -213.52523803710938, + "loss": 0.6104, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.218392252922058, - "rewards/margins": 0.2722601890563965, - "rewards/rejected": -1.4906524419784546, + "rewards/chosen": -1.2238632440567017, + "rewards/margins": 0.2837482988834381, + "rewards/rejected": -1.5076117515563965, "step": 5150 }, { "epoch": 0.89, - "grad_norm": 3.953125, + "grad_norm": 3.65625, "learning_rate": 1.8472919787865971e-07, - "logits/chosen": -3.016695976257324, - "logits/rejected": -3.016082286834717, - "logps/chosen": -183.193359375, - "logps/rejected": -199.03392028808594, - "loss": 0.6535, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2164713144302368, - "rewards/margins": 0.2021493911743164, - "rewards/rejected": -1.4186207056045532, + "logits/chosen": -3.012164354324341, + "logits/rejected": -3.0114898681640625, + "logps/chosen": -184.64852905273438, + "logps/rejected": -200.6281280517578, + "loss": 0.654, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2310230731964111, + "rewards/margins": 0.20353946089744568, + "rewards/rejected": -1.4345625638961792, "step": 5160 }, { "epoch": 0.89, - "grad_norm": 2.625, + "grad_norm": 2.46875, "learning_rate": 1.7909710684542225e-07, - "logits/chosen": -3.0045788288116455, - "logits/rejected": -2.9934210777282715, - "logps/chosen": -185.44113159179688, - "logps/rejected": -215.13961791992188, - "loss": 0.5852, + "logits/chosen": -3.0000133514404297, + "logits/rejected": -2.9884800910949707, + "logps/chosen": -187.14547729492188, + "logps/rejected": -217.1771697998047, + "loss": 0.5846, "rewards/accuracies": 0.75, - "rewards/chosen": -1.1757686138153076, - "rewards/margins": 0.3737691342830658, - "rewards/rejected": -1.5495378971099854, + "rewards/chosen": -1.1928120851516724, + "rewards/margins": 0.3771010637283325, + "rewards/rejected": -1.5699129104614258, "step": 5170 }, { "epoch": 0.89, - "grad_norm": 3.625, + "grad_norm": 3.390625, "learning_rate": 1.735490162733658e-07, - "logits/chosen": -3.0251495838165283, - "logits/rejected": -3.017005443572998, - "logps/chosen": -189.4347686767578, - "logps/rejected": -210.934326171875, - "loss": 0.629, + "logits/chosen": -3.0229477882385254, + "logits/rejected": -3.0149283409118652, + "logps/chosen": -190.2616729736328, + "logps/rejected": -212.183349609375, + "loss": 0.631, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.235618233680725, - "rewards/margins": 0.23946666717529297, - "rewards/rejected": -1.475084900856018, + "rewards/chosen": -1.243887186050415, + "rewards/margins": 0.2436877191066742, + "rewards/rejected": -1.4875750541687012, "step": 5180 }, { "epoch": 0.89, - "grad_norm": 2.859375, + "grad_norm": 3.015625, "learning_rate": 1.6808512696458862e-07, - "logits/chosen": -3.014350175857544, - "logits/rejected": -3.0113131999969482, - "logps/chosen": -184.97525024414062, - "logps/rejected": -206.00265502929688, - "loss": 0.6419, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2191311120986938, - "rewards/margins": 0.2178182601928711, - "rewards/rejected": -1.4369492530822754, + "logits/chosen": -3.0097222328186035, + "logits/rejected": -3.0068881511688232, + "logps/chosen": -187.0318145751953, + "logps/rejected": -207.45431518554688, + "loss": 0.6469, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2396968603134155, + "rewards/margins": 0.21176902949810028, + "rewards/rejected": -1.4514659643173218, "step": 5190 }, { "epoch": 0.9, - "grad_norm": 2.609375, + "grad_norm": 3.234375, "learning_rate": 1.6270563667368872e-07, - "logits/chosen": -3.0289926528930664, - "logits/rejected": -3.0216994285583496, - "logps/chosen": -181.39625549316406, - "logps/rejected": -210.9692840576172, - "loss": 0.5973, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1923518180847168, - "rewards/margins": 0.30694520473480225, - "rewards/rejected": -1.4992971420288086, + "logits/chosen": -3.0239500999450684, + "logits/rejected": -3.0173819065093994, + "logps/chosen": -183.49264526367188, + "logps/rejected": -212.5428924560547, + "loss": 0.6018, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.213315725326538, + "rewards/margins": 0.3017173409461975, + "rewards/rejected": -1.5150331258773804, "step": 5200 }, { "epoch": 0.9, - "eval_logits/chosen": -3.0193612575531006, - "eval_logits/rejected": -3.0130417346954346, - "eval_logps/chosen": -166.88232421875, - "eval_logps/rejected": -187.1678924560547, - "eval_loss": 0.6482971906661987, - "eval_rewards/accuracies": 0.6168680191040039, - "eval_rewards/chosen": -0.9547229409217834, - "eval_rewards/margins": 0.16588544845581055, - "eval_rewards/rejected": -1.1206083297729492, - "eval_runtime": 483.9931, - "eval_samples_per_second": 8.893, + "eval_logits/chosen": -3.016746997833252, + "eval_logits/rejected": -3.0105228424072266, + "eval_logps/chosen": -167.12588500976562, + "eval_logps/rejected": -188.0692138671875, + "eval_loss": 0.6461666226387024, + "eval_rewards/accuracies": 0.6205855011940002, + "eval_rewards/chosen": -0.9571587443351746, + "eval_rewards/margins": 0.17246277630329132, + "eval_rewards/rejected": -1.1296215057373047, + "eval_runtime": 483.6852, + "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 5200 }, { "epoch": 0.9, - "grad_norm": 2.5, + "grad_norm": 2.421875, "learning_rate": 1.5741074010061252e-07, - "logits/chosen": -3.009269952774048, - "logits/rejected": -3.0040059089660645, - "logps/chosen": -183.85488891601562, - "logps/rejected": -201.80844116210938, - "loss": 0.6429, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2172787189483643, - "rewards/margins": 0.21514368057250977, - "rewards/rejected": -1.432422399520874, + "logits/chosen": -3.0030903816223145, + "logits/rejected": -2.9979660511016846, + "logps/chosen": -184.4320831298828, + "logps/rejected": -202.42068481445312, + "loss": 0.6448, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.223050594329834, + "rewards/margins": 0.21549400687217712, + "rewards/rejected": -1.438544750213623, "step": 5210 }, { "epoch": 0.9, - "grad_norm": 2.453125, + "grad_norm": 2.578125, "learning_rate": 1.5220062888360172e-07, - "logits/chosen": -3.026101589202881, - "logits/rejected": -3.0166168212890625, - "logps/chosen": -176.77809143066406, - "logps/rejected": -197.70420837402344, - "loss": 0.6466, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1455947160720825, - "rewards/margins": 0.19701746106147766, - "rewards/rejected": -1.3426120281219482, + "logits/chosen": -3.02173113822937, + "logits/rejected": -3.0115773677825928, + "logps/chosen": -178.15750122070312, + "logps/rejected": -200.08419799804688, + "loss": 0.6451, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.159388780593872, + "rewards/margins": 0.20702338218688965, + "rewards/rejected": -1.3664120435714722, "step": 5220 }, { "epoch": 0.9, - "grad_norm": 2.703125, + "grad_norm": 2.890625, "learning_rate": 1.4707549159226425e-07, - "logits/chosen": -3.0132172107696533, - "logits/rejected": -3.0083303451538086, - "logps/chosen": -184.35971069335938, - "logps/rejected": -216.65420532226562, - "loss": 0.5877, + "logits/chosen": -3.0073437690734863, + "logits/rejected": -3.002361297607422, + "logps/chosen": -185.7303466796875, + "logps/rejected": -219.1875762939453, + "loss": 0.5854, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1685558557510376, - "rewards/margins": 0.3556649088859558, - "rewards/rejected": -1.5242207050323486, + "rewards/chosen": -1.1822620630264282, + "rewards/margins": 0.3672925531864166, + "rewards/rejected": -1.549554467201233, "step": 5230 }, { "epoch": 0.9, - "grad_norm": 2.875, + "grad_norm": 2.9375, "learning_rate": 1.4203551372074382e-07, - "logits/chosen": -2.995349884033203, - "logits/rejected": -2.9813692569732666, - "logps/chosen": -186.01040649414062, - "logps/rejected": -228.42507934570312, - "loss": 0.5638, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2257850170135498, - "rewards/margins": 0.4474519193172455, - "rewards/rejected": -1.6732368469238281, + "logits/chosen": -2.993562698364258, + "logits/rejected": -2.979778289794922, + "logps/chosen": -186.5539093017578, + "logps/rejected": -229.2770233154297, + "loss": 0.5627, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.231220006942749, + "rewards/margins": 0.4505365788936615, + "rewards/rejected": -1.681756615638733, "step": 5240 }, { "epoch": 0.9, - "grad_norm": 3.390625, + "grad_norm": 3.5, "learning_rate": 1.3708087768100897e-07, - "logits/chosen": -2.9991250038146973, - "logits/rejected": -2.9920661449432373, - "logps/chosen": -179.68421936035156, - "logps/rejected": -210.48654174804688, - "loss": 0.5901, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1758867502212524, - "rewards/margins": 0.33366623520851135, - "rewards/rejected": -1.5095527172088623, + "logits/chosen": -2.997523546218872, + "logits/rejected": -2.9901375770568848, + "logps/chosen": -180.35211181640625, + "logps/rejected": -211.8171844482422, + "loss": 0.5884, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1825653314590454, + "rewards/margins": 0.3402937054634094, + "rewards/rejected": -1.52285897731781, "step": 5250 }, { "epoch": 0.91, - "grad_norm": 3.4375, + "grad_norm": 3.3125, "learning_rate": 1.3221176279625047e-07, - "logits/chosen": -3.015169143676758, - "logits/rejected": -3.007452964782715, - "logps/chosen": -179.83773803710938, - "logps/rejected": -202.9173126220703, - "loss": 0.6024, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1293582916259766, - "rewards/margins": 0.3117751479148865, - "rewards/rejected": -1.4411332607269287, + "logits/chosen": -3.0125765800476074, + "logits/rejected": -3.005173444747925, + "logps/chosen": -180.85592651367188, + "logps/rejected": -203.88833618164062, + "loss": 0.6035, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.139540195465088, + "rewards/margins": 0.3113035261631012, + "rewards/rejected": -1.4508435726165771, "step": 5260 }, { "epoch": 0.91, - "grad_norm": 2.640625, + "grad_norm": 2.671875, "learning_rate": 1.2742834529439112e-07, - "logits/chosen": -3.0302436351776123, - "logits/rejected": -3.025125741958618, - "logps/chosen": -181.14987182617188, - "logps/rejected": -208.68142700195312, - "loss": 0.6103, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1812187433242798, - "rewards/margins": 0.3093191087245941, - "rewards/rejected": -1.4905378818511963, + "logits/chosen": -3.024702548980713, + "logits/rejected": -3.0192930698394775, + "logps/chosen": -182.72805786132812, + "logps/rejected": -210.27511596679688, + "loss": 0.6119, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.197000503540039, + "rewards/margins": 0.30947452783584595, + "rewards/rejected": -1.5064748525619507, "step": 5270 }, { "epoch": 0.91, - "grad_norm": 3.203125, + "grad_norm": 3.078125, "learning_rate": 1.2273079830170787e-07, - "logits/chosen": -3.0149941444396973, - "logits/rejected": -3.0027318000793457, - "logps/chosen": -191.20547485351562, - "logps/rejected": -210.00521850585938, - "loss": 0.636, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2669647932052612, - "rewards/margins": 0.2477412223815918, - "rewards/rejected": -1.514706015586853, + "logits/chosen": -3.0090789794921875, + "logits/rejected": -2.996187925338745, + "logps/chosen": -193.37173461914062, + "logps/rejected": -212.6551055908203, + "loss": 0.6347, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2886271476745605, + "rewards/margins": 0.25257769227027893, + "rewards/rejected": -1.5412046909332275, "step": 5280 }, { "epoch": 0.91, - "grad_norm": 3.34375, + "grad_norm": 3.109375, "learning_rate": 1.181192918365645e-07, - "logits/chosen": -3.0035336017608643, - "logits/rejected": -2.9970505237579346, - "logps/chosen": -185.7930908203125, - "logps/rejected": -204.4087677001953, - "loss": 0.6191, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1858208179473877, - "rewards/margins": 0.2634347379207611, - "rewards/rejected": -1.4492554664611816, + "logits/chosen": -2.9993698596954346, + "logits/rejected": -2.9934823513031006, + "logps/chosen": -187.09957885742188, + "logps/rejected": -205.87130737304688, + "loss": 0.6187, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1988855600357056, + "rewards/margins": 0.26499563455581665, + "rewards/rejected": -1.463881254196167, "step": 5290 }, { "epoch": 0.91, - "grad_norm": 3.15625, + "grad_norm": 3.34375, "learning_rate": 1.1359399280326034e-07, - "logits/chosen": -3.012399911880493, - "logits/rejected": -3.002013683319092, - "logps/chosen": -186.54757690429688, - "logps/rejected": -213.6743927001953, - "loss": 0.5975, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1939321756362915, - "rewards/margins": 0.3121901750564575, - "rewards/rejected": -1.506122350692749, + "logits/chosen": -3.008162021636963, + "logits/rejected": -2.9974722862243652, + "logps/chosen": -188.11373901367188, + "logps/rejected": -216.1291046142578, + "loss": 0.5963, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2095937728881836, + "rewards/margins": 0.32107558846473694, + "rewards/rejected": -1.5306695699691772, "step": 5300 }, { "epoch": 0.91, - "eval_logits/chosen": -3.024763822555542, - "eval_logits/rejected": -3.0184710025787354, - "eval_logps/chosen": -166.61180114746094, - "eval_logps/rejected": -186.8758544921875, - "eval_loss": 0.6482287645339966, - "eval_rewards/accuracies": 0.6161710023880005, - "eval_rewards/chosen": -0.9520178437232971, - "eval_rewards/margins": 0.16567029058933258, - "eval_rewards/rejected": -1.1176881790161133, - "eval_runtime": 484.3937, - "eval_samples_per_second": 8.885, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.022860288619995, + "eval_logits/rejected": -3.0166664123535156, + "eval_logps/chosen": -167.0541229248047, + "eval_logps/rejected": -187.92852783203125, + "eval_loss": 0.6464575529098511, + "eval_rewards/accuracies": 0.6198884844779968, + "eval_rewards/chosen": -0.9564412236213684, + "eval_rewards/margins": 0.17177370190620422, + "eval_rewards/rejected": -1.128214955329895, + "eval_runtime": 483.9372, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 5300 }, { "epoch": 0.91, - "grad_norm": 2.5625, + "grad_norm": 2.59375, "learning_rate": 1.0915506498598711e-07, - "logits/chosen": -3.013671398162842, - "logits/rejected": -3.0129668712615967, - "logps/chosen": -194.44668579101562, - "logps/rejected": -210.31838989257812, - "loss": 0.6345, + "logits/chosen": -3.0100769996643066, + "logits/rejected": -3.0087945461273193, + "logps/chosen": -195.86776733398438, + "logps/rejected": -212.3857421875, + "loss": 0.6352, "rewards/accuracies": 0.625, - "rewards/chosen": -1.260506272315979, - "rewards/margins": 0.2346612960100174, - "rewards/rejected": -1.4951674938201904, + "rewards/chosen": -1.274717092514038, + "rewards/margins": 0.24112406373023987, + "rewards/rejected": -1.5158412456512451, "step": 5310 }, { "epoch": 0.92, - "grad_norm": 3.171875, + "grad_norm": 3.03125, "learning_rate": 1.0480266904290298e-07, - "logits/chosen": -3.0138745307922363, - "logits/rejected": -3.006173610687256, - "logps/chosen": -189.1534423828125, - "logps/rejected": -209.4146728515625, - "loss": 0.6066, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2044683694839478, - "rewards/margins": 0.31247133016586304, - "rewards/rejected": -1.516939640045166, + "logits/chosen": -3.0121946334838867, + "logits/rejected": -3.0049402713775635, + "logps/chosen": -190.0812530517578, + "logps/rejected": -210.83505249023438, + "loss": 0.6051, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2137463092803955, + "rewards/margins": 0.3173975646495819, + "rewards/rejected": -1.5311439037322998, "step": 5320 }, { "epoch": 0.92, - "grad_norm": 3.078125, + "grad_norm": 2.96875, "learning_rate": 1.0053696250031803e-07, - "logits/chosen": -3.0049405097961426, - "logits/rejected": -2.997436761856079, - "logps/chosen": -179.34432983398438, - "logps/rejected": -219.2215576171875, - "loss": 0.5791, + "logits/chosen": -3.0021157264709473, + "logits/rejected": -2.9936251640319824, + "logps/chosen": -179.1097869873047, + "logps/rejected": -220.5721435546875, + "loss": 0.5729, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1866565942764282, - "rewards/margins": 0.40371227264404297, - "rewards/rejected": -1.5903689861297607, + "rewards/chosen": -1.1843111515045166, + "rewards/margins": 0.4195634722709656, + "rewards/rejected": -1.6038745641708374, "step": 5330 }, { "epoch": 0.92, - "grad_norm": 3.140625, + "grad_norm": 3.109375, "learning_rate": 9.635809974698929e-08, - "logits/chosen": -3.023958921432495, - "logits/rejected": -3.018794536590576, - "logps/chosen": -181.73666381835938, - "logps/rejected": -202.86709594726562, - "loss": 0.5928, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1352908611297607, - "rewards/margins": 0.2984369397163391, - "rewards/rejected": -1.4337279796600342, + "logits/chosen": -3.019029140472412, + "logits/rejected": -3.0142836570739746, + "logps/chosen": -183.40927124023438, + "logps/rejected": -204.5322723388672, + "loss": 0.5946, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1520172357559204, + "rewards/margins": 0.29836225509643555, + "rewards/rejected": -1.4503793716430664, "step": 5340 }, { "epoch": 0.92, - "grad_norm": 2.234375, + "grad_norm": 2.46875, "learning_rate": 9.22662320285389e-08, - "logits/chosen": -3.009406089782715, - "logits/rejected": -3.0080466270446777, - "logps/chosen": -182.8820037841797, - "logps/rejected": -203.56088256835938, - "loss": 0.6348, + "logits/chosen": -3.005446672439575, + "logits/rejected": -3.004295825958252, + "logps/chosen": -184.45816040039062, + "logps/rejected": -205.4375762939453, + "loss": 0.6337, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1950762271881104, - "rewards/margins": 0.252188503742218, - "rewards/rejected": -1.4472649097442627, + "rewards/chosen": -1.210837483406067, + "rewards/margins": 0.25519412755966187, + "rewards/rejected": -1.4660316705703735, "step": 5350 }, { "epoch": 0.92, - "grad_norm": 2.890625, + "grad_norm": 2.84375, "learning_rate": 8.826150744197403e-08, - "logits/chosen": -3.0183300971984863, - "logits/rejected": -3.008608341217041, - "logps/chosen": -185.93148803710938, - "logps/rejected": -219.19528198242188, - "loss": 0.6033, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2144702672958374, - "rewards/margins": 0.33122771978378296, - "rewards/rejected": -1.5456980466842651, + "logits/chosen": -3.0155885219573975, + "logits/rejected": -3.0053577423095703, + "logps/chosen": -187.87252807617188, + "logps/rejected": -221.6996307373047, + "loss": 0.6039, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2338807582855225, + "rewards/margins": 0.3368605971336365, + "rewards/rejected": -1.5707414150238037, "step": 5360 }, { "epoch": 0.93, - "grad_norm": 3.296875, + "grad_norm": 3.171875, "learning_rate": 8.434407093033225e-08, - "logits/chosen": -3.0214178562164307, - "logits/rejected": -3.0202929973602295, - "logps/chosen": -179.71841430664062, - "logps/rejected": -200.45217895507812, - "loss": 0.6364, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1635373830795288, - "rewards/margins": 0.2255493402481079, - "rewards/rejected": -1.3890868425369263, + "logits/chosen": -3.018979549407959, + "logits/rejected": -3.0176680088043213, + "logps/chosen": -180.98257446289062, + "logps/rejected": -201.25579833984375, + "loss": 0.6407, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.176179051399231, + "rewards/margins": 0.22094354033470154, + "rewards/rejected": -1.3971226215362549, "step": 5370 }, { "epoch": 0.93, - "grad_norm": 3.234375, + "grad_norm": 3.03125, "learning_rate": 8.051406427743047e-08, - "logits/chosen": -3.0340845584869385, - "logits/rejected": -3.02712082862854, - "logps/chosen": -187.09100341796875, - "logps/rejected": -205.0014190673828, - "loss": 0.6099, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1768646240234375, - "rewards/margins": 0.2828957140445709, - "rewards/rejected": -1.459760308265686, + "logits/chosen": -3.0299999713897705, + "logits/rejected": -3.0229012966156006, + "logps/chosen": -188.4458770751953, + "logps/rejected": -206.90756225585938, + "loss": 0.6086, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.190413475036621, + "rewards/margins": 0.28840866684913635, + "rewards/rejected": -1.4788219928741455, "step": 5380 }, { "epoch": 0.93, - "grad_norm": 2.625, + "grad_norm": 2.453125, "learning_rate": 7.677162610273819e-08, - "logits/chosen": -3.0002315044403076, - "logits/rejected": -2.9908108711242676, - "logps/chosen": -190.63095092773438, - "logps/rejected": -211.54226684570312, - "loss": 0.6124, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.222454309463501, - "rewards/margins": 0.29625049233436584, - "rewards/rejected": -1.5187046527862549, + "logits/chosen": -2.9983856678009033, + "logits/rejected": -2.9884772300720215, + "logps/chosen": -191.33926391601562, + "logps/rejected": -213.33740234375, + "loss": 0.6093, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2295373678207397, + "rewards/margins": 0.3071189522743225, + "rewards/rejected": -1.536656379699707, "step": 5390 }, { "epoch": 0.93, - "grad_norm": 4.53125, + "grad_norm": 4.375, "learning_rate": 7.311689185635573e-08, - "logits/chosen": -3.0046777725219727, - "logits/rejected": -2.9948372840881348, - "logps/chosen": -178.56069946289062, - "logps/rejected": -214.04733276367188, - "loss": 0.5986, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.168642282485962, - "rewards/margins": 0.35686975717544556, - "rewards/rejected": -1.5255119800567627, + "logits/chosen": -3.0020601749420166, + "logits/rejected": -2.992455005645752, + "logps/chosen": -179.33995056152344, + "logps/rejected": -216.6887664794922, + "loss": 0.5921, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.176435112953186, + "rewards/margins": 0.3754914402961731, + "rewards/rejected": -1.5519264936447144, "step": 5400 }, { "epoch": 0.93, - "eval_logits/chosen": -3.024919033050537, - "eval_logits/rejected": -3.0186076164245605, - "eval_logps/chosen": -166.65017700195312, - "eval_logps/rejected": -186.89283752441406, - "eval_loss": 0.6482614278793335, - "eval_rewards/accuracies": 0.6189591288566589, - "eval_rewards/chosen": -0.9524016380310059, - "eval_rewards/margins": 0.16545623540878296, - "eval_rewards/rejected": -1.117857813835144, - "eval_runtime": 484.7866, - "eval_samples_per_second": 8.878, - "eval_steps_per_second": 1.11, + "eval_logits/chosen": -3.0195529460906982, + "eval_logits/rejected": -3.0133368968963623, + "eval_logps/chosen": -167.09963989257812, + "eval_logps/rejected": -188.02737426757812, + "eval_loss": 0.6461929678916931, + "eval_rewards/accuracies": 0.6198884844779968, + "eval_rewards/chosen": -0.9568961262702942, + "eval_rewards/margins": 0.17230701446533203, + "eval_rewards/rejected": -1.129203200340271, + "eval_runtime": 483.9337, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 5400 }, { "epoch": 0.93, - "grad_norm": 4.1875, + "grad_norm": 4.25, "learning_rate": 6.954999381411642e-08, - "logits/chosen": -3.0219473838806152, - "logits/rejected": -3.0172617435455322, - "logps/chosen": -195.20822143554688, - "logps/rejected": -205.44821166992188, - "loss": 0.683, + "logits/chosen": -3.018542766571045, + "logits/rejected": -3.0133025646209717, + "logps/chosen": -197.008056640625, + "logps/rejected": -207.00283813476562, + "loss": 0.6865, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2699486017227173, - "rewards/margins": 0.14408110082149506, - "rewards/rejected": -1.4140297174453735, + "rewards/chosen": -1.2879469394683838, + "rewards/margins": 0.1416289359331131, + "rewards/rejected": -1.429575800895691, "step": 5410 }, { "epoch": 0.93, - "grad_norm": 3.15625, + "grad_norm": 3.3125, "learning_rate": 6.607106107279604e-08, - "logits/chosen": -3.018432140350342, - "logits/rejected": -3.015479326248169, - "logps/chosen": -189.63400268554688, - "logps/rejected": -209.18264770507812, - "loss": 0.6452, + "logits/chosen": -3.0142135620117188, + "logits/rejected": -3.010924816131592, + "logps/chosen": -190.42416381835938, + "logps/rejected": -209.90029907226562, + "loss": 0.6485, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2422764301300049, - "rewards/margins": 0.22592100501060486, - "rewards/rejected": -1.4681974649429321, + "rewards/chosen": -1.2501780986785889, + "rewards/margins": 0.2251962423324585, + "rewards/rejected": -1.4753742218017578, "step": 5420 }, { "epoch": 0.94, - "grad_norm": 2.8125, + "grad_norm": 3.203125, "learning_rate": 6.268021954544095e-08, - "logits/chosen": -3.0200390815734863, - "logits/rejected": -3.0088791847229004, - "logps/chosen": -185.31239318847656, - "logps/rejected": -213.2115020751953, - "loss": 0.6008, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2222305536270142, - "rewards/margins": 0.3227187991142273, - "rewards/rejected": -1.5449492931365967, + "logits/chosen": -3.016510486602783, + "logits/rejected": -3.0054919719696045, + "logps/chosen": -187.24249267578125, + "logps/rejected": -215.45034790039062, + "loss": 0.6014, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2415316104888916, + "rewards/margins": 0.3258061408996582, + "rewards/rejected": -1.5673377513885498, "step": 5430 }, { "epoch": 0.94, - "grad_norm": 2.984375, + "grad_norm": 2.890625, "learning_rate": 5.9377591956812364e-08, - "logits/chosen": -3.0067832469940186, - "logits/rejected": -2.9997334480285645, - "logps/chosen": -186.7572479248047, - "logps/rejected": -213.2431182861328, - "loss": 0.6082, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.202883005142212, - "rewards/margins": 0.30217069387435913, - "rewards/rejected": -1.5050535202026367, + "logits/chosen": -3.003411054611206, + "logits/rejected": -2.9964089393615723, + "logps/chosen": -188.05709838867188, + "logps/rejected": -215.2018585205078, + "loss": 0.6064, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.21588134765625, + "rewards/margins": 0.3087596893310547, + "rewards/rejected": -1.5246409177780151, "step": 5440 }, { "epoch": 0.94, - "grad_norm": 5.1875, + "grad_norm": 5.125, "learning_rate": 5.6163297838942866e-08, - "logits/chosen": -3.0089011192321777, - "logits/rejected": -3.0013954639434814, - "logps/chosen": -186.43460083007812, - "logps/rejected": -208.15872192382812, - "loss": 0.6253, + "logits/chosen": -3.0072951316833496, + "logits/rejected": -3.0005078315734863, + "logps/chosen": -187.33633422851562, + "logps/rejected": -208.70126342773438, + "loss": 0.6291, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1967378854751587, - "rewards/margins": 0.26602086424827576, - "rewards/rejected": -1.4627587795257568, + "rewards/chosen": -1.2057554721832275, + "rewards/margins": 0.26242905855178833, + "rewards/rejected": -1.4681843519210815, "step": 5450 }, { "epoch": 0.94, - "grad_norm": 2.953125, + "grad_norm": 2.859375, "learning_rate": 5.30374535268105e-08, - "logits/chosen": -3.0090737342834473, - "logits/rejected": -2.99898624420166, - "logps/chosen": -185.4371337890625, - "logps/rejected": -203.43051147460938, - "loss": 0.6304, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2107499837875366, - "rewards/margins": 0.21922187507152557, - "rewards/rejected": -1.4299719333648682, + "logits/chosen": -3.0079214572906494, + "logits/rejected": -2.997767448425293, + "logps/chosen": -185.90853881835938, + "logps/rejected": -203.79953002929688, + "loss": 0.6331, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2154641151428223, + "rewards/margins": 0.21819797158241272, + "rewards/rejected": -1.4336621761322021, "step": 5460 }, { "epoch": 0.94, - "grad_norm": 2.96875, + "grad_norm": 3.03125, "learning_rate": 5.0000172154129887e-08, - "logits/chosen": -3.0085060596466064, - "logits/rejected": -3.007678508758545, - "logps/chosen": -184.86740112304688, - "logps/rejected": -201.18594360351562, - "loss": 0.6685, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2326360940933228, - "rewards/margins": 0.18991947174072266, - "rewards/rejected": -1.4225553274154663, + "logits/chosen": -3.0055441856384277, + "logits/rejected": -3.005470037460327, + "logps/chosen": -186.6962432861328, + "logps/rejected": -203.15626525878906, + "loss": 0.6712, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2509243488311768, + "rewards/margins": 0.1913340985774994, + "rewards/rejected": -1.4422584772109985, "step": 5470 }, { "epoch": 0.94, - "grad_norm": 2.625, + "grad_norm": 2.671875, "learning_rate": 4.705156364925467e-08, - "logits/chosen": -2.9995064735412598, - "logits/rejected": -2.986332654953003, - "logps/chosen": -181.80081176757812, - "logps/rejected": -213.63790893554688, - "loss": 0.579, + "logits/chosen": -2.9974427223205566, + "logits/rejected": -2.98425555229187, + "logps/chosen": -182.87705993652344, + "logps/rejected": -215.0094757080078, + "loss": 0.5791, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1795815229415894, - "rewards/margins": 0.3651406764984131, - "rewards/rejected": -1.5447221994400024, + "rewards/chosen": -1.190344214439392, + "rewards/margins": 0.36809319257736206, + "rewards/rejected": -1.5584375858306885, "step": 5480 }, { "epoch": 0.95, - "grad_norm": 2.75, + "grad_norm": 2.78125, "learning_rate": 4.419173473120236e-08, - "logits/chosen": -2.9968180656433105, - "logits/rejected": -2.98726749420166, - "logps/chosen": -181.09336853027344, - "logps/rejected": -198.72418212890625, - "loss": 0.6289, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1412298679351807, - "rewards/margins": 0.24370375275611877, - "rewards/rejected": -1.3849337100982666, + "logits/chosen": -2.9959757328033447, + "logits/rejected": -2.9869399070739746, + "logps/chosen": -183.10800170898438, + "logps/rejected": -200.63238525390625, + "loss": 0.6302, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1613762378692627, + "rewards/margins": 0.24263951182365417, + "rewards/rejected": -1.4040155410766602, "step": 5490 }, { "epoch": 0.95, - "grad_norm": 2.796875, + "grad_norm": 3.109375, "learning_rate": 4.142078890578827e-08, - "logits/chosen": -3.0323545932769775, - "logits/rejected": -3.0259850025177, - "logps/chosen": -179.64572143554688, - "logps/rejected": -209.1461944580078, - "loss": 0.6025, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1536697149276733, - "rewards/margins": 0.31138697266578674, - "rewards/rejected": -1.4650566577911377, + "logits/chosen": -3.0294241905212402, + "logits/rejected": -3.022934913635254, + "logps/chosen": -181.2027587890625, + "logps/rejected": -211.42294311523438, + "loss": 0.6015, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.169240117073059, + "rewards/margins": 0.318584144115448, + "rewards/rejected": -1.4878242015838623, "step": 5500 }, { "epoch": 0.95, - "eval_logits/chosen": -3.0252060890197754, - "eval_logits/rejected": -3.0189244747161865, - "eval_logps/chosen": -166.7466583251953, - "eval_logps/rejected": -186.9980010986328, - "eval_loss": 0.6482976078987122, - "eval_rewards/accuracies": 0.6168680191040039, - "eval_rewards/chosen": -0.9533662796020508, - "eval_rewards/margins": 0.16554316878318787, - "eval_rewards/rejected": -1.118909478187561, - "eval_runtime": 485.7241, - "eval_samples_per_second": 8.861, - "eval_steps_per_second": 1.108, + "eval_logits/chosen": -3.0226480960845947, + "eval_logits/rejected": -3.016446352005005, + "eval_logps/chosen": -167.10562133789062, + "eval_logps/rejected": -188.0281982421875, + "eval_loss": 0.6462621092796326, + "eval_rewards/accuracies": 0.6191914677619934, + "eval_rewards/chosen": -0.9569559097290039, + "eval_rewards/margins": 0.17225554585456848, + "eval_rewards/rejected": -1.1292115449905396, + "eval_runtime": 484.0094, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.112, "step": 5500 }, { "epoch": 0.95, - "grad_norm": 2.640625, + "grad_norm": 2.78125, "learning_rate": 3.873882646188265e-08, - "logits/chosen": -3.009657382965088, - "logits/rejected": -3.0036680698394775, - "logps/chosen": -195.98202514648438, - "logps/rejected": -214.7615203857422, - "loss": 0.6468, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.330396294593811, - "rewards/margins": 0.2086518257856369, - "rewards/rejected": -1.539048194885254, + "logits/chosen": -3.007052183151245, + "logits/rejected": -3.0012123584747314, + "logps/chosen": -197.01986694335938, + "logps/rejected": -216.42385864257812, + "loss": 0.6467, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3407747745513916, + "rewards/margins": 0.21489660441875458, + "rewards/rejected": -1.5556714534759521, "step": 5510 }, { "epoch": 0.95, - "grad_norm": 2.59375, + "grad_norm": 2.546875, "learning_rate": 3.6145944467777525e-08, - "logits/chosen": -3.008087635040283, - "logits/rejected": -2.9986672401428223, - "logps/chosen": -182.70603942871094, - "logps/rejected": -208.48681640625, - "loss": 0.5892, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1439793109893799, - "rewards/margins": 0.34625551104545593, - "rewards/rejected": -1.4902350902557373, + "logits/chosen": -3.0063059329986572, + "logits/rejected": -2.997032642364502, + "logps/chosen": -184.0959014892578, + "logps/rejected": -211.39956665039062, + "loss": 0.5849, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.157877802848816, + "rewards/margins": 0.3614843785762787, + "rewards/rejected": -1.519362211227417, "step": 5520 }, { "epoch": 0.95, - "grad_norm": 3.46875, + "grad_norm": 3.5625, "learning_rate": 3.364223676767725e-08, - "logits/chosen": -3.005150556564331, - "logits/rejected": -2.9966344833374023, - "logps/chosen": -191.7229766845703, - "logps/rejected": -206.38412475585938, - "loss": 0.6328, + "logits/chosen": -3.0024030208587646, + "logits/rejected": -2.994372606277466, + "logps/chosen": -192.8942108154297, + "logps/rejected": -207.86978149414062, + "loss": 0.633, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2086269855499268, - "rewards/margins": 0.2549501061439514, - "rewards/rejected": -1.463577151298523, + "rewards/chosen": -1.220339298248291, + "rewards/margins": 0.2580941915512085, + "rewards/rejected": -1.4784334897994995, "step": 5530 }, { "epoch": 0.95, - "grad_norm": 2.96875, + "grad_norm": 3.078125, "learning_rate": 3.122779397829845e-08, - "logits/chosen": -3.016235828399658, - "logits/rejected": -3.0094943046569824, - "logps/chosen": -179.7984619140625, - "logps/rejected": -212.85830688476562, - "loss": 0.6018, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1654951572418213, - "rewards/margins": 0.3237941563129425, - "rewards/rejected": -1.4892891645431519, + "logits/chosen": -3.0149142742156982, + "logits/rejected": -3.007732391357422, + "logps/chosen": -181.58314514160156, + "logps/rejected": -215.8975830078125, + "loss": 0.5995, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1833419799804688, + "rewards/margins": 0.33634015917778015, + "rewards/rejected": -1.5196820497512817, "step": 5540 }, { "epoch": 0.96, - "grad_norm": 2.953125, + "grad_norm": 3.21875, "learning_rate": 2.8902703485593208e-08, - "logits/chosen": -2.991302967071533, - "logits/rejected": -2.9869298934936523, - "logps/chosen": -183.2467803955078, - "logps/rejected": -202.32962036132812, - "loss": 0.6599, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2108595371246338, - "rewards/margins": 0.19090045988559723, - "rewards/rejected": -1.4017599821090698, + "logits/chosen": -2.98718523979187, + "logits/rejected": -2.982654094696045, + "logps/chosen": -184.0675811767578, + "logps/rejected": -203.8521270751953, + "loss": 0.6588, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2190674543380737, + "rewards/margins": 0.1979176253080368, + "rewards/rejected": -1.4169851541519165, "step": 5550 }, { "epoch": 0.96, - "grad_norm": 3.671875, + "grad_norm": 3.578125, "learning_rate": 2.666704944158438e-08, - "logits/chosen": -3.012624740600586, - "logits/rejected": -3.0047385692596436, - "logps/chosen": -179.16905212402344, - "logps/rejected": -196.2181854248047, - "loss": 0.6331, + "logits/chosen": -3.010056734085083, + "logits/rejected": -3.002088785171509, + "logps/chosen": -179.8453369140625, + "logps/rejected": -197.3083038330078, + "loss": 0.634, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1613173484802246, - "rewards/margins": 0.2275480031967163, - "rewards/rejected": -1.388865351676941, + "rewards/chosen": -1.1680803298950195, + "rewards/margins": 0.23168618977069855, + "rewards/rejected": -1.399766445159912, "step": 5560 }, { "epoch": 0.96, - "grad_norm": 3.015625, + "grad_norm": 2.9375, "learning_rate": 2.4520912761320515e-08, - "logits/chosen": -3.006896495819092, - "logits/rejected": -3.0060629844665527, - "logps/chosen": -184.01107788085938, - "logps/rejected": -204.49400329589844, - "loss": 0.6539, + "logits/chosen": -3.004617214202881, + "logits/rejected": -3.0041353702545166, + "logps/chosen": -184.72076416015625, + "logps/rejected": -206.00390625, + "loss": 0.6545, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2240345478057861, - "rewards/margins": 0.17588286101818085, - "rewards/rejected": -1.3999173641204834, + "rewards/chosen": -1.2311315536499023, + "rewards/margins": 0.1838848888874054, + "rewards/rejected": -1.4150164127349854, "step": 5570 }, { "epoch": 0.96, - "grad_norm": 2.765625, + "grad_norm": 2.578125, "learning_rate": 2.2464371119947926e-08, - "logits/chosen": -3.013871908187866, - "logits/rejected": -3.0044972896575928, - "logps/chosen": -185.55514526367188, - "logps/rejected": -216.16012573242188, - "loss": 0.5986, + "logits/chosen": -3.0113699436187744, + "logits/rejected": -3.001690626144409, + "logps/chosen": -186.34310913085938, + "logps/rejected": -216.634765625, + "loss": 0.6002, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2228542566299438, - "rewards/margins": 0.31742024421691895, - "rewards/rejected": -1.5402743816375732, + "rewards/chosen": -1.2307337522506714, + "rewards/margins": 0.3142867386341095, + "rewards/rejected": -1.545020580291748, "step": 5580 }, { "epoch": 0.96, - "grad_norm": 3.65625, + "grad_norm": 3.515625, "learning_rate": 2.049749894989822e-08, - "logits/chosen": -3.023019313812256, - "logits/rejected": -3.017407178878784, - "logps/chosen": -189.84158325195312, - "logps/rejected": -214.7401123046875, - "loss": 0.6201, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2183094024658203, - "rewards/margins": 0.2928226590156555, - "rewards/rejected": -1.5111321210861206, + "logits/chosen": -3.020048141479492, + "logits/rejected": -3.014925479888916, + "logps/chosen": -191.5043487548828, + "logps/rejected": -217.0632781982422, + "loss": 0.6179, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2349369525909424, + "rewards/margins": 0.29942673444747925, + "rewards/rejected": -1.534363865852356, "step": 5590 }, { "epoch": 0.96, - "grad_norm": 3.015625, + "grad_norm": 3.3125, "learning_rate": 1.8620367438194898e-08, - "logits/chosen": -3.0226356983184814, - "logits/rejected": -3.015810489654541, - "logps/chosen": -182.97305297851562, - "logps/rejected": -214.4624786376953, - "loss": 0.6149, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1971943378448486, - "rewards/margins": 0.2937503159046173, - "rewards/rejected": -1.490944743156433, + "logits/chosen": -3.0188074111938477, + "logits/rejected": -3.011625051498413, + "logps/chosen": -184.4583282470703, + "logps/rejected": -216.49002075195312, + "loss": 0.6148, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2120471000671387, + "rewards/margins": 0.2991730570793152, + "rewards/rejected": -1.511220097541809, "step": 5600 }, { "epoch": 0.96, - "eval_logits/chosen": -3.0243780612945557, - "eval_logits/rejected": -3.0180587768554688, - "eval_logps/chosen": -166.78590393066406, - "eval_logps/rejected": -187.1136932373047, - "eval_loss": 0.6480231881141663, - "eval_rewards/accuracies": 0.6154739856719971, - "eval_rewards/chosen": -0.953758955001831, - "eval_rewards/margins": 0.1663074791431427, - "eval_rewards/rejected": -1.120066523551941, - "eval_runtime": 484.1208, - "eval_samples_per_second": 8.89, - "eval_steps_per_second": 1.111, + "eval_logits/chosen": -3.0204532146453857, + "eval_logits/rejected": -3.0142104625701904, + "eval_logps/chosen": -166.8396453857422, + "eval_logps/rejected": -187.79342651367188, + "eval_loss": 0.6461296081542969, + "eval_rewards/accuracies": 0.6194238066673279, + "eval_rewards/chosen": -0.9542962312698364, + "eval_rewards/margins": 0.17256729304790497, + "eval_rewards/rejected": -1.1268635988235474, + "eval_runtime": 483.9014, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.112, "step": 5600 }, { "epoch": 0.97, - "grad_norm": 2.84375, + "grad_norm": 2.9375, "learning_rate": 1.683304452387763e-08, - "logits/chosen": -3.020547389984131, - "logits/rejected": -3.0156941413879395, - "logps/chosen": -179.31251525878906, - "logps/rejected": -217.8151397705078, - "loss": 0.585, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1683876514434814, - "rewards/margins": 0.375874400138855, - "rewards/rejected": -1.5442620515823364, + "logits/chosen": -3.016793727874756, + "logits/rejected": -3.012073040008545, + "logps/chosen": -180.9460906982422, + "logps/rejected": -219.7074432373047, + "loss": 0.5845, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1847233772277832, + "rewards/margins": 0.37846142053604126, + "rewards/rejected": -1.5631848573684692, "step": 5610 }, { "epoch": 0.97, - "grad_norm": 4.03125, + "grad_norm": 3.859375, "learning_rate": 1.5135594895542005e-08, - "logits/chosen": -2.9989235401153564, - "logits/rejected": -2.9931704998016357, - "logps/chosen": -190.47634887695312, - "logps/rejected": -206.96914672851562, - "loss": 0.6365, + "logits/chosen": -2.995067834854126, + "logits/rejected": -2.9892044067382812, + "logps/chosen": -192.774169921875, + "logps/rejected": -210.0087890625, + "loss": 0.6356, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2506822347640991, - "rewards/margins": 0.22417505085468292, - "rewards/rejected": -1.474857211112976, + "rewards/chosen": -1.2736603021621704, + "rewards/margins": 0.2315937578678131, + "rewards/rejected": -1.5052540302276611, "step": 5620 }, { "epoch": 0.97, - "grad_norm": 2.875, + "grad_norm": 2.671875, "learning_rate": 1.352807998899891e-08, - "logits/chosen": -3.015920877456665, - "logits/rejected": -3.008056879043579, - "logps/chosen": -187.56515502929688, - "logps/rejected": -210.36080932617188, - "loss": 0.6209, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2147810459136963, - "rewards/margins": 0.2961927354335785, - "rewards/rejected": -1.5109736919403076, + "logits/chosen": -3.013383388519287, + "logits/rejected": -3.0054855346679688, + "logps/chosen": -187.69821166992188, + "logps/rejected": -210.5667266845703, + "loss": 0.6201, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2161115407943726, + "rewards/margins": 0.2969212830066681, + "rewards/rejected": -1.5130326747894287, "step": 5630 }, { "epoch": 0.97, - "grad_norm": 2.6875, + "grad_norm": 2.84375, "learning_rate": 1.2010557985051297e-08, - "logits/chosen": -3.017130136489868, - "logits/rejected": -3.0098698139190674, - "logps/chosen": -178.4241485595703, - "logps/rejected": -207.69338989257812, - "loss": 0.6181, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.12577223777771, - "rewards/margins": 0.3019818663597107, - "rewards/rejected": -1.4277540445327759, + "logits/chosen": -3.013478994369507, + "logits/rejected": -3.0061988830566406, + "logps/chosen": -179.22596740722656, + "logps/rejected": -209.5537872314453, + "loss": 0.6138, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1337904930114746, + "rewards/margins": 0.31256765127182007, + "rewards/rejected": -1.4463579654693604, "step": 5640 }, { "epoch": 0.97, - "grad_norm": 2.609375, + "grad_norm": 2.4375, "learning_rate": 1.0583083807387818e-08, - "logits/chosen": -3.018669843673706, - "logits/rejected": -3.0065765380859375, - "logps/chosen": -177.80455017089844, - "logps/rejected": -208.6228790283203, - "loss": 0.6129, + "logits/chosen": -3.0160446166992188, + "logits/rejected": -3.0032248497009277, + "logps/chosen": -178.92898559570312, + "logps/rejected": -210.04541015625, + "loss": 0.6143, "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1684799194335938, - "rewards/margins": 0.32036706805229187, - "rewards/rejected": -1.488847017288208, + "rewards/chosen": -1.179724097251892, + "rewards/margins": 0.3233483135700226, + "rewards/rejected": -1.5030725002288818, "step": 5650 }, { "epoch": 0.98, - "grad_norm": 3.203125, + "grad_norm": 3.3125, "learning_rate": 9.245709120595526e-09, - "logits/chosen": -3.0136618614196777, - "logits/rejected": -3.000986099243164, - "logps/chosen": -181.29054260253906, - "logps/rejected": -212.631591796875, - "loss": 0.5992, + "logits/chosen": -3.0110814571380615, + "logits/rejected": -2.997427225112915, + "logps/chosen": -183.3815155029297, + "logps/rejected": -215.2753143310547, + "loss": 0.6007, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1949920654296875, - "rewards/margins": 0.3406812846660614, - "rewards/rejected": -1.5356733798980713, + "rewards/chosen": -1.2159018516540527, + "rewards/margins": 0.34620875120162964, + "rewards/rejected": -1.5621105432510376, "step": 5660 }, { "epoch": 0.98, - "grad_norm": 2.53125, + "grad_norm": 2.359375, "learning_rate": 7.998482328289702e-09, - "logits/chosen": -3.011711597442627, - "logits/rejected": -3.0005130767822266, - "logps/chosen": -176.48794555664062, - "logps/rejected": -198.55197143554688, - "loss": 0.6149, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1117572784423828, - "rewards/margins": 0.25985077023506165, - "rewards/rejected": -1.371608018875122, + "logits/chosen": -3.0085549354553223, + "logits/rejected": -2.997692823410034, + "logps/chosen": -177.83279418945312, + "logps/rejected": -199.79359436035156, + "loss": 0.6171, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1252058744430542, + "rewards/margins": 0.2588183283805847, + "rewards/rejected": -1.3840242624282837, "step": 5670 }, { "epoch": 0.98, - "grad_norm": 4.4375, + "grad_norm": 4.65625, "learning_rate": 6.841448571361376e-09, - "logits/chosen": -3.0058178901672363, - "logits/rejected": -3.003483533859253, - "logps/chosen": -186.4090118408203, - "logps/rejected": -207.31539916992188, - "loss": 0.6124, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2142975330352783, - "rewards/margins": 0.2691487967967987, - "rewards/rejected": -1.4834461212158203, + "logits/chosen": -3.002760648727417, + "logits/rejected": -3.0002365112304688, + "logps/chosen": -186.73818969726562, + "logps/rejected": -208.52145385742188, + "loss": 0.6087, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2175891399383545, + "rewards/margins": 0.27791762351989746, + "rewards/rejected": -1.495506763458252, "step": 5680 }, { "epoch": 0.98, "grad_norm": 2.890625, "learning_rate": 5.774649726345283e-09, - "logits/chosen": -3.0200724601745605, - "logits/rejected": -3.005723476409912, - "logps/chosen": -187.30740356445312, - "logps/rejected": -212.05178833007812, - "loss": 0.5797, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1744122505187988, - "rewards/margins": 0.34947630763053894, - "rewards/rejected": -1.5238884687423706, + "logits/chosen": -3.0148816108703613, + "logits/rejected": -3.000577449798584, + "logps/chosen": -188.50425720214844, + "logps/rejected": -214.07492065429688, + "loss": 0.5782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1863806247711182, + "rewards/margins": 0.35773932933807373, + "rewards/rejected": -1.5441200733184814, "step": 5690 }, { "epoch": 0.98, - "grad_norm": 5.1875, + "grad_norm": 5.34375, "learning_rate": 4.798124403902205e-09, - "logits/chosen": -3.0057690143585205, - "logits/rejected": -2.9969992637634277, - "logps/chosen": -183.09750366210938, - "logps/rejected": -202.24923706054688, - "loss": 0.6275, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.1517393589019775, - "rewards/margins": 0.25165319442749023, - "rewards/rejected": -1.4033925533294678, + "logits/chosen": -3.0013067722320557, + "logits/rejected": -2.9927356243133545, + "logps/chosen": -184.54940795898438, + "logps/rejected": -203.5726776123047, + "loss": 0.6299, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1662585735321045, + "rewards/margins": 0.25036853551864624, + "rewards/rejected": -1.4166268110275269, "step": 5700 }, { "epoch": 0.98, - "eval_logits/chosen": -3.024533748626709, - "eval_logits/rejected": -3.018249750137329, - "eval_logps/chosen": -166.6790771484375, - "eval_logps/rejected": -186.94839477539062, - "eval_loss": 0.6482394337654114, - "eval_rewards/accuracies": 0.6177973747253418, - "eval_rewards/chosen": -0.9526904821395874, - "eval_rewards/margins": 0.16572298109531403, - "eval_rewards/rejected": -1.1184134483337402, - "eval_runtime": 483.6009, - "eval_samples_per_second": 8.9, + "eval_logits/chosen": -3.022794485092163, + "eval_logits/rejected": -3.0166068077087402, + "eval_logps/chosen": -166.8362579345703, + "eval_logps/rejected": -187.7362518310547, + "eval_loss": 0.6462457776069641, + "eval_rewards/accuracies": 0.6194238066673279, + "eval_rewards/chosen": -0.9542624354362488, + "eval_rewards/margins": 0.1720295399427414, + "eval_rewards/rejected": -1.1262919902801514, + "eval_runtime": 483.9007, + "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 5700 }, { "epoch": 0.98, - "grad_norm": 2.921875, + "grad_norm": 3.0, "learning_rate": 3.911907947422577e-09, - "logits/chosen": -3.0172641277313232, - "logits/rejected": -3.01008939743042, - "logps/chosen": -184.43312072753906, - "logps/rejected": -212.61965942382812, - "loss": 0.5931, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1918230056762695, - "rewards/margins": 0.332853227853775, - "rewards/rejected": -1.5246760845184326, + "logits/chosen": -3.0133697986602783, + "logits/rejected": -3.0069236755371094, + "logps/chosen": -186.69503784179688, + "logps/rejected": -214.79434204101562, + "loss": 0.5947, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2144420146942139, + "rewards/margins": 0.33198100328445435, + "rewards/rejected": -1.5464229583740234, "step": 5710 }, { "epoch": 0.99, - "grad_norm": 3.171875, + "grad_norm": 3.25, "learning_rate": 3.116032431747518e-09, - "logits/chosen": -3.0023066997528076, - "logits/rejected": -2.9940028190612793, - "logps/chosen": -186.46493530273438, - "logps/rejected": -214.6597137451172, - "loss": 0.6009, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2312668561935425, - "rewards/margins": 0.3432316184043884, - "rewards/rejected": -1.5744985342025757, + "logits/chosen": -3.001695156097412, + "logits/rejected": -2.993189811706543, + "logps/chosen": -187.8595428466797, + "logps/rejected": -216.593994140625, + "loss": 0.601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2452127933502197, + "rewards/margins": 0.34862878918647766, + "rewards/rejected": -1.5938416719436646, "step": 5720 }, { "epoch": 0.99, - "grad_norm": 3.125, + "grad_norm": 3.078125, "learning_rate": 2.410526662007251e-09, - "logits/chosen": -3.014190912246704, - "logits/rejected": -3.008674383163452, - "logps/chosen": -181.78567504882812, - "logps/rejected": -203.11676025390625, - "loss": 0.6337, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.188835859298706, - "rewards/margins": 0.24085529148578644, - "rewards/rejected": -1.429691195487976, + "logits/chosen": -3.010476589202881, + "logits/rejected": -3.004617691040039, + "logps/chosen": -182.94322204589844, + "logps/rejected": -204.91983032226562, + "loss": 0.6326, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2004114389419556, + "rewards/margins": 0.2473103255033493, + "rewards/rejected": -1.4477218389511108, "step": 5730 }, { "epoch": 0.99, - "grad_norm": 3.0, + "grad_norm": 2.890625, "learning_rate": 1.7954161725791674e-09, - "logits/chosen": -3.0035383701324463, - "logits/rejected": -2.988966464996338, - "logps/chosen": -197.06796264648438, - "logps/rejected": -226.5892791748047, - "loss": 0.5916, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.283865213394165, - "rewards/margins": 0.36810502409935, - "rewards/rejected": -1.6519702672958374, + "logits/chosen": -3.0005202293395996, + "logits/rejected": -2.9857637882232666, + "logps/chosen": -198.15444946289062, + "logps/rejected": -229.3535614013672, + "loss": 0.5853, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2947299480438232, + "rewards/margins": 0.38488340377807617, + "rewards/rejected": -1.679613471031189, "step": 5740 }, { "epoch": 0.99, - "grad_norm": 4.0625, + "grad_norm": 4.03125, "learning_rate": 1.270723226163284e-09, - "logits/chosen": -3.031846761703491, - "logits/rejected": -3.027182102203369, - "logps/chosen": -192.7350311279297, - "logps/rejected": -200.8181610107422, - "loss": 0.6526, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.245029330253601, - "rewards/margins": 0.17930933833122253, - "rewards/rejected": -1.424338698387146, + "logits/chosen": -3.02968168258667, + "logits/rejected": -3.025573253631592, + "logps/chosen": -192.98440551757812, + "logps/rejected": -201.9522705078125, + "loss": 0.6502, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2475230693817139, + "rewards/margins": 0.1881568729877472, + "rewards/rejected": -1.4356797933578491, "step": 5750 }, { "epoch": 0.99, - "grad_norm": 2.640625, + "grad_norm": 2.71875, "learning_rate": 8.364668129762221e-10, - "logits/chosen": -3.01188325881958, - "logits/rejected": -3.001596212387085, - "logps/chosen": -187.3595428466797, - "logps/rejected": -212.9336395263672, - "loss": 0.6134, + "logits/chosen": -3.0103919506073, + "logits/rejected": -3.0001320838928223, + "logps/chosen": -188.3455352783203, + "logps/rejected": -213.83786010742188, + "loss": 0.6157, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2060251235961914, - "rewards/margins": 0.28464144468307495, - "rewards/rejected": -1.490666389465332, + "rewards/chosen": -1.2158849239349365, + "rewards/margins": 0.2838238775730133, + "rewards/rejected": -1.499708890914917, "step": 5760 }, { "epoch": 0.99, - "grad_norm": 3.015625, + "grad_norm": 3.109375, "learning_rate": 4.926626500648124e-10, - "logits/chosen": -3.0015952587127686, - "logits/rejected": -2.9895007610321045, - "logps/chosen": -179.03176879882812, - "logps/rejected": -207.6799774169922, - "loss": 0.6018, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1616852283477783, - "rewards/margins": 0.2887330949306488, - "rewards/rejected": -1.45041823387146, + "logits/chosen": -2.9988603591918945, + "logits/rejected": -2.9865429401397705, + "logps/chosen": -179.5717010498047, + "logps/rejected": -208.49853515625, + "loss": 0.6028, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1670845746994019, + "rewards/margins": 0.2915195822715759, + "rewards/rejected": -1.4586042165756226, "step": 5770 }, { "epoch": 1.0, - "grad_norm": 3.53125, + "grad_norm": 3.453125, "learning_rate": 2.393231807362728e-10, - "logits/chosen": -3.0064327716827393, - "logits/rejected": -2.9963736534118652, - "logps/chosen": -185.0785675048828, - "logps/rejected": -212.9348602294922, - "loss": 0.619, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.22031569480896, - "rewards/margins": 0.2880048155784607, - "rewards/rejected": -1.5083205699920654, + "logits/chosen": -3.005258798599243, + "logits/rejected": -2.994863986968994, + "logps/chosen": -185.7277374267578, + "logps/rejected": -214.2324981689453, + "loss": 0.6172, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2268074750900269, + "rewards/margins": 0.2944895625114441, + "rewards/rejected": -1.5212970972061157, "step": 5780 }, { "epoch": 1.0, - "grad_norm": 2.515625, + "grad_norm": 2.640625, "learning_rate": 7.645757410912336e-11, - "logits/chosen": -2.999741315841675, - "logits/rejected": -2.9899821281433105, - "logps/chosen": -166.2742156982422, - "logps/rejected": -199.00619506835938, - "loss": 0.5938, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.0550991296768188, - "rewards/margins": 0.31244513392448425, - "rewards/rejected": -1.3675440549850464, + "logits/chosen": -2.9988582134246826, + "logits/rejected": -2.9891772270202637, + "logps/chosen": -166.65049743652344, + "logps/rejected": -200.60191345214844, + "loss": 0.5918, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0588618516921997, + "rewards/margins": 0.32463932037353516, + "rewards/rejected": -1.3835010528564453, "step": 5790 }, { "epoch": 1.0, - "grad_norm": 2.765625, + "grad_norm": 2.78125, "learning_rate": 4.071724779286523e-12, - "logits/chosen": -3.018495559692383, - "logits/rejected": -3.012821674346924, - "logps/chosen": -173.92466735839844, - "logps/rejected": -207.4680938720703, - "loss": 0.5876, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.1266343593597412, - "rewards/margins": 0.35082951188087463, - "rewards/rejected": -1.477463960647583, + "logits/chosen": -3.0157129764556885, + "logits/rejected": -3.0100326538085938, + "logps/chosen": -174.407958984375, + "logps/rejected": -209.0814208984375, + "loss": 0.5854, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1314672231674194, + "rewards/margins": 0.36212995648384094, + "rewards/rejected": -1.4935972690582275, "step": 5800 }, { "epoch": 1.0, - "eval_logits/chosen": -3.023923397064209, - "eval_logits/rejected": -3.017603874206543, - "eval_logps/chosen": -166.78807067871094, - "eval_logps/rejected": -187.0471954345703, - "eval_loss": 0.6481729745864868, - "eval_rewards/accuracies": 0.6171003580093384, - "eval_rewards/chosen": -0.9537805914878845, - "eval_rewards/margins": 0.1656205952167511, - "eval_rewards/rejected": -1.1194013357162476, - "eval_runtime": 483.4522, - "eval_samples_per_second": 8.903, - "eval_steps_per_second": 1.113, + "eval_logits/chosen": -3.0223519802093506, + "eval_logits/rejected": -3.0161619186401367, + "eval_logps/chosen": -167.01016235351562, + "eval_logps/rejected": -187.9011993408203, + "eval_loss": 0.6463221311569214, + "eval_rewards/accuracies": 0.6203531622886658, + "eval_rewards/chosen": -0.9560015797615051, + "eval_rewards/margins": 0.17193979024887085, + "eval_rewards/rejected": -1.127941370010376, + "eval_runtime": 483.8696, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.112, "step": 5800 }, { "epoch": 1.0, "step": 5803, "total_flos": 0.0, - "train_loss": 0.0003277428618961422, - "train_runtime": 17.7068, - "train_samples_per_second": 5244.214, - "train_steps_per_second": 327.728 + "train_loss": 0.6317814285541924, + "train_runtime": 53813.0859, + "train_samples_per_second": 1.726, + "train_steps_per_second": 0.108 } ], "logging_steps": 10,