{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 500, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 18.219385651116053, "learning_rate": 2.617801047120419e-09, "logits/chosen": 5870.685546875, "logits/rejected": 4942.87255859375, "logps/chosen": -300.06866455078125, "logps/rejected": -172.3806915283203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 17.235981665270252, "learning_rate": 2.6178010471204188e-08, "logits/chosen": 4513.25439453125, "logits/rejected": 4184.88818359375, "logps/chosen": -237.9716033935547, "logps/rejected": -219.00857543945312, "loss": 0.6933, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.00043410700163803995, "rewards/margins": -0.00041542822145856917, "rewards/rejected": -1.8678772903513163e-05, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 16.065934368869318, "learning_rate": 5.2356020942408376e-08, "logits/chosen": 6490.0400390625, "logits/rejected": 5858.52490234375, "logps/chosen": -313.576171875, "logps/rejected": -287.2350158691406, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0005803096573799849, "rewards/margins": 0.0009115642169490457, "rewards/rejected": -0.0003312545013613999, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 15.920025055683531, "learning_rate": 7.853403141361257e-08, "logits/chosen": 6130.9091796875, "logits/rejected": 4619.53173828125, "logps/chosen": -287.20556640625, "logps/rejected": -230.14352416992188, "loss": 0.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00016442504420410842, "rewards/margins": 0.0013787832576781511, "rewards/rejected": -0.0012143582571297884, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 19.041793171522134, "learning_rate": 1.0471204188481675e-07, "logits/chosen": 6250.5380859375, "logits/rejected": 5154.09716796875, "logps/chosen": -314.29571533203125, "logps/rejected": -284.4984130859375, "loss": 0.6927, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.001805333187803626, "rewards/margins": 0.0019232326885685325, "rewards/rejected": -0.00011789942800533026, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 15.141940304638405, "learning_rate": 1.3089005235602092e-07, "logits/chosen": 5869.23583984375, "logits/rejected": 5015.390625, "logps/chosen": -278.0210876464844, "logps/rejected": -260.28076171875, "loss": 0.6917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0023084317799657583, "rewards/margins": 0.0028667484875768423, "rewards/rejected": -0.0005583164747804403, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 17.08217856519977, "learning_rate": 1.5706806282722514e-07, "logits/chosen": 5986.6494140625, "logits/rejected": 4455.423828125, "logps/chosen": -321.358154296875, "logps/rejected": -236.8417205810547, "loss": 0.6891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0058924416080117226, "rewards/margins": 0.0071367500349879265, "rewards/rejected": -0.0012443081941455603, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 16.204216557148193, "learning_rate": 1.8324607329842932e-07, "logits/chosen": 5881.57421875, "logits/rejected": 5116.564453125, "logps/chosen": -285.27740478515625, "logps/rejected": -259.5113830566406, "loss": 0.6883, "rewards/accuracies": 0.6875, "rewards/chosen": 0.011950762942433357, "rewards/margins": 0.011513126082718372, "rewards/rejected": 0.0004376379365567118, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 15.596476937669543, "learning_rate": 2.094240837696335e-07, "logits/chosen": 5791.3642578125, "logits/rejected": 4847.74462890625, "logps/chosen": -272.760009765625, "logps/rejected": -241.96463012695312, "loss": 0.6827, "rewards/accuracies": 0.75, "rewards/chosen": 0.029420843347907066, "rewards/margins": 0.02470467798411846, "rewards/rejected": 0.004716166295111179, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 16.155654611877022, "learning_rate": 2.356020942408377e-07, "logits/chosen": 6104.376953125, "logits/rejected": 5388.201171875, "logps/chosen": -293.25665283203125, "logps/rejected": -278.4584655761719, "loss": 0.6801, "rewards/accuracies": 0.5625, "rewards/chosen": 0.03633292764425278, "rewards/margins": 0.020106201991438866, "rewards/rejected": 0.01622672937810421, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 16.685655525961554, "learning_rate": 2.6178010471204185e-07, "logits/chosen": 5451.1865234375, "logits/rejected": 4855.86181640625, "logps/chosen": -246.4558563232422, "logps/rejected": -211.8059844970703, "loss": 0.677, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.020030105486512184, "rewards/margins": 0.014237035997211933, "rewards/rejected": 0.005793069489300251, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 17.053147091965794, "learning_rate": 2.879581151832461e-07, "logits/chosen": 4958.31884765625, "logits/rejected": 3970.31396484375, "logps/chosen": -246.61898803710938, "logps/rejected": -188.33499145507812, "loss": 0.6659, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.029025157913565636, "rewards/margins": 0.057013750076293945, "rewards/rejected": -0.02798858843743801, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 16.545038194152365, "learning_rate": 3.1413612565445027e-07, "logits/chosen": 6173.68212890625, "logits/rejected": 5564.80078125, "logps/chosen": -292.63348388671875, "logps/rejected": -283.7936706542969, "loss": 0.662, "rewards/accuracies": 0.625, "rewards/chosen": 0.014459408819675446, "rewards/margins": 0.08033261448144913, "rewards/rejected": -0.06587319076061249, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 19.676877695748942, "learning_rate": 3.4031413612565446e-07, "logits/chosen": 6213.53125, "logits/rejected": 4406.7197265625, "logps/chosen": -277.0280456542969, "logps/rejected": -229.03775024414062, "loss": 0.6505, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.013297341763973236, "rewards/margins": 0.1205199584364891, "rewards/rejected": -0.10722261667251587, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 19.250986468179725, "learning_rate": 3.6649214659685864e-07, "logits/chosen": 5914.55908203125, "logits/rejected": 5749.5546875, "logps/chosen": -303.387939453125, "logps/rejected": -319.0635681152344, "loss": 0.6504, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.20868118107318878, "rewards/margins": 0.06486045569181442, "rewards/rejected": -0.2735416293144226, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 19.120241781934865, "learning_rate": 3.926701570680628e-07, "logits/chosen": 5598.470703125, "logits/rejected": 5063.0654296875, "logps/chosen": -288.4744873046875, "logps/rejected": -278.3064270019531, "loss": 0.6463, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03469898924231529, "rewards/margins": 0.1066732183098793, "rewards/rejected": -0.14137223362922668, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 20.879794925554606, "learning_rate": 4.18848167539267e-07, "logits/chosen": 5560.33642578125, "logits/rejected": 5010.7998046875, "logps/chosen": -242.72854614257812, "logps/rejected": -265.71160888671875, "loss": 0.6381, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.07217627763748169, "rewards/margins": 0.1330757886171341, "rewards/rejected": -0.060899507254362106, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 19.704247956213553, "learning_rate": 4.450261780104712e-07, "logits/chosen": 6847.02587890625, "logits/rejected": 5505.11083984375, "logps/chosen": -308.2012634277344, "logps/rejected": -309.3101806640625, "loss": 0.6024, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.14331135153770447, "rewards/margins": 0.27965664863586426, "rewards/rejected": -0.4229680001735687, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 24.516429995380633, "learning_rate": 4.712041884816754e-07, "logits/chosen": 6166.9541015625, "logits/rejected": 4428.91064453125, "logps/chosen": -315.04620361328125, "logps/rejected": -281.04083251953125, "loss": 0.6034, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.25546202063560486, "rewards/margins": 0.21370892226696014, "rewards/rejected": -0.4691709876060486, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 25.97707933799451, "learning_rate": 4.973821989528796e-07, "logits/chosen": 5838.8359375, "logits/rejected": 5683.42529296875, "logps/chosen": -275.9669189453125, "logps/rejected": -315.14813232421875, "loss": 0.6116, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.12693677842617035, "rewards/margins": 0.17842599749565125, "rewards/rejected": -0.3053628206253052, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 21.5648227076861, "learning_rate": 4.999661831436498e-07, "logits/chosen": 5913.36572265625, "logits/rejected": 5817.02001953125, "logps/chosen": -281.7383117675781, "logps/rejected": -325.94866943359375, "loss": 0.6186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1391076147556305, "rewards/margins": 0.27709800004959106, "rewards/rejected": -0.41620558500289917, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 23.653917732151342, "learning_rate": 4.998492971140339e-07, "logits/chosen": 5833.1513671875, "logits/rejected": 5763.98828125, "logps/chosen": -301.4639587402344, "logps/rejected": -368.21435546875, "loss": 0.6093, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3761170506477356, "rewards/margins": 0.3480328917503357, "rewards/rejected": -0.7241500020027161, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 24.40546569275766, "learning_rate": 4.996489634487865e-07, "logits/chosen": 5946.09765625, "logits/rejected": 5071.6171875, "logps/chosen": -338.2851257324219, "logps/rejected": -332.00750732421875, "loss": 0.6073, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4733448028564453, "rewards/margins": 0.3421005308628082, "rewards/rejected": -0.8154453039169312, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 27.21646553864263, "learning_rate": 4.993652490577246e-07, "logits/chosen": 6565.8515625, "logits/rejected": 5242.6064453125, "logps/chosen": -319.81707763671875, "logps/rejected": -330.72802734375, "loss": 0.5751, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.38077861070632935, "rewards/margins": 0.39619022607803345, "rewards/rejected": -0.7769688367843628, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 26.32441631739712, "learning_rate": 4.9899824869915e-07, "logits/chosen": 5868.26904296875, "logits/rejected": 4399.78662109375, "logps/chosen": -337.031982421875, "logps/rejected": -297.947998046875, "loss": 0.5993, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5988413095474243, "rewards/margins": 0.3144153952598572, "rewards/rejected": -0.9132567644119263, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 44.90708876251853, "learning_rate": 4.985480849482012e-07, "logits/chosen": 5798.130859375, "logits/rejected": 5872.59912109375, "logps/chosen": -307.9162902832031, "logps/rejected": -349.7005920410156, "loss": 0.5857, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.46432504057884216, "rewards/margins": 0.2551492154598236, "rewards/rejected": -0.7194742560386658, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 45.19986115391165, "learning_rate": 4.980149081559142e-07, "logits/chosen": 6476.58447265625, "logits/rejected": 6131.4462890625, "logps/chosen": -367.92474365234375, "logps/rejected": -391.8291320800781, "loss": 0.5694, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5577724575996399, "rewards/margins": 0.386624276638031, "rewards/rejected": -0.9443964958190918, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 42.08733545483534, "learning_rate": 4.973988963990065e-07, "logits/chosen": 5284.0224609375, "logits/rejected": 4501.1884765625, "logps/chosen": -320.0960693359375, "logps/rejected": -377.3937683105469, "loss": 0.5544, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6243572235107422, "rewards/margins": 0.6635113954544067, "rewards/rejected": -1.2878687381744385, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 53.662262542495505, "learning_rate": 4.967002554204008e-07, "logits/chosen": 5689.02197265625, "logits/rejected": 4741.4453125, "logps/chosen": -367.5455627441406, "logps/rejected": -406.5661315917969, "loss": 0.5339, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8560550808906555, "rewards/margins": 0.7749707698822021, "rewards/rejected": -1.6310256719589233, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 46.63789766427997, "learning_rate": 4.959192185605087e-07, "logits/chosen": 5927.48388671875, "logits/rejected": 5238.05615234375, "logps/chosen": -354.1465148925781, "logps/rejected": -415.89349365234375, "loss": 0.5585, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7841471433639526, "rewards/margins": 0.5948286056518555, "rewards/rejected": -1.3789756298065186, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 34.308974438258886, "learning_rate": 4.950560466792969e-07, "logits/chosen": 6596.2265625, "logits/rejected": 5299.0927734375, "logps/chosen": -406.15313720703125, "logps/rejected": -429.3497009277344, "loss": 0.5435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7677477598190308, "rewards/margins": 0.6641772985458374, "rewards/rejected": -1.4319250583648682, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 82.23748697014982, "learning_rate": 4.941110280691619e-07, "logits/chosen": 5986.08203125, "logits/rejected": 4740.41259765625, "logps/chosen": -357.654052734375, "logps/rejected": -365.6220703125, "loss": 0.5538, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7639473676681519, "rewards/margins": 0.6894143223762512, "rewards/rejected": -1.4533617496490479, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 41.19448913938724, "learning_rate": 4.930844783586424e-07, "logits/chosen": 5201.2353515625, "logits/rejected": 4921.05322265625, "logps/chosen": -310.82574462890625, "logps/rejected": -375.40509033203125, "loss": 0.5533, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9284757375717163, "rewards/margins": 0.5470661520957947, "rewards/rejected": -1.4755420684814453, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 76.38056074864738, "learning_rate": 4.919767404070033e-07, "logits/chosen": 6316.92236328125, "logits/rejected": 5181.3857421875, "logps/chosen": -405.858154296875, "logps/rejected": -420.537109375, "loss": 0.548, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1969449520111084, "rewards/margins": 0.556014895439148, "rewards/rejected": -1.752959966659546, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 67.01683401046546, "learning_rate": 4.907881841897216e-07, "logits/chosen": 5539.5302734375, "logits/rejected": 5639.63037109375, "logps/chosen": -424.65478515625, "logps/rejected": -517.6135864257812, "loss": 0.5626, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5640079975128174, "rewards/margins": 0.5679855942726135, "rewards/rejected": -2.1319937705993652, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 31.85827136868456, "learning_rate": 4.895192066749189e-07, "logits/chosen": 5924.69580078125, "logits/rejected": 4566.55419921875, "logps/chosen": -421.01739501953125, "logps/rejected": -438.85858154296875, "loss": 0.5291, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4220540523529053, "rewards/margins": 0.5200406312942505, "rewards/rejected": -1.9420945644378662, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 41.33731178691857, "learning_rate": 4.881702316907768e-07, "logits/chosen": 6177.900390625, "logits/rejected": 4649.4853515625, "logps/chosen": -359.7803039550781, "logps/rejected": -367.48541259765625, "loss": 0.5359, "rewards/accuracies": 0.6875, "rewards/chosen": -0.821795642375946, "rewards/margins": 0.5971574783325195, "rewards/rejected": -1.4189531803131104, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 49.52794113034413, "learning_rate": 4.86741709783982e-07, "logits/chosen": 5590.2451171875, "logits/rejected": 4720.5322265625, "logps/chosen": -358.56243896484375, "logps/rejected": -439.32843017578125, "loss": 0.5541, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0032289028167725, "rewards/margins": 0.9636434316635132, "rewards/rejected": -1.9668724536895752, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 44.092881923343576, "learning_rate": 4.85234118069247e-07, "logits/chosen": 6412.9873046875, "logits/rejected": 5594.14306640625, "logps/chosen": -396.65447998046875, "logps/rejected": -428.21490478515625, "loss": 0.5464, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0084364414215088, "rewards/margins": 0.5967626571655273, "rewards/rejected": -1.6051992177963257, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 76.52233652264678, "learning_rate": 4.836479600699578e-07, "logits/chosen": 5924.59326171875, "logits/rejected": 5504.5029296875, "logps/chosen": -342.6595153808594, "logps/rejected": -414.57427978515625, "loss": 0.59, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7394064664840698, "rewards/margins": 0.5539022088050842, "rewards/rejected": -1.2933086156845093, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 47.20888121465801, "learning_rate": 4.819837655500013e-07, "logits/chosen": 6445.34130859375, "logits/rejected": 6306.50390625, "logps/chosen": -414.34515380859375, "logps/rejected": -472.36212158203125, "loss": 0.5399, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1763949394226074, "rewards/margins": 0.5224038362503052, "rewards/rejected": -1.6987988948822021, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 39.87824487927702, "learning_rate": 4.802420903368285e-07, "logits/chosen": 5955.009765625, "logits/rejected": 4885.7529296875, "logps/chosen": -395.8122863769531, "logps/rejected": -484.23565673828125, "loss": 0.5291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4695124626159668, "rewards/margins": 0.973471462726593, "rewards/rejected": -2.442983865737915, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 53.89952582024282, "learning_rate": 4.784235161358123e-07, "logits/chosen": 6697.92822265625, "logits/rejected": 5091.77685546875, "logps/chosen": -452.1192321777344, "logps/rejected": -489.41015625, "loss": 0.522, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5608818531036377, "rewards/margins": 0.6900812983512878, "rewards/rejected": -2.2509632110595703, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 53.64934742868112, "learning_rate": 4.7652865033596314e-07, "logits/chosen": 6347.36865234375, "logits/rejected": 5186.87109375, "logps/chosen": -429.01214599609375, "logps/rejected": -489.399169921875, "loss": 0.5164, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5157774686813354, "rewards/margins": 0.6435315608978271, "rewards/rejected": -2.159308910369873, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 43.316512493038786, "learning_rate": 4.7455812580706534e-07, "logits/chosen": 5819.4365234375, "logits/rejected": 4712.92431640625, "logps/chosen": -383.89447021484375, "logps/rejected": -428.0326232910156, "loss": 0.5051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0773056745529175, "rewards/margins": 0.6253499388694763, "rewards/rejected": -1.702655553817749, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 40.32463549649978, "learning_rate": 4.725126006883046e-07, "logits/chosen": 5460.0400390625, "logits/rejected": 5187.6435546875, "logps/chosen": -382.9438781738281, "logps/rejected": -463.376220703125, "loss": 0.5456, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2239887714385986, "rewards/margins": 0.6800339818000793, "rewards/rejected": -1.9040225744247437, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 37.48349919523277, "learning_rate": 4.703927581684539e-07, "logits/chosen": 5890.5068359375, "logits/rejected": 5778.6552734375, "logps/chosen": -375.30609130859375, "logps/rejected": -398.3433532714844, "loss": 0.5675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0234365463256836, "rewards/margins": 0.4840970039367676, "rewards/rejected": -1.5075336694717407, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 33.46265791395517, "learning_rate": 4.68199306257695e-07, "logits/chosen": 5615.3662109375, "logits/rejected": 4484.6279296875, "logps/chosen": -362.4558410644531, "logps/rejected": -425.9368591308594, "loss": 0.5021, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9484370350837708, "rewards/margins": 0.7808512449264526, "rewards/rejected": -1.729288101196289, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 41.07649079601629, "learning_rate": 4.6593297755114776e-07, "logits/chosen": 6466.3056640625, "logits/rejected": 6035.984375, "logps/chosen": -378.1504821777344, "logps/rejected": -465.003173828125, "loss": 0.5469, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1055399179458618, "rewards/margins": 0.6051799654960632, "rewards/rejected": -1.7107200622558594, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 37.586937993939735, "learning_rate": 4.635945289841902e-07, "logits/chosen": 4972.3583984375, "logits/rejected": 5045.6435546875, "logps/chosen": -335.74884033203125, "logps/rejected": -420.6666564941406, "loss": 0.5707, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1341283321380615, "rewards/margins": 0.4123230576515198, "rewards/rejected": -1.546451210975647, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 42.61584498258183, "learning_rate": 4.611847415796476e-07, "logits/chosen": 6352.6376953125, "logits/rejected": 5433.37158203125, "logps/chosen": -395.6455383300781, "logps/rejected": -416.5750427246094, "loss": 0.5502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0464991331100464, "rewards/margins": 0.5715607404708862, "rewards/rejected": -1.6180601119995117, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 31.96372069934404, "learning_rate": 4.5870442018693773e-07, "logits/chosen": 5973.494140625, "logits/rejected": 5411.9462890625, "logps/chosen": -370.54351806640625, "logps/rejected": -440.241943359375, "loss": 0.5114, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9235299825668335, "rewards/margins": 0.6839796900749207, "rewards/rejected": -1.6075098514556885, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 54.78695276780696, "learning_rate": 4.5615439321325735e-07, "logits/chosen": 6326.53125, "logits/rejected": 5008.32275390625, "logps/chosen": -359.27716064453125, "logps/rejected": -423.32672119140625, "loss": 0.5171, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7673004269599915, "rewards/margins": 0.7084370255470276, "rewards/rejected": -1.475737452507019, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 41.821437903417056, "learning_rate": 4.535355123469008e-07, "logits/chosen": 5782.46484375, "logits/rejected": 5206.86962890625, "logps/chosen": -348.8133850097656, "logps/rejected": -427.0050354003906, "loss": 0.5162, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8036266565322876, "rewards/margins": 0.8553822636604309, "rewards/rejected": -1.6590089797973633, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 36.104040296185744, "learning_rate": 4.5084865227280366e-07, "logits/chosen": 5758.5625, "logits/rejected": 5162.15185546875, "logps/chosen": -382.82147216796875, "logps/rejected": -439.8946838378906, "loss": 0.5233, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0022233724594116, "rewards/margins": 0.8220928311347961, "rewards/rejected": -1.8243162631988525, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 73.05209001650977, "learning_rate": 4.4809471038040437e-07, "logits/chosen": 5572.75537109375, "logits/rejected": 4392.76708984375, "logps/chosen": -439.218994140625, "logps/rejected": -457.9751892089844, "loss": 0.5408, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.466726303100586, "rewards/margins": 0.7607309222221375, "rewards/rejected": -2.227457284927368, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 48.9029845725239, "learning_rate": 4.4527460646392386e-07, "logits/chosen": 5651.72216796875, "logits/rejected": 5173.35986328125, "logps/chosen": -379.19842529296875, "logps/rejected": -442.13751220703125, "loss": 0.5675, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3517531156539917, "rewards/margins": 0.562275230884552, "rewards/rejected": -1.9140284061431885, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 43.677593505995624, "learning_rate": 4.4238928241516163e-07, "logits/chosen": 6816.3515625, "logits/rejected": 5143.58349609375, "logps/chosen": -437.1297912597656, "logps/rejected": -473.25128173828125, "loss": 0.5295, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2189630270004272, "rewards/margins": 0.9794257879257202, "rewards/rejected": -2.1983885765075684, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 63.11423854936817, "learning_rate": 4.394397019089116e-07, "logits/chosen": 6103.3896484375, "logits/rejected": 4841.986328125, "logps/chosen": -409.55291748046875, "logps/rejected": -423.4261779785156, "loss": 0.5156, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1972742080688477, "rewards/margins": 0.6304734945297241, "rewards/rejected": -1.8277477025985718, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 54.584236713891464, "learning_rate": 4.3642685008110246e-07, "logits/chosen": 5786.09765625, "logits/rejected": 4412.03515625, "logps/chosen": -372.55584716796875, "logps/rejected": -439.2442321777344, "loss": 0.5591, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1865366697311401, "rewards/margins": 0.9408473968505859, "rewards/rejected": -2.1273841857910156, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 40.053790563245464, "learning_rate": 4.333517331997704e-07, "logits/chosen": 6298.62158203125, "logits/rejected": 5869.5048828125, "logps/chosen": -428.19195556640625, "logps/rejected": -478.00067138671875, "loss": 0.5143, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3069322109222412, "rewards/margins": 0.6191404461860657, "rewards/rejected": -1.9260727167129517, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 35.0814337208229, "learning_rate": 4.302153783289736e-07, "logits/chosen": 6017.439453125, "logits/rejected": 5107.21435546875, "logps/chosen": -382.84521484375, "logps/rejected": -509.23162841796875, "loss": 0.4236, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1522138118743896, "rewards/margins": 1.0982835292816162, "rewards/rejected": -2.250497341156006, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 56.2222562736252, "learning_rate": 4.2701883298576124e-07, "logits/chosen": 5797.8349609375, "logits/rejected": 5281.35791015625, "logps/chosen": -443.8690490722656, "logps/rejected": -503.81103515625, "loss": 0.5353, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6236203908920288, "rewards/margins": 0.9502062797546387, "rewards/rejected": -2.573826313018799, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 55.63889247197654, "learning_rate": 4.237631647903115e-07, "logits/chosen": 5690.2646484375, "logits/rejected": 4674.8740234375, "logps/chosen": -455.040283203125, "logps/rejected": -506.5398864746094, "loss": 0.4961, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7885202169418335, "rewards/margins": 0.806254506111145, "rewards/rejected": -2.5947747230529785, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 50.31232935422226, "learning_rate": 4.204494611093548e-07, "logits/chosen": 6034.45556640625, "logits/rejected": 4252.986328125, "logps/chosen": -460.38092041015625, "logps/rejected": -486.3749084472656, "loss": 0.5231, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4732345342636108, "rewards/margins": 0.9121103286743164, "rewards/rejected": -2.385344982147217, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 45.5866926508609, "learning_rate": 4.1707882869300235e-07, "logits/chosen": 6080.8759765625, "logits/rejected": 4943.0146484375, "logps/chosen": -413.87408447265625, "logps/rejected": -431.1224670410156, "loss": 0.5014, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.335010051727295, "rewards/margins": 0.7433810830116272, "rewards/rejected": -2.0783913135528564, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 49.967926368130314, "learning_rate": 4.136523933051005e-07, "logits/chosen": 6260.0546875, "logits/rejected": 5515.7265625, "logps/chosen": -427.33453369140625, "logps/rejected": -463.00830078125, "loss": 0.5026, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4806843996047974, "rewards/margins": 0.5847845673561096, "rewards/rejected": -2.0654690265655518, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 43.43487493207468, "learning_rate": 4.101712993472348e-07, "logits/chosen": 6464.7451171875, "logits/rejected": 5535.1884765625, "logps/chosen": -382.25323486328125, "logps/rejected": -416.734130859375, "loss": 0.5258, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.05479097366333, "rewards/margins": 0.7567101716995239, "rewards/rejected": -1.811500906944275, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 75.11106325290936, "learning_rate": 4.066367094765091e-07, "logits/chosen": 6027.20458984375, "logits/rejected": 4826.53515625, "logps/chosen": -394.6514587402344, "logps/rejected": -457.4222717285156, "loss": 0.4753, "rewards/accuracies": 0.75, "rewards/chosen": -1.0180187225341797, "rewards/margins": 1.0836880207061768, "rewards/rejected": -2.1017067432403564, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 39.3035282380294, "learning_rate": 4.0304980421722766e-07, "logits/chosen": 5874.466796875, "logits/rejected": 5295.6796875, "logps/chosen": -425.7220153808594, "logps/rejected": -494.6651916503906, "loss": 0.4952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3429622650146484, "rewards/margins": 0.8915923833847046, "rewards/rejected": -2.2345547676086426, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 72.63738048545449, "learning_rate": 3.994117815666095e-07, "logits/chosen": 5882.6201171875, "logits/rejected": 4352.89453125, "logps/chosen": -540.1171875, "logps/rejected": -573.46533203125, "loss": 0.5252, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0739383697509766, "rewards/margins": 1.0526468753814697, "rewards/rejected": -3.1265854835510254, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 43.821611769673716, "learning_rate": 3.957238565946671e-07, "logits/chosen": 5647.4677734375, "logits/rejected": 4672.6025390625, "logps/chosen": -402.78948974609375, "logps/rejected": -439.42181396484375, "loss": 0.5782, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4725525379180908, "rewards/margins": 0.6136714816093445, "rewards/rejected": -2.08622407913208, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 42.031950023528594, "learning_rate": 3.9198726103838306e-07, "logits/chosen": 5673.10546875, "logits/rejected": 5009.50537109375, "logps/chosen": -369.00616455078125, "logps/rejected": -408.2512512207031, "loss": 0.4932, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9950849413871765, "rewards/margins": 0.7163550853729248, "rewards/rejected": -1.711439847946167, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 40.901024619350444, "learning_rate": 3.8820324289031946e-07, "logits/chosen": 5839.904296875, "logits/rejected": 5013.7724609375, "logps/chosen": -351.48541259765625, "logps/rejected": -451.85772705078125, "loss": 0.4757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.088209629058838, "rewards/margins": 1.003163456916809, "rewards/rejected": -2.0913729667663574, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 67.27072060484619, "learning_rate": 3.84373065981799e-07, "logits/chosen": 6519.392578125, "logits/rejected": 4812.6298828125, "logps/chosen": -426.72235107421875, "logps/rejected": -512.9215087890625, "loss": 0.4597, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.334121584892273, "rewards/margins": 1.1064695119857788, "rewards/rejected": -2.440591335296631, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 59.86556321970916, "learning_rate": 3.8049800956079545e-07, "logits/chosen": 6076.96533203125, "logits/rejected": 5167.3095703125, "logps/chosen": -461.48333740234375, "logps/rejected": -533.3235473632812, "loss": 0.5323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7070420980453491, "rewards/margins": 1.1307730674743652, "rewards/rejected": -2.837815046310425, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 33.685203730626526, "learning_rate": 3.7657936786467525e-07, "logits/chosen": 5342.8798828125, "logits/rejected": 4421.5263671875, "logps/chosen": -402.7789001464844, "logps/rejected": -472.66015625, "loss": 0.4928, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5052649974822998, "rewards/margins": 0.917253851890564, "rewards/rejected": -2.422518491744995, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 40.729237557670544, "learning_rate": 3.7261844968793226e-07, "logits/chosen": 4545.2060546875, "logits/rejected": 4567.5732421875, "logps/chosen": -330.99951171875, "logps/rejected": -459.3143615722656, "loss": 0.5137, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1079671382904053, "rewards/margins": 0.9936937093734741, "rewards/rejected": -2.101661205291748, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 41.80861205828446, "learning_rate": 3.6861657794506187e-07, "logits/chosen": 5142.6376953125, "logits/rejected": 4762.04296875, "logps/chosen": -388.6526794433594, "logps/rejected": -440.11773681640625, "loss": 0.5791, "rewards/accuracies": 0.6875, "rewards/chosen": -1.457234263420105, "rewards/margins": 0.5115066766738892, "rewards/rejected": -1.968740701675415, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 33.75516962062128, "learning_rate": 3.6457508922871777e-07, "logits/chosen": 6393.16162109375, "logits/rejected": 4704.26171875, "logps/chosen": -405.71917724609375, "logps/rejected": -469.50787353515625, "loss": 0.4797, "rewards/accuracies": 0.75, "rewards/chosen": -1.3708717823028564, "rewards/margins": 0.9878827929496765, "rewards/rejected": -2.3587546348571777, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 69.41446168689265, "learning_rate": 3.6049533336330084e-07, "logits/chosen": 6274.9033203125, "logits/rejected": 4973.19140625, "logps/chosen": -432.57891845703125, "logps/rejected": -494.46240234375, "loss": 0.5065, "rewards/accuracies": 0.75, "rewards/chosen": -1.4693964719772339, "rewards/margins": 1.0210431814193726, "rewards/rejected": -2.4904398918151855, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 62.54198407820259, "learning_rate": 3.56378672954129e-07, "logits/chosen": 6452.43505859375, "logits/rejected": 4535.1796875, "logps/chosen": -467.36920166015625, "logps/rejected": -505.26416015625, "loss": 0.4896, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5940972566604614, "rewards/margins": 1.0991283655166626, "rewards/rejected": -2.693225383758545, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 60.6478894617012, "learning_rate": 3.5222648293233803e-07, "logits/chosen": 6424.5205078125, "logits/rejected": 5873.54150390625, "logps/chosen": -459.4623107910156, "logps/rejected": -547.5186767578125, "loss": 0.4902, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7614972591400146, "rewards/margins": 0.9000027775764465, "rewards/rejected": -2.6614999771118164, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 45.06218597777103, "learning_rate": 3.480401500956657e-07, "logits/chosen": 5537.083984375, "logits/rejected": 4656.86279296875, "logps/chosen": -401.616943359375, "logps/rejected": -469.45294189453125, "loss": 0.5468, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5688083171844482, "rewards/margins": 0.580168604850769, "rewards/rejected": -2.1489768028259277, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 54.52932769604021, "learning_rate": 3.438210726452724e-07, "logits/chosen": 6457.12548828125, "logits/rejected": 5661.3583984375, "logps/chosen": -436.4351501464844, "logps/rejected": -478.7416076660156, "loss": 0.5272, "rewards/accuracies": 0.75, "rewards/chosen": -1.2537381649017334, "rewards/margins": 0.7841897010803223, "rewards/rejected": -2.0379281044006348, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 50.065290886996, "learning_rate": 3.395706597187538e-07, "logits/chosen": 4831.3525390625, "logits/rejected": 4748.2353515625, "logps/chosen": -361.46270751953125, "logps/rejected": -433.6537170410156, "loss": 0.4847, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3145052194595337, "rewards/margins": 0.74875807762146, "rewards/rejected": -2.063263416290283, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 52.3633732314044, "learning_rate": 3.3529033091949986e-07, "logits/chosen": 5875.1552734375, "logits/rejected": 5419.3779296875, "logps/chosen": -449.91156005859375, "logps/rejected": -558.637451171875, "loss": 0.5165, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4811006784439087, "rewards/margins": 1.0674123764038086, "rewards/rejected": -2.548513174057007, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 73.44300921439607, "learning_rate": 3.309815158425591e-07, "logits/chosen": 5725.45703125, "logits/rejected": 5392.0048828125, "logps/chosen": -429.4095153808594, "logps/rejected": -524.0256958007812, "loss": 0.4983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4120854139328003, "rewards/margins": 1.0350888967514038, "rewards/rejected": -2.447174549102783, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 52.13481785442569, "learning_rate": 3.2664565359716536e-07, "logits/chosen": 5756.67041015625, "logits/rejected": 4672.0849609375, "logps/chosen": -459.2340393066406, "logps/rejected": -534.3760375976562, "loss": 0.4838, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9478585720062256, "rewards/margins": 1.086380124092102, "rewards/rejected": -3.034238576889038, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 67.63237467819759, "learning_rate": 3.222841923260869e-07, "logits/chosen": 5340.1484375, "logits/rejected": 4598.82177734375, "logps/chosen": -484.25640869140625, "logps/rejected": -572.4221801757812, "loss": 0.4754, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.19539213180542, "rewards/margins": 1.0763537883758545, "rewards/rejected": -3.2717461585998535, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 83.86420244213592, "learning_rate": 3.1789858872195887e-07, "logits/chosen": 6498.91650390625, "logits/rejected": 5262.67919921875, "logps/chosen": -523.2308349609375, "logps/rejected": -602.5567626953125, "loss": 0.4791, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.1959385871887207, "rewards/margins": 1.0925599336624146, "rewards/rejected": -3.288498640060425, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 36.7937222415298, "learning_rate": 3.1349030754075937e-07, "logits/chosen": 5431.06005859375, "logits/rejected": 4285.5322265625, "logps/chosen": -431.21502685546875, "logps/rejected": -537.7532958984375, "loss": 0.5054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7794075012207031, "rewards/margins": 1.3082810640335083, "rewards/rejected": -3.087688446044922, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 55.18035438094623, "learning_rate": 3.090608211125931e-07, "logits/chosen": 5392.5185546875, "logits/rejected": 4608.42236328125, "logps/chosen": -412.7171325683594, "logps/rejected": -508.71490478515625, "loss": 0.4741, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6127049922943115, "rewards/margins": 1.1365652084350586, "rewards/rejected": -2.749270439147949, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 44.58108503513362, "learning_rate": 3.0461160884994487e-07, "logits/chosen": 5840.9501953125, "logits/rejected": 5145.94580078125, "logps/chosen": -455.33843994140625, "logps/rejected": -515.6210327148438, "loss": 0.499, "rewards/accuracies": 0.75, "rewards/chosen": -1.8489354848861694, "rewards/margins": 0.7887415885925293, "rewards/rejected": -2.637676954269409, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 57.73411435833942, "learning_rate": 3.001441567535681e-07, "logits/chosen": 6431.67626953125, "logits/rejected": 5249.001953125, "logps/chosen": -440.92095947265625, "logps/rejected": -529.418701171875, "loss": 0.4821, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4907338619232178, "rewards/margins": 1.0362895727157593, "rewards/rejected": -2.5270237922668457, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 59.18958027082939, "learning_rate": 2.956599569161724e-07, "logits/chosen": 5414.20458984375, "logits/rejected": 4187.3544921875, "logps/chosen": -389.37335205078125, "logps/rejected": -450.3434143066406, "loss": 0.5052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5387694835662842, "rewards/margins": 0.6591954231262207, "rewards/rejected": -2.197964906692505, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 42.563050302084065, "learning_rate": 2.91160507024077e-07, "logits/chosen": 5768.6162109375, "logits/rejected": 4807.8056640625, "logps/chosen": -413.04205322265625, "logps/rejected": -478.17559814453125, "loss": 0.5195, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.485231637954712, "rewards/margins": 0.8964517712593079, "rewards/rejected": -2.381683826446533, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 33.46251118490837, "learning_rate": 2.866473098569953e-07, "logits/chosen": 5825.630859375, "logits/rejected": 4860.0419921875, "logps/chosen": -423.99835205078125, "logps/rejected": -493.2699279785156, "loss": 0.4919, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3664392232894897, "rewards/margins": 0.9449175596237183, "rewards/rejected": -2.311356782913208, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 32.803451972147634, "learning_rate": 2.8212187278611905e-07, "logits/chosen": 5577.02197265625, "logits/rejected": 4832.1171875, "logps/chosen": -447.1991271972656, "logps/rejected": -527.7049560546875, "loss": 0.4697, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5957579612731934, "rewards/margins": 1.033022165298462, "rewards/rejected": -2.628779888153076, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 64.00615935239229, "learning_rate": 2.775857072706684e-07, "logits/chosen": 6070.87353515625, "logits/rejected": 4420.8466796875, "logps/chosen": -438.16644287109375, "logps/rejected": -482.08465576171875, "loss": 0.5398, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4822652339935303, "rewards/margins": 1.0824673175811768, "rewards/rejected": -2.564732551574707, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 46.86520969500558, "learning_rate": 2.7304032835307667e-07, "logits/chosen": 6216.6162109375, "logits/rejected": 5469.23974609375, "logps/chosen": -451.27020263671875, "logps/rejected": -555.1627807617188, "loss": 0.5136, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7239328622817993, "rewards/margins": 0.8704110383987427, "rewards/rejected": -2.594343662261963, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 49.63995578440868, "learning_rate": 2.6848725415297884e-07, "logits/chosen": 6084.1416015625, "logits/rejected": 5248.6669921875, "logps/chosen": -470.7705078125, "logps/rejected": -499.703857421875, "loss": 0.5062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.663577675819397, "rewards/margins": 0.8588649034500122, "rewards/rejected": -2.522442579269409, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 64.97333091332597, "learning_rate": 2.6392800536017183e-07, "logits/chosen": 5355.8505859375, "logits/rejected": 5051.5439453125, "logps/chosen": -488.87176513671875, "logps/rejected": -567.3258666992188, "loss": 0.4809, "rewards/accuracies": 0.75, "rewards/chosen": -2.0089640617370605, "rewards/margins": 0.9219423532485962, "rewards/rejected": -2.930906295776367, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 46.67126991156967, "learning_rate": 2.59364104726716e-07, "logits/chosen": 5887.8046875, "logits/rejected": 5121.62890625, "logps/chosen": -468.025146484375, "logps/rejected": -593.8919677734375, "loss": 0.4498, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7689498662948608, "rewards/margins": 1.3238210678100586, "rewards/rejected": -3.092771053314209, "step": 1030 }, { "epoch": 0.5443601151531012, "grad_norm": 61.29392397902382, "learning_rate": 2.547970765583491e-07, "logits/chosen": 5582.82763671875, "logits/rejected": 4876.9638671875, "logps/chosen": -430.79541015625, "logps/rejected": -515.9193115234375, "loss": 0.5278, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.700372338294983, "rewards/margins": 1.078627347946167, "rewards/rejected": -2.7789998054504395, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 41.39413522028797, "learning_rate": 2.502284462053799e-07, "logits/chosen": 6156.40283203125, "logits/rejected": 5941.8779296875, "logps/chosen": -476.8907165527344, "logps/rejected": -558.0145874023438, "loss": 0.508, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9446933269500732, "rewards/margins": 0.9444707632064819, "rewards/rejected": -2.8891639709472656, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 69.99679860346889, "learning_rate": 2.4565973955323374e-07, "logits/chosen": 5784.0166015625, "logits/rejected": 4964.3076171875, "logps/chosen": -465.17950439453125, "logps/rejected": -525.5794067382812, "loss": 0.5074, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7338411808013916, "rewards/margins": 1.0747594833374023, "rewards/rejected": -2.808600902557373, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 39.31688734230333, "learning_rate": 2.410924825128195e-07, "logits/chosen": 5454.869140625, "logits/rejected": 5118.14306640625, "logps/chosen": -430.4056701660156, "logps/rejected": -529.0426025390625, "loss": 0.4646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6206849813461304, "rewards/margins": 0.8924548029899597, "rewards/rejected": -2.5131397247314453, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 52.77189181501683, "learning_rate": 2.365282005108875e-07, "logits/chosen": 5776.9716796875, "logits/rejected": 4836.4609375, "logps/chosen": -423.0970153808594, "logps/rejected": -519.367431640625, "loss": 0.4835, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6719558238983154, "rewards/margins": 1.021319031715393, "rewards/rejected": -2.693274974822998, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 70.82459551115117, "learning_rate": 2.319684179805491e-07, "logits/chosen": 5663.40283203125, "logits/rejected": 4413.01171875, "logps/chosen": -462.0267028808594, "logps/rejected": -538.9208984375, "loss": 0.5123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7139892578125, "rewards/margins": 1.3090190887451172, "rewards/rejected": -3.0230085849761963, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 33.8748559668582, "learning_rate": 2.2741465785212902e-07, "logits/chosen": 5301.47216796875, "logits/rejected": 3999.432373046875, "logps/chosen": -420.2606506347656, "logps/rejected": -517.6099243164062, "loss": 0.416, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6415036916732788, "rewards/margins": 1.3193124532699585, "rewards/rejected": -2.9608161449432373, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 50.83769917179278, "learning_rate": 2.2286844104451843e-07, "logits/chosen": 5784.0478515625, "logits/rejected": 5007.18017578125, "logps/chosen": -490.86505126953125, "logps/rejected": -576.6304931640625, "loss": 0.5079, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.056772470474243, "rewards/margins": 1.0176784992218018, "rewards/rejected": -3.074450969696045, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 52.001448501596336, "learning_rate": 2.183312859572008e-07, "logits/chosen": 6639.57177734375, "logits/rejected": 5511.9033203125, "logps/chosen": -482.6524963378906, "logps/rejected": -556.4099731445312, "loss": 0.528, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9167563915252686, "rewards/margins": 1.0493156909942627, "rewards/rejected": -2.9660720825195312, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 44.25761412679498, "learning_rate": 2.138047079631184e-07, "logits/chosen": 5394.453125, "logits/rejected": 5371.2919921875, "logps/chosen": -488.195068359375, "logps/rejected": -600.7262573242188, "loss": 0.4819, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.340132474899292, "rewards/margins": 0.8797481656074524, "rewards/rejected": -3.2198805809020996, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 55.393528463173325, "learning_rate": 2.0929021890255068e-07, "logits/chosen": 6330.7919921875, "logits/rejected": 5427.1728515625, "logps/chosen": -502.2682189941406, "logps/rejected": -618.1027221679688, "loss": 0.5048, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.034196615219116, "rewards/margins": 1.0336921215057373, "rewards/rejected": -3.0678887367248535, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 50.46681050763751, "learning_rate": 2.0478932657817102e-07, "logits/chosen": 5141.21923828125, "logits/rejected": 4884.60009765625, "logps/chosen": -445.36236572265625, "logps/rejected": -531.31787109375, "loss": 0.5092, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.968653678894043, "rewards/margins": 0.8347317576408386, "rewards/rejected": -2.8033852577209473, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 70.3269151760612, "learning_rate": 2.0030353425145374e-07, "logits/chosen": 7235.20947265625, "logits/rejected": 6419.9287109375, "logps/chosen": -583.9832153320312, "logps/rejected": -640.3153076171875, "loss": 0.561, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1977429389953613, "rewards/margins": 0.6988611221313477, "rewards/rejected": -2.896604061126709, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 42.0515600415498, "learning_rate": 1.9583434014059635e-07, "logits/chosen": 5870.0048828125, "logits/rejected": 4960.2783203125, "logps/chosen": -460.8169860839844, "logps/rejected": -575.4650268554688, "loss": 0.4764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8832632303237915, "rewards/margins": 1.0969445705413818, "rewards/rejected": -2.9802074432373047, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 44.17752357905191, "learning_rate": 1.9138323692012733e-07, "logits/chosen": 5152.05322265625, "logits/rejected": 4995.10302734375, "logps/chosen": -465.43109130859375, "logps/rejected": -523.4608154296875, "loss": 0.4744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.928765058517456, "rewards/margins": 0.7397549748420715, "rewards/rejected": -2.668519973754883, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 61.75617287900426, "learning_rate": 1.8695171122236442e-07, "logits/chosen": 5305.31787109375, "logits/rejected": 5259.71630859375, "logps/chosen": -420.59771728515625, "logps/rejected": -538.1131591796875, "loss": 0.4765, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6423835754394531, "rewards/margins": 0.9299103021621704, "rewards/rejected": -2.572293996810913, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 70.3147665430388, "learning_rate": 1.8254124314089223e-07, "logits/chosen": 5743.0556640625, "logits/rejected": 5161.66015625, "logps/chosen": -456.74395751953125, "logps/rejected": -543.2876586914062, "loss": 0.5026, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.72748601436615, "rewards/margins": 1.018004059791565, "rewards/rejected": -2.745490074157715, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 55.2814901627422, "learning_rate": 1.7815330573622205e-07, "logits/chosen": 5943.31103515625, "logits/rejected": 5791.52685546875, "logps/chosen": -441.3788146972656, "logps/rejected": -568.0220336914062, "loss": 0.4927, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6020936965942383, "rewards/margins": 1.0032509565353394, "rewards/rejected": -2.605344533920288, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 36.81078378897197, "learning_rate": 1.7378936454380274e-07, "logits/chosen": 5846.7255859375, "logits/rejected": 4917.35595703125, "logps/chosen": -435.710693359375, "logps/rejected": -514.1156616210938, "loss": 0.4601, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7096188068389893, "rewards/margins": 1.0015608072280884, "rewards/rejected": -2.711179494857788, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 67.37657075563799, "learning_rate": 1.694508770845427e-07, "logits/chosen": 6779.4072265625, "logits/rejected": 5683.87646484375, "logps/chosen": -540.6749267578125, "logps/rejected": -585.6129760742188, "loss": 0.503, "rewards/accuracies": 0.75, "rewards/chosen": -2.0458195209503174, "rewards/margins": 0.9217261075973511, "rewards/rejected": -2.967545747756958, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 56.03321414275303, "learning_rate": 1.651392923780105e-07, "logits/chosen": 6311.9423828125, "logits/rejected": 5025.9326171875, "logps/chosen": -482.51629638671875, "logps/rejected": -529.3140869140625, "loss": 0.4719, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.012092113494873, "rewards/margins": 0.8922163248062134, "rewards/rejected": -2.904308557510376, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 51.240934983951576, "learning_rate": 1.6085605045847367e-07, "logits/chosen": 5766.875, "logits/rejected": 4679.556640625, "logps/chosen": -484.06036376953125, "logps/rejected": -574.91943359375, "loss": 0.503, "rewards/accuracies": 0.625, "rewards/chosen": -2.115018367767334, "rewards/margins": 0.8909432291984558, "rewards/rejected": -3.0059614181518555, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 44.83155429296669, "learning_rate": 1.5660258189393944e-07, "logits/chosen": 6001.896484375, "logits/rejected": 4623.4814453125, "logps/chosen": -481.5863342285156, "logps/rejected": -554.1494140625, "loss": 0.4855, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8362839221954346, "rewards/margins": 1.0993396043777466, "rewards/rejected": -2.9356234073638916, "step": 1260 }, { "epoch": 0.6647474483119602, "grad_norm": 58.86551962180224, "learning_rate": 1.5238030730835577e-07, "logits/chosen": 5332.787109375, "logits/rejected": 5467.3818359375, "logps/chosen": -398.0010986328125, "logps/rejected": -541.9666748046875, "loss": 0.4572, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5968772172927856, "rewards/margins": 1.336118221282959, "rewards/rejected": -2.932995319366455, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 53.78027682064507, "learning_rate": 1.4819063690713564e-07, "logits/chosen": 6006.59130859375, "logits/rejected": 4786.06982421875, "logps/chosen": -449.716796875, "logps/rejected": -553.7860107421875, "loss": 0.4604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7109276056289673, "rewards/margins": 1.27021062374115, "rewards/rejected": -2.981138229370117, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 69.08546288730311, "learning_rate": 1.4403497000615883e-07, "logits/chosen": 5749.35546875, "logits/rejected": 5006.19580078125, "logps/chosen": -513.867431640625, "logps/rejected": -558.72509765625, "loss": 0.5407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0234084129333496, "rewards/margins": 1.0361706018447876, "rewards/rejected": -3.0595791339874268, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 94.32166607912914, "learning_rate": 1.3991469456441272e-07, "logits/chosen": 5560.58642578125, "logits/rejected": 5246.12646484375, "logps/chosen": -432.19964599609375, "logps/rejected": -541.0390625, "loss": 0.4939, "rewards/accuracies": 0.8125, "rewards/chosen": -1.547202706336975, "rewards/margins": 1.0479974746704102, "rewards/rejected": -2.5952000617980957, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 38.56424538056888, "learning_rate": 1.358311867204244e-07, "logits/chosen": 4675.93896484375, "logits/rejected": 4583.65625, "logps/chosen": -363.8262634277344, "logps/rejected": -492.0935974121094, "loss": 0.4581, "rewards/accuracies": 0.8125, "rewards/chosen": -1.402822732925415, "rewards/margins": 1.1607930660247803, "rewards/rejected": -2.5636157989501953, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 70.48363118369693, "learning_rate": 1.3178581033264216e-07, "logits/chosen": 6256.1904296875, "logits/rejected": 5279.4775390625, "logps/chosen": -481.8662109375, "logps/rejected": -563.3885498046875, "loss": 0.5067, "rewards/accuracies": 0.75, "rewards/chosen": -1.8727912902832031, "rewards/margins": 0.8880994915962219, "rewards/rejected": -2.7608909606933594, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 53.61690545001972, "learning_rate": 1.2777991652391757e-07, "logits/chosen": 5354.2919921875, "logits/rejected": 3956.951904296875, "logps/chosen": -457.558837890625, "logps/rejected": -519.9451904296875, "loss": 0.5103, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7837330102920532, "rewards/margins": 1.186250925064087, "rewards/rejected": -2.9699840545654297, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 85.94134232920602, "learning_rate": 1.2381484323024178e-07, "logits/chosen": 6099.58154296875, "logits/rejected": 5222.3310546875, "logps/chosen": -465.1321716308594, "logps/rejected": -540.4118041992188, "loss": 0.4825, "rewards/accuracies": 0.75, "rewards/chosen": -1.8389440774917603, "rewards/margins": 1.0195863246917725, "rewards/rejected": -2.8585305213928223, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 59.890970835357095, "learning_rate": 1.1989191475388516e-07, "logits/chosen": 5064.47119140625, "logits/rejected": 4606.1064453125, "logps/chosen": -396.0245056152344, "logps/rejected": -529.9171752929688, "loss": 0.4919, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6857776641845703, "rewards/margins": 1.1661508083343506, "rewards/rejected": -2.851928234100342, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 55.025653930575544, "learning_rate": 1.1601244132109179e-07, "logits/chosen": 5044.70556640625, "logits/rejected": 4524.1494140625, "logps/chosen": -439.0829162597656, "logps/rejected": -536.5711669921875, "loss": 0.4973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9668521881103516, "rewards/margins": 0.9730531573295593, "rewards/rejected": -2.9399051666259766, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 50.42441473566833, "learning_rate": 1.1217771864447395e-07, "logits/chosen": 5791.28662109375, "logits/rejected": 4876.34228515625, "logps/chosen": -454.65106201171875, "logps/rejected": -576.1729736328125, "loss": 0.5116, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.693499207496643, "rewards/margins": 1.1748238801956177, "rewards/rejected": -2.8683230876922607, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 45.59860401020204, "learning_rate": 1.0838902749025499e-07, "logits/chosen": 7000.99462890625, "logits/rejected": 5573.9833984375, "logps/chosen": -491.8744201660156, "logps/rejected": -530.9385986328125, "loss": 0.5155, "rewards/accuracies": 0.75, "rewards/chosen": -1.6647865772247314, "rewards/margins": 0.8948407173156738, "rewards/rejected": -2.559627056121826, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 68.88855255766244, "learning_rate": 1.0464763325050358e-07, "logits/chosen": 5260.3330078125, "logits/rejected": 4669.32958984375, "logps/chosen": -447.2159118652344, "logps/rejected": -515.1805419921875, "loss": 0.4911, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6555016040802002, "rewards/margins": 0.9187766909599304, "rewards/rejected": -2.5742781162261963, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 53.849978458276446, "learning_rate": 1.0095478552050346e-07, "logits/chosen": 6265.92041015625, "logits/rejected": 4139.224609375, "logps/chosen": -451.6341857910156, "logps/rejected": -501.037353515625, "loss": 0.4732, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4033886194229126, "rewards/margins": 1.0941402912139893, "rewards/rejected": -2.4975287914276123, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 71.49005742239994, "learning_rate": 9.731171768139806e-08, "logits/chosen": 5820.0, "logits/rejected": 4671.0771484375, "logps/chosen": -401.1160583496094, "logps/rejected": -485.373291015625, "loss": 0.493, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3254698514938354, "rewards/margins": 1.1134282350540161, "rewards/rejected": -2.4388980865478516, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 61.38384481626538, "learning_rate": 9.37196464882522e-08, "logits/chosen": 5571.05078125, "logits/rejected": 5003.7001953125, "logps/chosen": -405.25677490234375, "logps/rejected": -501.10931396484375, "loss": 0.5245, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5185030698776245, "rewards/margins": 0.9954677820205688, "rewards/rejected": -2.5139708518981934, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 56.09557615458594, "learning_rate": 9.017977166366444e-08, "logits/chosen": 5765.5498046875, "logits/rejected": 5031.99169921875, "logps/chosen": -432.7581481933594, "logps/rejected": -531.5502319335938, "loss": 0.4958, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4822924137115479, "rewards/margins": 1.0369850397109985, "rewards/rejected": -2.519277572631836, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 43.93135285453503, "learning_rate": 8.669327549707095e-08, "logits/chosen": 5835.9599609375, "logits/rejected": 4902.2099609375, "logps/chosen": -467.08721923828125, "logps/rejected": -531.9814453125, "loss": 0.4596, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.62697434425354, "rewards/margins": 1.0709320306777954, "rewards/rejected": -2.697906494140625, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 45.45268260539036, "learning_rate": 8.326132244986931e-08, "logits/chosen": 5231.73681640625, "logits/rejected": 4425.86572265625, "logps/chosen": -425.63995361328125, "logps/rejected": -521.69140625, "loss": 0.4698, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6223514080047607, "rewards/margins": 1.2424169778823853, "rewards/rejected": -2.8647682666778564, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 44.10137407870052, "learning_rate": 7.988505876649862e-08, "logits/chosen": 5436.15673828125, "logits/rejected": 4060.813232421875, "logps/chosen": -442.54400634765625, "logps/rejected": -547.1506958007812, "loss": 0.4985, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7412408590316772, "rewards/margins": 1.1485346555709839, "rewards/rejected": -2.889775514602661, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 34.2003751407198, "learning_rate": 7.656561209160248e-08, "logits/chosen": 5884.69921875, "logits/rejected": 4979.0634765625, "logps/chosen": -468.45892333984375, "logps/rejected": -524.2210693359375, "loss": 0.4535, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6417407989501953, "rewards/margins": 1.0903558731079102, "rewards/rejected": -2.7320969104766846, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 62.328404682337315, "learning_rate": 7.330409109340562e-08, "logits/chosen": 5976.05615234375, "logits/rejected": 5185.83984375, "logps/chosen": -475.17303466796875, "logps/rejected": -552.3870849609375, "loss": 0.4602, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5611035823822021, "rewards/margins": 1.1281805038452148, "rewards/rejected": -2.689283847808838, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 62.90207468441792, "learning_rate": 7.010158509342681e-08, "logits/chosen": 6559.21875, "logits/rejected": 4668.7568359375, "logps/chosen": -461.5740661621094, "logps/rejected": -515.0909423828125, "loss": 0.4662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6185649633407593, "rewards/margins": 1.0846556425094604, "rewards/rejected": -2.703220844268799, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 52.65265815732916, "learning_rate": 6.695916370265527e-08, "logits/chosen": 5316.6923828125, "logits/rejected": 4581.3759765625, "logps/chosen": -423.22406005859375, "logps/rejected": -458.8834533691406, "loss": 0.5321, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6553497314453125, "rewards/margins": 0.7974721789360046, "rewards/rejected": -2.452821731567383, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 39.19744570522527, "learning_rate": 6.387787646430853e-08, "logits/chosen": 6557.60546875, "logits/rejected": 5875.27685546875, "logps/chosen": -476.264404296875, "logps/rejected": -544.9144897460938, "loss": 0.5219, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7729346752166748, "rewards/margins": 0.808856189250946, "rewards/rejected": -2.5817906856536865, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 53.35492473376416, "learning_rate": 6.0858752503294e-08, "logits/chosen": 5201.9482421875, "logits/rejected": 4884.1943359375, "logps/chosen": -451.56707763671875, "logps/rejected": -502.1494140625, "loss": 0.4745, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6913883686065674, "rewards/margins": 0.7468551397323608, "rewards/rejected": -2.438243865966797, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 37.5730139468933, "learning_rate": 5.7902800182489385e-08, "logits/chosen": 5386.0400390625, "logits/rejected": 5056.7646484375, "logps/chosen": -412.1158752441406, "logps/rejected": -504.4266052246094, "loss": 0.4652, "rewards/accuracies": 0.8125, "rewards/chosen": -1.577048659324646, "rewards/margins": 1.137432336807251, "rewards/rejected": -2.7144808769226074, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 47.25236502782169, "learning_rate": 5.5011006765957604e-08, "logits/chosen": 6559.1689453125, "logits/rejected": 5847.15869140625, "logps/chosen": -477.906005859375, "logps/rejected": -593.715087890625, "loss": 0.4813, "rewards/accuracies": 0.75, "rewards/chosen": -1.720546007156372, "rewards/margins": 1.0101532936096191, "rewards/rejected": -2.7306995391845703, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 64.5306158701743, "learning_rate": 5.218433808920883e-08, "logits/chosen": 5732.14404296875, "logits/rejected": 5182.62109375, "logps/chosen": -454.5556640625, "logps/rejected": -543.537841796875, "loss": 0.4659, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6730997562408447, "rewards/margins": 0.9740368723869324, "rewards/rejected": -2.6471364498138428, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 48.449579077266975, "learning_rate": 4.942373823661927e-08, "logits/chosen": 6836.04052734375, "logits/rejected": 5074.99169921875, "logps/chosen": -489.0601501464844, "logps/rejected": -550.8364868164062, "loss": 0.4693, "rewards/accuracies": 0.75, "rewards/chosen": -1.6957050561904907, "rewards/margins": 1.1460716724395752, "rewards/rejected": -2.8417768478393555, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 47.436165301548996, "learning_rate": 4.6730129226114354e-08, "logits/chosen": 5166.6318359375, "logits/rejected": 4734.98779296875, "logps/chosen": -445.92669677734375, "logps/rejected": -492.08770751953125, "loss": 0.4721, "rewards/accuracies": 0.75, "rewards/chosen": -1.9005409479141235, "rewards/margins": 0.8562926054000854, "rewards/rejected": -2.756833553314209, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 46.42456774742332, "learning_rate": 4.41044107012227e-08, "logits/chosen": 6607.1123046875, "logits/rejected": 5159.0517578125, "logps/chosen": -491.15869140625, "logps/rejected": -544.934814453125, "loss": 0.4803, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5394710302352905, "rewards/margins": 1.0749366283416748, "rewards/rejected": -2.614407777786255, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 115.21349764331502, "learning_rate": 4.1547459630601966e-08, "logits/chosen": 5747.3759765625, "logits/rejected": 5143.2470703125, "logps/chosen": -473.9146423339844, "logps/rejected": -541.2026977539062, "loss": 0.5147, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8548256158828735, "rewards/margins": 0.8329262733459473, "rewards/rejected": -2.6877522468566895, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 46.00913012878008, "learning_rate": 3.9060130015138857e-08, "logits/chosen": 5326.37109375, "logits/rejected": 4686.98291015625, "logps/chosen": -470.4459533691406, "logps/rejected": -547.7535400390625, "loss": 0.496, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8834571838378906, "rewards/margins": 1.0830243825912476, "rewards/rejected": -2.9664816856384277, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 35.690049353036706, "learning_rate": 3.664325260271953e-08, "logits/chosen": 6072.751953125, "logits/rejected": 5098.45068359375, "logps/chosen": -512.0150146484375, "logps/rejected": -555.8195190429688, "loss": 0.4597, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9424760341644287, "rewards/margins": 0.8288620114326477, "rewards/rejected": -2.7713379859924316, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 46.255307725799796, "learning_rate": 3.429763461076676e-08, "logits/chosen": 5927.7353515625, "logits/rejected": 5079.93212890625, "logps/chosen": -456.045166015625, "logps/rejected": -560.6665649414062, "loss": 0.4653, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6067283153533936, "rewards/margins": 1.0832823514938354, "rewards/rejected": -2.6900105476379395, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 46.35436279492669, "learning_rate": 3.202405945663555e-08, "logits/chosen": 5855.36962890625, "logits/rejected": 3933.013671875, "logps/chosen": -460.8177185058594, "logps/rejected": -482.4466247558594, "loss": 0.489, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8878599405288696, "rewards/margins": 0.8325251340866089, "rewards/rejected": -2.7203853130340576, "step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 54.73002711133669, "learning_rate": 2.9823286495958556e-08, "logits/chosen": 4859.734375, "logits/rejected": 5366.44775390625, "logps/chosen": -439.7100524902344, "logps/rejected": -632.7385864257812, "loss": 0.4796, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9741220474243164, "rewards/margins": 0.9164485931396484, "rewards/rejected": -2.890570640563965, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 43.01757031434862, "learning_rate": 2.769605076902695e-08, "logits/chosen": 6194.2392578125, "logits/rejected": 5666.4248046875, "logps/chosen": -463.29083251953125, "logps/rejected": -570.9111328125, "loss": 0.4745, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7209036350250244, "rewards/margins": 0.9603285789489746, "rewards/rejected": -2.68123197555542, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 41.534018608207546, "learning_rate": 2.5643062755293403e-08, "logits/chosen": 5478.0517578125, "logits/rejected": 4659.22412109375, "logps/chosen": -460.326416015625, "logps/rejected": -499.4408264160156, "loss": 0.4895, "rewards/accuracies": 0.75, "rewards/chosen": -1.8074705600738525, "rewards/margins": 0.8185604214668274, "rewards/rejected": -2.626030921936035, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 60.86434476583357, "learning_rate": 2.366500813607733e-08, "logits/chosen": 6139.50146484375, "logits/rejected": 4718.34619140625, "logps/chosen": -445.7151794433594, "logps/rejected": -558.4569091796875, "loss": 0.4855, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6716018915176392, "rewards/margins": 1.3165209293365479, "rewards/rejected": -2.9881229400634766, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 75.4275871069272, "learning_rate": 2.176254756555329e-08, "logits/chosen": 6512.9912109375, "logits/rejected": 5728.6318359375, "logps/chosen": -498.4745178222656, "logps/rejected": -584.0284423828125, "loss": 0.4563, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7989261150360107, "rewards/margins": 1.1345270872116089, "rewards/rejected": -2.93345308303833, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 47.66108370078102, "learning_rate": 1.9936316450097468e-08, "logits/chosen": 5179.87646484375, "logits/rejected": 4646.3017578125, "logps/chosen": -436.65118408203125, "logps/rejected": -480.0382385253906, "loss": 0.5015, "rewards/accuracies": 0.625, "rewards/chosen": -1.8090522289276123, "rewards/margins": 0.7128003835678101, "rewards/rejected": -2.521852493286133, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 49.60045372554933, "learning_rate": 1.8186924736067477e-08, "logits/chosen": 5840.21240234375, "logits/rejected": 4393.1689453125, "logps/chosen": -455.3392639160156, "logps/rejected": -550.1962280273438, "loss": 0.4651, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6438064575195312, "rewards/margins": 1.2535064220428467, "rewards/rejected": -2.8973135948181152, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 43.86855453315871, "learning_rate": 1.651495670608488e-08, "logits/chosen": 6719.01708984375, "logits/rejected": 5168.751953125, "logps/chosen": -477.3172912597656, "logps/rejected": -558.49560546875, "loss": 0.4331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7377887964248657, "rewards/margins": 1.2401338815689087, "rewards/rejected": -2.9779226779937744, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 42.225143416295225, "learning_rate": 1.4920970783889737e-08, "logits/chosen": 6293.6005859375, "logits/rejected": 4633.51806640625, "logps/chosen": -493.03509521484375, "logps/rejected": -573.3130493164062, "loss": 0.4554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9286737442016602, "rewards/margins": 0.9963156580924988, "rewards/rejected": -2.9249894618988037, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 33.92052291437684, "learning_rate": 1.340549934783164e-08, "logits/chosen": 6018.5830078125, "logits/rejected": 5687.3076171875, "logps/chosen": -481.969970703125, "logps/rejected": -572.0182495117188, "loss": 0.4615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8394489288330078, "rewards/margins": 0.924017608165741, "rewards/rejected": -2.7634665966033936, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 44.157007984318106, "learning_rate": 1.1969048553059608e-08, "logits/chosen": 5706.2099609375, "logits/rejected": 4860.96533203125, "logps/chosen": -412.661865234375, "logps/rejected": -496.70074462890625, "loss": 0.4884, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6415414810180664, "rewards/margins": 0.9479316473007202, "rewards/rejected": -2.589473009109497, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 54.00838584546709, "learning_rate": 1.06120981624703e-08, "logits/chosen": 5393.56005859375, "logits/rejected": 5689.6533203125, "logps/chosen": -444.2010192871094, "logps/rejected": -582.6871948242188, "loss": 0.4802, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7044061422348022, "rewards/margins": 1.1069849729537964, "rewards/rejected": -2.8113913536071777, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 61.132927463591344, "learning_rate": 9.335101386471284e-09, "logits/chosen": 6236.1591796875, "logits/rejected": 5493.9794921875, "logps/chosen": -478.6182556152344, "logps/rejected": -549.0506591796875, "loss": 0.4714, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8473739624023438, "rewards/margins": 0.9510253667831421, "rewards/rejected": -2.7983996868133545, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 63.02028903638583, "learning_rate": 8.138484731612273e-09, "logits/chosen": 5896.7861328125, "logits/rejected": 4876.11328125, "logps/chosen": -457.31500244140625, "logps/rejected": -578.9633178710938, "loss": 0.4745, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.785424828529358, "rewards/margins": 1.2331361770629883, "rewards/rejected": -3.0185611248016357, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 45.2465032405233, "learning_rate": 7.0226478581355e-09, "logits/chosen": 5961.98974609375, "logits/rejected": 5157.7978515625, "logps/chosen": -488.4525451660156, "logps/rejected": -565.0822143554688, "loss": 0.5262, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.037466287612915, "rewards/margins": 0.9263374209403992, "rewards/rejected": -2.963803768157959, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 60.617493519779835, "learning_rate": 5.987963446492383e-09, "logits/chosen": 6006.6962890625, "logits/rejected": 5307.9892578125, "logps/chosen": -438.90753173828125, "logps/rejected": -521.6585693359375, "loss": 0.41, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6116359233856201, "rewards/margins": 1.0634849071502686, "rewards/rejected": -2.6751208305358887, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 59.87948034425728, "learning_rate": 5.0347770728713935e-09, "logits/chosen": 5996.91455078125, "logits/rejected": 4601.3720703125, "logps/chosen": -487.2227478027344, "logps/rejected": -516.2828979492188, "loss": 0.4885, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6710405349731445, "rewards/margins": 1.0333257913589478, "rewards/rejected": -2.7043662071228027, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 43.04367777321277, "learning_rate": 4.1634070937782424e-09, "logits/chosen": 5986.92822265625, "logits/rejected": 5393.41259765625, "logps/chosen": -493.161376953125, "logps/rejected": -621.3470458984375, "loss": 0.5037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9648311138153076, "rewards/margins": 1.164574146270752, "rewards/rejected": -3.1294054985046387, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 47.169800736825145, "learning_rate": 3.3741445397075797e-09, "logits/chosen": 6257.791015625, "logits/rejected": 5275.78759765625, "logps/chosen": -488.2510681152344, "logps/rejected": -594.5584106445312, "loss": 0.5021, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.759234070777893, "rewards/margins": 1.2129390239715576, "rewards/rejected": -2.9721732139587402, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 68.54918801823914, "learning_rate": 2.667253017941018e-09, "logits/chosen": 6221.16015625, "logits/rejected": 4841.1064453125, "logps/chosen": -486.86309814453125, "logps/rejected": -553.6325073242188, "loss": 0.4657, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8351036310195923, "rewards/margins": 0.9869117736816406, "rewards/rejected": -2.8220152854919434, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 81.5491614635087, "learning_rate": 2.0429686245045097e-09, "logits/chosen": 6046.38037109375, "logits/rejected": 4651.4619140625, "logps/chosen": -524.8812255859375, "logps/rejected": -546.822265625, "loss": 0.519, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8524370193481445, "rewards/margins": 0.9590626955032349, "rewards/rejected": -2.811499834060669, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 44.064739740774314, "learning_rate": 1.5014998653141708e-09, "logits/chosen": 5743.37060546875, "logits/rejected": 4843.56884765625, "logps/chosen": -482.69586181640625, "logps/rejected": -556.3638916015625, "loss": 0.4936, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.790981650352478, "rewards/margins": 1.2408983707427979, "rewards/rejected": -3.0318799018859863, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 42.50973909426173, "learning_rate": 1.0430275865371263e-09, "logits/chosen": 5961.98095703125, "logits/rejected": 4932.6357421875, "logps/chosen": -443.4388732910156, "logps/rejected": -550.4918212890625, "loss": 0.455, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.813359260559082, "rewards/margins": 1.199947476387024, "rewards/rejected": -3.0133066177368164, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 56.66273501138923, "learning_rate": 6.677049141901314e-10, "logits/chosen": 4880.40576171875, "logits/rejected": 4685.27197265625, "logps/chosen": -416.60089111328125, "logps/rejected": -539.0184936523438, "loss": 0.4833, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7454957962036133, "rewards/margins": 1.0450434684753418, "rewards/rejected": -2.790539264678955, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 38.32090090248521, "learning_rate": 3.7565720299687077e-10, "logits/chosen": 6260.2158203125, "logits/rejected": 5280.84912109375, "logps/chosen": -494.6107482910156, "logps/rejected": -552.1981201171875, "loss": 0.4459, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7900596857070923, "rewards/margins": 1.042823076248169, "rewards/rejected": -2.8328824043273926, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 64.27508021348248, "learning_rate": 1.6698199452053197e-10, "logits/chosen": 4530.2236328125, "logits/rejected": 4498.3388671875, "logps/chosen": -430.72576904296875, "logps/rejected": -518.1484985351562, "loss": 0.4666, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7727760076522827, "rewards/margins": 0.8958579301834106, "rewards/rejected": -2.6686339378356934, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 70.24208653050465, "learning_rate": 4.174898458556009e-11, "logits/chosen": 6094.0966796875, "logits/rejected": 4274.994140625, "logps/chosen": -462.11920166015625, "logps/rejected": -517.6158447265625, "loss": 0.4826, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8968086242675781, "rewards/margins": 0.9447473287582397, "rewards/rejected": -2.8415558338165283, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 90.73252809696429, "learning_rate": 0.0, "logits/chosen": 6198.6337890625, "logits/rejected": 5036.7548828125, "logps/chosen": -491.354736328125, "logps/rejected": -579.1531982421875, "loss": 0.497, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9158384799957275, "rewards/margins": 1.0406345129013062, "rewards/rejected": -2.956472873687744, "step": 1910 }, { "epoch": 0.9997382884061764, "step": 1910, "total_flos": 0.0, "train_loss": 0.5203473493066758, "train_runtime": 16903.37, "train_samples_per_second": 3.617, "train_steps_per_second": 0.113 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }