{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 7066, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001415227851684121, "grad_norm": 6.096909352817652, "learning_rate": 1.4144271570014144e-10, "logits/chosen": -2.516268491744995, "logits/rejected": -2.9614040851593018, "logps/chosen": -31.97142219543457, "logps/rejected": -32.77480697631836, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001415227851684121, "grad_norm": 6.320550229507857, "learning_rate": 1.4144271570014142e-09, "logits/chosen": -2.745706081390381, "logits/rejected": -2.991027355194092, "logps/chosen": -37.94630813598633, "logps/rejected": -39.26482391357422, "loss": 0.6931, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -1.4975615158618893e-05, "rewards/margins": -0.00026786618400365114, "rewards/rejected": 0.00025289057521149516, "step": 10 }, { "epoch": 0.002830455703368242, "grad_norm": 6.14907834400214, "learning_rate": 2.8288543140028285e-09, "logits/chosen": -2.734786033630371, "logits/rejected": -2.991663932800293, "logps/chosen": -39.1810302734375, "logps/rejected": -42.79104232788086, "loss": 0.6931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00012463497114367783, "rewards/margins": 0.00046449207002297044, "rewards/rejected": -0.00033985706977546215, "step": 20 }, { "epoch": 0.0042456835550523635, "grad_norm": 6.486737381252642, "learning_rate": 4.243281471004243e-09, "logits/chosen": -2.7341270446777344, "logits/rejected": -2.9648079872131348, "logps/chosen": -38.61527633666992, "logps/rejected": -42.214595794677734, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00015709995932411402, "rewards/margins": 3.292049950687215e-05, "rewards/rejected": 0.00012417949619702995, "step": 30 }, { "epoch": 0.005660911406736484, "grad_norm": 7.273426068166052, "learning_rate": 5.657708628005657e-09, "logits/chosen": -2.7229883670806885, "logits/rejected": -3.0071825981140137, "logps/chosen": -39.46904373168945, "logps/rejected": -45.59062957763672, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00021416519302874804, "rewards/margins": 8.534002699889243e-05, "rewards/rejected": 0.0001288251660298556, "step": 40 }, { "epoch": 0.007076139258420606, "grad_norm": 6.24856269956626, "learning_rate": 7.072135785007072e-09, "logits/chosen": -2.768751382827759, "logits/rejected": -3.033128023147583, "logps/chosen": -39.83363342285156, "logps/rejected": -42.75846481323242, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005629544029943645, "rewards/margins": 0.000582108972594142, "rewards/rejected": -1.915456414280925e-05, "step": 50 }, { "epoch": 0.008491367110104727, "grad_norm": 6.516665868923772, "learning_rate": 8.486562942008486e-09, "logits/chosen": -2.7458739280700684, "logits/rejected": -3.0326054096221924, "logps/chosen": -41.784202575683594, "logps/rejected": -44.732303619384766, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0005014288472011685, "rewards/margins": 0.0007212483906187117, "rewards/rejected": -0.00021981952886562794, "step": 60 }, { "epoch": 0.009906594961788848, "grad_norm": 6.109627139149071, "learning_rate": 9.900990099009902e-09, "logits/chosen": -2.759272336959839, "logits/rejected": -3.007999897003174, "logps/chosen": -40.46739196777344, "logps/rejected": -46.29994201660156, "loss": 0.6923, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.001430650008842349, "rewards/margins": 0.001463620224967599, "rewards/rejected": -3.2970187021419406e-05, "step": 70 }, { "epoch": 0.011321822813472968, "grad_norm": 6.665730767388637, "learning_rate": 1.1315417256011314e-08, "logits/chosen": -2.7059426307678223, "logits/rejected": -2.9863762855529785, "logps/chosen": -41.36830520629883, "logps/rejected": -39.80991744995117, "loss": 0.6918, "rewards/accuracies": 0.8125, "rewards/chosen": 0.002238479908555746, "rewards/margins": 0.0024959708098322153, "rewards/rejected": -0.0002574908721726388, "step": 80 }, { "epoch": 0.01273705066515709, "grad_norm": 6.384118040185425, "learning_rate": 1.272984441301273e-08, "logits/chosen": -2.730215311050415, "logits/rejected": -3.0040841102600098, "logps/chosen": -38.47834014892578, "logps/rejected": -48.39954376220703, "loss": 0.6914, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.003451574593782425, "rewards/margins": 0.0036884560249745846, "rewards/rejected": -0.00023688173678237945, "step": 90 }, { "epoch": 0.014152278516841211, "grad_norm": 6.320827867495901, "learning_rate": 1.4144271570014143e-08, "logits/chosen": -2.7393157482147217, "logits/rejected": -3.016116142272949, "logps/chosen": -37.531150817871094, "logps/rejected": -42.4915885925293, "loss": 0.6906, "rewards/accuracies": 0.9375, "rewards/chosen": 0.004805425181984901, "rewards/margins": 0.005305609665811062, "rewards/rejected": -0.000500184774864465, "step": 100 }, { "epoch": 0.015567506368525332, "grad_norm": 6.05947911661128, "learning_rate": 1.5558698727015555e-08, "logits/chosen": -2.732581377029419, "logits/rejected": -3.011148691177368, "logps/chosen": -38.805015563964844, "logps/rejected": -42.00470733642578, "loss": 0.6889, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.00841190293431282, "rewards/margins": 0.008628403767943382, "rewards/rejected": -0.00021650231792591512, "step": 110 }, { "epoch": 0.016982734220209454, "grad_norm": 6.454521162674983, "learning_rate": 1.6973125884016973e-08, "logits/chosen": -2.7343180179595947, "logits/rejected": -3.0151216983795166, "logps/chosen": -37.41632843017578, "logps/rejected": -40.901947021484375, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.010304552502930164, "rewards/margins": 0.010519719682633877, "rewards/rejected": -0.00021516799461096525, "step": 120 }, { "epoch": 0.018397962071893575, "grad_norm": 6.179100666148633, "learning_rate": 1.8387553041018386e-08, "logits/chosen": -2.7040038108825684, "logits/rejected": -2.9892570972442627, "logps/chosen": -36.53224182128906, "logps/rejected": -40.13109588623047, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.012823812663555145, "rewards/margins": 0.013301663100719452, "rewards/rejected": -0.0004778511938638985, "step": 130 }, { "epoch": 0.019813189923577695, "grad_norm": 6.351858411198628, "learning_rate": 1.9801980198019804e-08, "logits/chosen": -2.697268009185791, "logits/rejected": -2.961493968963623, "logps/chosen": -38.1439323425293, "logps/rejected": -42.87862777709961, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.015412276610732079, "rewards/margins": 0.016217520460486412, "rewards/rejected": -0.0008052419871091843, "step": 140 }, { "epoch": 0.021228417775261816, "grad_norm": 6.26330706343348, "learning_rate": 2.1216407355021214e-08, "logits/chosen": -2.728989839553833, "logits/rejected": -3.008030414581299, "logps/chosen": -37.14191436767578, "logps/rejected": -38.59169006347656, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.023079346865415573, "rewards/margins": 0.024182084947824478, "rewards/rejected": -0.0011027371510863304, "step": 150 }, { "epoch": 0.022643645626945937, "grad_norm": 6.067928335125853, "learning_rate": 2.2630834512022628e-08, "logits/chosen": -2.7412805557250977, "logits/rejected": -3.011881113052368, "logps/chosen": -34.500938415527344, "logps/rejected": -42.74951171875, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.03375203534960747, "rewards/margins": 0.03487534075975418, "rewards/rejected": -0.0011233009863644838, "step": 160 }, { "epoch": 0.02405887347863006, "grad_norm": 6.427112818428187, "learning_rate": 2.4045261669024045e-08, "logits/chosen": -2.7284936904907227, "logits/rejected": -2.9784600734710693, "logps/chosen": -36.761756896972656, "logps/rejected": -44.20045852661133, "loss": 0.6729, "rewards/accuracies": 1.0, "rewards/chosen": 0.0393458791077137, "rewards/margins": 0.0410490445792675, "rewards/rejected": -0.001703165122307837, "step": 170 }, { "epoch": 0.02547410133031418, "grad_norm": 6.097219800797538, "learning_rate": 2.545968882602546e-08, "logits/chosen": -2.6924800872802734, "logits/rejected": -2.9570908546447754, "logps/chosen": -34.237648010253906, "logps/rejected": -42.3128547668457, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": 0.04682927578687668, "rewards/margins": 0.04834538698196411, "rewards/rejected": -0.0015161134069785476, "step": 180 }, { "epoch": 0.026889329181998302, "grad_norm": 6.101967448244761, "learning_rate": 2.6874115983026873e-08, "logits/chosen": -2.676492214202881, "logits/rejected": -2.960750102996826, "logps/chosen": -33.696495056152344, "logps/rejected": -43.3915901184082, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 0.053539298474788666, "rewards/margins": 0.05540009215474129, "rewards/rejected": -0.0018607930978760123, "step": 190 }, { "epoch": 0.028304557033682422, "grad_norm": 6.15394155459427, "learning_rate": 2.8288543140028287e-08, "logits/chosen": -2.709066867828369, "logits/rejected": -2.966566324234009, "logps/chosen": -34.72453308105469, "logps/rejected": -44.05718231201172, "loss": 0.6629, "rewards/accuracies": 1.0, "rewards/chosen": 0.06043035909533501, "rewards/margins": 0.06302284449338913, "rewards/rejected": -0.002592484699562192, "step": 200 }, { "epoch": 0.029719784885366543, "grad_norm": 6.261318236788307, "learning_rate": 2.97029702970297e-08, "logits/chosen": -2.696261405944824, "logits/rejected": -2.9539904594421387, "logps/chosen": -32.231658935546875, "logps/rejected": -41.9385986328125, "loss": 0.6589, "rewards/accuracies": 1.0, "rewards/chosen": 0.06935092061758041, "rewards/margins": 0.07107644528150558, "rewards/rejected": -0.0017255315324291587, "step": 210 }, { "epoch": 0.031135012737050664, "grad_norm": 5.208897499403675, "learning_rate": 3.111739745403111e-08, "logits/chosen": -2.7241578102111816, "logits/rejected": -2.989107131958008, "logps/chosen": -31.135875701904297, "logps/rejected": -43.41024398803711, "loss": 0.6523, "rewards/accuracies": 1.0, "rewards/chosen": 0.07748748362064362, "rewards/margins": 0.08015582710504532, "rewards/rejected": -0.0026683362666517496, "step": 220 }, { "epoch": 0.032550240588734784, "grad_norm": 5.11065424822978, "learning_rate": 3.2531824611032535e-08, "logits/chosen": -2.647825241088867, "logits/rejected": -2.9242138862609863, "logps/chosen": -29.809467315673828, "logps/rejected": -41.419288635253906, "loss": 0.6462, "rewards/accuracies": 1.0, "rewards/chosen": 0.09223005175590515, "rewards/margins": 0.09507901966571808, "rewards/rejected": -0.002848951844498515, "step": 230 }, { "epoch": 0.03396546844041891, "grad_norm": 5.528490761660505, "learning_rate": 3.3946251768033945e-08, "logits/chosen": -2.657604455947876, "logits/rejected": -2.9161317348480225, "logps/chosen": -31.255264282226562, "logps/rejected": -40.34108352661133, "loss": 0.6415, "rewards/accuracies": 1.0, "rewards/chosen": 0.1065116673707962, "rewards/margins": 0.10971303284168243, "rewards/rejected": -0.0032013666350394487, "step": 240 }, { "epoch": 0.035380696292103025, "grad_norm": 5.6726152479256875, "learning_rate": 3.5360678925035356e-08, "logits/chosen": -2.6595115661621094, "logits/rejected": -2.9118244647979736, "logps/chosen": -28.175851821899414, "logps/rejected": -41.64971923828125, "loss": 0.6381, "rewards/accuracies": 1.0, "rewards/chosen": 0.11027149856090546, "rewards/margins": 0.11470510810613632, "rewards/rejected": -0.0044336155988276005, "step": 250 }, { "epoch": 0.03679592414378715, "grad_norm": 5.463371866642163, "learning_rate": 3.677510608203677e-08, "logits/chosen": -2.649853229522705, "logits/rejected": -2.9558053016662598, "logps/chosen": -26.124088287353516, "logps/rejected": -44.12666702270508, "loss": 0.6371, "rewards/accuracies": 1.0, "rewards/chosen": 0.11282335221767426, "rewards/margins": 0.11924733221530914, "rewards/rejected": -0.006423976272344589, "step": 260 }, { "epoch": 0.038211151995471274, "grad_norm": 5.940761894185382, "learning_rate": 3.8189533239038183e-08, "logits/chosen": -2.6147332191467285, "logits/rejected": -2.865729808807373, "logps/chosen": -27.76105308532715, "logps/rejected": -42.670692443847656, "loss": 0.6337, "rewards/accuracies": 1.0, "rewards/chosen": 0.11700008064508438, "rewards/margins": 0.12451908737421036, "rewards/rejected": -0.007519023958593607, "step": 270 }, { "epoch": 0.03962637984715539, "grad_norm": 5.226400791476826, "learning_rate": 3.960396039603961e-08, "logits/chosen": -2.6281259059906006, "logits/rejected": -2.925448417663574, "logps/chosen": -25.95822525024414, "logps/rejected": -41.846988677978516, "loss": 0.6337, "rewards/accuracies": 1.0, "rewards/chosen": 0.11560360342264175, "rewards/margins": 0.12124279886484146, "rewards/rejected": -0.0056392052210867405, "step": 280 }, { "epoch": 0.041041607698839515, "grad_norm": 5.327549077306007, "learning_rate": 4.101838755304102e-08, "logits/chosen": -2.632441997528076, "logits/rejected": -2.9272563457489014, "logps/chosen": -28.67806053161621, "logps/rejected": -41.754051208496094, "loss": 0.6308, "rewards/accuracies": 1.0, "rewards/chosen": 0.11841438710689545, "rewards/margins": 0.12624621391296387, "rewards/rejected": -0.007831819355487823, "step": 290 }, { "epoch": 0.04245683555052363, "grad_norm": 5.80891496831471, "learning_rate": 4.243281471004243e-08, "logits/chosen": -2.617936372756958, "logits/rejected": -2.881490707397461, "logps/chosen": -29.454381942749023, "logps/rejected": -40.290184020996094, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": 0.11685176193714142, "rewards/margins": 0.12901923060417175, "rewards/rejected": -0.012167491018772125, "step": 300 }, { "epoch": 0.043872063402207756, "grad_norm": 4.899685645903873, "learning_rate": 4.3847241867043845e-08, "logits/chosen": -2.616997718811035, "logits/rejected": -2.8568644523620605, "logps/chosen": -27.512847900390625, "logps/rejected": -44.55917739868164, "loss": 0.6285, "rewards/accuracies": 1.0, "rewards/chosen": 0.11944033950567245, "rewards/margins": 0.1369679868221283, "rewards/rejected": -0.017527643591165543, "step": 310 }, { "epoch": 0.04528729125389187, "grad_norm": 5.199189593923327, "learning_rate": 4.5261669024045256e-08, "logits/chosen": -2.5941834449768066, "logits/rejected": -2.8662502765655518, "logps/chosen": -28.010086059570312, "logps/rejected": -46.55924606323242, "loss": 0.6273, "rewards/accuracies": 1.0, "rewards/chosen": 0.12037129700183868, "rewards/margins": 0.140326589345932, "rewards/rejected": -0.01995529606938362, "step": 320 }, { "epoch": 0.046702519105576, "grad_norm": 5.580806917254681, "learning_rate": 4.667609618104667e-08, "logits/chosen": -2.597313165664673, "logits/rejected": -2.878422260284424, "logps/chosen": -26.2916316986084, "logps/rejected": -45.26839065551758, "loss": 0.6246, "rewards/accuracies": 1.0, "rewards/chosen": 0.11780872195959091, "rewards/margins": 0.14109370112419128, "rewards/rejected": -0.023284967988729477, "step": 330 }, { "epoch": 0.04811774695726012, "grad_norm": 6.284710863865947, "learning_rate": 4.809052333804809e-08, "logits/chosen": -2.6029016971588135, "logits/rejected": -2.902040481567383, "logps/chosen": -25.65376853942871, "logps/rejected": -45.43090057373047, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": 0.12351229041814804, "rewards/margins": 0.14844800531864166, "rewards/rejected": -0.02493571862578392, "step": 340 }, { "epoch": 0.04953297480894424, "grad_norm": 6.298629159171872, "learning_rate": 4.95049504950495e-08, "logits/chosen": -2.5725536346435547, "logits/rejected": -2.875755786895752, "logps/chosen": -25.44078254699707, "logps/rejected": -48.62675094604492, "loss": 0.6217, "rewards/accuracies": 1.0, "rewards/chosen": 0.12020155042409897, "rewards/margins": 0.14913299679756165, "rewards/rejected": -0.028931433334946632, "step": 350 }, { "epoch": 0.05094820266062836, "grad_norm": 6.348254238374495, "learning_rate": 5.091937765205092e-08, "logits/chosen": -2.5894312858581543, "logits/rejected": -2.881958484649658, "logps/chosen": -25.1923828125, "logps/rejected": -47.229835510253906, "loss": 0.6185, "rewards/accuracies": 1.0, "rewards/chosen": 0.12127149105072021, "rewards/margins": 0.15694397687911987, "rewards/rejected": -0.03567248582839966, "step": 360 }, { "epoch": 0.05236343051231248, "grad_norm": 6.605096513222183, "learning_rate": 5.233380480905233e-08, "logits/chosen": -2.5461363792419434, "logits/rejected": -2.8587663173675537, "logps/chosen": -27.1826229095459, "logps/rejected": -49.894630432128906, "loss": 0.6133, "rewards/accuracies": 1.0, "rewards/chosen": 0.1226259097456932, "rewards/margins": 0.1674787700176239, "rewards/rejected": -0.044852860271930695, "step": 370 }, { "epoch": 0.053778658363996604, "grad_norm": 6.506512500467565, "learning_rate": 5.3748231966053746e-08, "logits/chosen": -2.52848744392395, "logits/rejected": -2.8328468799591064, "logps/chosen": -24.901660919189453, "logps/rejected": -47.62281036376953, "loss": 0.6105, "rewards/accuracies": 1.0, "rewards/chosen": 0.123713418841362, "rewards/margins": 0.178060844540596, "rewards/rejected": -0.05434741452336311, "step": 380 }, { "epoch": 0.05519388621568073, "grad_norm": 7.307429643962034, "learning_rate": 5.5162659123055156e-08, "logits/chosen": -2.5473408699035645, "logits/rejected": -2.8370187282562256, "logps/chosen": -26.027790069580078, "logps/rejected": -43.829345703125, "loss": 0.6057, "rewards/accuracies": 1.0, "rewards/chosen": 0.11834053695201874, "rewards/margins": 0.17244763672351837, "rewards/rejected": -0.05410709232091904, "step": 390 }, { "epoch": 0.056609114067364845, "grad_norm": 7.621313764067034, "learning_rate": 5.657708628005657e-08, "logits/chosen": -2.5826587677001953, "logits/rejected": -2.852790117263794, "logps/chosen": -26.695632934570312, "logps/rejected": -49.61513137817383, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": 0.12091455608606339, "rewards/margins": 0.19664224982261658, "rewards/rejected": -0.07572769373655319, "step": 400 }, { "epoch": 0.05802434191904897, "grad_norm": 7.708496552727653, "learning_rate": 5.799151343705799e-08, "logits/chosen": -2.541948080062866, "logits/rejected": -2.83048152923584, "logps/chosen": -25.128698348999023, "logps/rejected": -47.12349319458008, "loss": 0.5967, "rewards/accuracies": 1.0, "rewards/chosen": 0.11786462366580963, "rewards/margins": 0.20276598632335663, "rewards/rejected": -0.08490137755870819, "step": 410 }, { "epoch": 0.059439569770733086, "grad_norm": 8.735039545139069, "learning_rate": 5.94059405940594e-08, "logits/chosen": -2.5301899909973145, "logits/rejected": -2.811450242996216, "logps/chosen": -27.968679428100586, "logps/rejected": -52.88923263549805, "loss": 0.5886, "rewards/accuracies": 1.0, "rewards/chosen": 0.10939796268939972, "rewards/margins": 0.22251757979393005, "rewards/rejected": -0.11311958730220795, "step": 420 }, { "epoch": 0.06085479762241721, "grad_norm": 10.832341015067975, "learning_rate": 6.082036775106082e-08, "logits/chosen": -2.542470932006836, "logits/rejected": -2.8501365184783936, "logps/chosen": -27.89850425720215, "logps/rejected": -58.09006881713867, "loss": 0.5773, "rewards/accuracies": 1.0, "rewards/chosen": 0.10059578716754913, "rewards/margins": 0.265701562166214, "rewards/rejected": -0.16510574519634247, "step": 430 }, { "epoch": 0.06227002547410133, "grad_norm": 11.588317015381765, "learning_rate": 6.223479490806222e-08, "logits/chosen": -2.461747169494629, "logits/rejected": -2.7756810188293457, "logps/chosen": -34.05313491821289, "logps/rejected": -66.36157989501953, "loss": 0.562, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05843954160809517, "rewards/margins": 0.2933076322078705, "rewards/rejected": -0.2348680943250656, "step": 440 }, { "epoch": 0.06368525332578545, "grad_norm": 11.929364811491821, "learning_rate": 6.364922206506365e-08, "logits/chosen": -2.504366397857666, "logits/rejected": -2.751574754714966, "logps/chosen": -32.42599105834961, "logps/rejected": -65.10383605957031, "loss": 0.5583, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.06826834380626678, "rewards/margins": 0.3041297495365143, "rewards/rejected": -0.2358613759279251, "step": 450 }, { "epoch": 0.06510048117746957, "grad_norm": 14.536836283756397, "learning_rate": 6.506364922206507e-08, "logits/chosen": -2.4566092491149902, "logits/rejected": -2.7547335624694824, "logps/chosen": -35.952911376953125, "logps/rejected": -74.74407958984375, "loss": 0.5465, "rewards/accuracies": 1.0, "rewards/chosen": 0.0376083105802536, "rewards/margins": 0.3633834719657898, "rewards/rejected": -0.3257751762866974, "step": 460 }, { "epoch": 0.0665157090291537, "grad_norm": 14.551996809313161, "learning_rate": 6.647807637906647e-08, "logits/chosen": -2.4127447605133057, "logits/rejected": -2.706569194793701, "logps/chosen": -37.13035583496094, "logps/rejected": -87.08662414550781, "loss": 0.5236, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.009625871665775776, "rewards/margins": 0.449057012796402, "rewards/rejected": -0.4394311010837555, "step": 470 }, { "epoch": 0.06793093688083782, "grad_norm": 19.39077417075077, "learning_rate": 6.789250353606789e-08, "logits/chosen": -2.3591737747192383, "logits/rejected": -2.6554903984069824, "logps/chosen": -41.80305480957031, "logps/rejected": -87.75968933105469, "loss": 0.5078, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.036822058260440826, "rewards/margins": 0.4254051744937897, "rewards/rejected": -0.4622272849082947, "step": 480 }, { "epoch": 0.06934616473252193, "grad_norm": 17.00918374503046, "learning_rate": 6.93069306930693e-08, "logits/chosen": -2.336452007293701, "logits/rejected": -2.5836803913116455, "logps/chosen": -46.162532806396484, "logps/rejected": -109.7291488647461, "loss": 0.5025, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.07311579585075378, "rewards/margins": 0.567710816860199, "rewards/rejected": -0.6408265829086304, "step": 490 }, { "epoch": 0.07076139258420605, "grad_norm": 18.33146368393545, "learning_rate": 7.072135785007071e-08, "logits/chosen": -2.296773910522461, "logits/rejected": -2.5716261863708496, "logps/chosen": -46.51655578613281, "logps/rejected": -99.77568054199219, "loss": 0.4951, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0907076969742775, "rewards/margins": 0.4834250807762146, "rewards/rejected": -0.5741328001022339, "step": 500 }, { "epoch": 0.07217662043589018, "grad_norm": 20.32026758805958, "learning_rate": 7.213578500707214e-08, "logits/chosen": -2.208195447921753, "logits/rejected": -2.5302977561950684, "logps/chosen": -50.92499542236328, "logps/rejected": -102.03929901123047, "loss": 0.4716, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.12573601305484772, "rewards/margins": 0.509817898273468, "rewards/rejected": -0.6355539560317993, "step": 510 }, { "epoch": 0.0735918482875743, "grad_norm": 20.26554163637613, "learning_rate": 7.355021216407355e-08, "logits/chosen": -2.2166197299957275, "logits/rejected": -2.490905284881592, "logps/chosen": -53.6128044128418, "logps/rejected": -118.39106750488281, "loss": 0.461, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.14843767881393433, "rewards/margins": 0.6152985095977783, "rewards/rejected": -0.7637362480163574, "step": 520 }, { "epoch": 0.07500707613925842, "grad_norm": 19.81800530410728, "learning_rate": 7.496463932107496e-08, "logits/chosen": -2.1861002445220947, "logits/rejected": -2.41257905960083, "logps/chosen": -56.83454513549805, "logps/rejected": -119.5007095336914, "loss": 0.4586, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.18680690228939056, "rewards/margins": 0.6132361888885498, "rewards/rejected": -0.8000432252883911, "step": 530 }, { "epoch": 0.07642230399094255, "grad_norm": 23.62634323905375, "learning_rate": 7.637906647807637e-08, "logits/chosen": -2.1579947471618652, "logits/rejected": -2.3853774070739746, "logps/chosen": -58.89481735229492, "logps/rejected": -128.14590454101562, "loss": 0.4469, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.20642514526844025, "rewards/margins": 0.6407147645950317, "rewards/rejected": -0.8471399545669556, "step": 540 }, { "epoch": 0.07783753184262666, "grad_norm": 20.929577975392473, "learning_rate": 7.779349363507779e-08, "logits/chosen": -2.1294140815734863, "logits/rejected": -2.389244318008423, "logps/chosen": -61.2080078125, "logps/rejected": -141.0167999267578, "loss": 0.4105, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.21886444091796875, "rewards/margins": 0.7542992830276489, "rewards/rejected": -0.9731637239456177, "step": 550 }, { "epoch": 0.07925275969431078, "grad_norm": 28.918881269661558, "learning_rate": 7.920792079207921e-08, "logits/chosen": -2.0543107986450195, "logits/rejected": -2.2905170917510986, "logps/chosen": -65.36962127685547, "logps/rejected": -148.9934844970703, "loss": 0.421, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.2730531096458435, "rewards/margins": 0.7848917841911316, "rewards/rejected": -1.057944893836975, "step": 560 }, { "epoch": 0.0806679875459949, "grad_norm": 23.059795812800427, "learning_rate": 8.062234794908061e-08, "logits/chosen": -2.0172817707061768, "logits/rejected": -2.2755608558654785, "logps/chosen": -62.034141540527344, "logps/rejected": -140.0832061767578, "loss": 0.4158, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.24802763760089874, "rewards/margins": 0.7464353442192078, "rewards/rejected": -0.9944629669189453, "step": 570 }, { "epoch": 0.08208321539767903, "grad_norm": 23.893123100956505, "learning_rate": 8.203677510608204e-08, "logits/chosen": -1.9565967321395874, "logits/rejected": -2.2042925357818604, "logps/chosen": -69.16587829589844, "logps/rejected": -162.86514282226562, "loss": 0.413, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3099169433116913, "rewards/margins": 0.8770081400871277, "rewards/rejected": -1.186924934387207, "step": 580 }, { "epoch": 0.08349844324936315, "grad_norm": 31.353350843448574, "learning_rate": 8.345120226308345e-08, "logits/chosen": -1.9329252243041992, "logits/rejected": -2.1768345832824707, "logps/chosen": -74.49883270263672, "logps/rejected": -142.05384826660156, "loss": 0.4161, "rewards/accuracies": 0.875, "rewards/chosen": -0.3521520495414734, "rewards/margins": 0.6750375032424927, "rewards/rejected": -1.0271894931793213, "step": 590 }, { "epoch": 0.08491367110104726, "grad_norm": 22.82703524753858, "learning_rate": 8.486562942008486e-08, "logits/chosen": -1.9055535793304443, "logits/rejected": -2.156700849533081, "logps/chosen": -72.07949829101562, "logps/rejected": -174.7797088623047, "loss": 0.3924, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3332705795764923, "rewards/margins": 0.970126748085022, "rewards/rejected": -1.303397297859192, "step": 600 }, { "epoch": 0.0863288989527314, "grad_norm": 26.98518861818738, "learning_rate": 8.628005657708628e-08, "logits/chosen": -1.9322850704193115, "logits/rejected": -2.1279869079589844, "logps/chosen": -76.9574966430664, "logps/rejected": -178.3206329345703, "loss": 0.4075, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.39242544770240784, "rewards/margins": 0.9578051567077637, "rewards/rejected": -1.3502305746078491, "step": 610 }, { "epoch": 0.08774412680441551, "grad_norm": 24.24709497205642, "learning_rate": 8.769448373408769e-08, "logits/chosen": -1.9125525951385498, "logits/rejected": -2.125107526779175, "logps/chosen": -91.86576843261719, "logps/rejected": -187.01052856445312, "loss": 0.3857, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5079001188278198, "rewards/margins": 0.9002768397331238, "rewards/rejected": -1.408177137374878, "step": 620 }, { "epoch": 0.08915935465609963, "grad_norm": 33.07632293513378, "learning_rate": 8.91089108910891e-08, "logits/chosen": -1.850388526916504, "logits/rejected": -2.056058406829834, "logps/chosen": -93.37017822265625, "logps/rejected": -189.3873748779297, "loss": 0.3758, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.514242947101593, "rewards/margins": 0.9366266131401062, "rewards/rejected": -1.4508694410324097, "step": 630 }, { "epoch": 0.09057458250778375, "grad_norm": 34.058014746455086, "learning_rate": 9.052333804809051e-08, "logits/chosen": -1.7643773555755615, "logits/rejected": -1.9705626964569092, "logps/chosen": -77.84427642822266, "logps/rejected": -170.46182250976562, "loss": 0.3824, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.4065326154232025, "rewards/margins": 0.9109088778495789, "rewards/rejected": -1.317441463470459, "step": 640 }, { "epoch": 0.09198981035946788, "grad_norm": 23.932292436028867, "learning_rate": 9.193776520509194e-08, "logits/chosen": -1.7377374172210693, "logits/rejected": -1.9665203094482422, "logps/chosen": -81.3836898803711, "logps/rejected": -185.78172302246094, "loss": 0.3824, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.431577205657959, "rewards/margins": 0.9908435940742493, "rewards/rejected": -1.422420620918274, "step": 650 }, { "epoch": 0.093405038211152, "grad_norm": 24.728586602390635, "learning_rate": 9.335219236209335e-08, "logits/chosen": -1.7382436990737915, "logits/rejected": -1.9586458206176758, "logps/chosen": -96.07732391357422, "logps/rejected": -198.5052947998047, "loss": 0.375, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5662888288497925, "rewards/margins": 1.0112369060516357, "rewards/rejected": -1.5775258541107178, "step": 660 }, { "epoch": 0.09482026606283611, "grad_norm": 31.577324987159066, "learning_rate": 9.476661951909476e-08, "logits/chosen": -1.7313789129257202, "logits/rejected": -1.981956124305725, "logps/chosen": -95.2544937133789, "logps/rejected": -220.0247802734375, "loss": 0.3621, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5649188160896301, "rewards/margins": 1.1813298463821411, "rewards/rejected": -1.7462486028671265, "step": 670 }, { "epoch": 0.09623549391452024, "grad_norm": 34.79275693278652, "learning_rate": 9.618104667609618e-08, "logits/chosen": -1.7121118307113647, "logits/rejected": -1.8936107158660889, "logps/chosen": -108.15220642089844, "logps/rejected": -226.04769897460938, "loss": 0.3283, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6753430962562561, "rewards/margins": 1.1736912727355957, "rewards/rejected": -1.8490345478057861, "step": 680 }, { "epoch": 0.09765072176620436, "grad_norm": 31.62411091810251, "learning_rate": 9.759547383309759e-08, "logits/chosen": -1.582474946975708, "logits/rejected": -1.8158750534057617, "logps/chosen": -104.5450439453125, "logps/rejected": -267.02398681640625, "loss": 0.3164, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6479911804199219, "rewards/margins": 1.5516403913497925, "rewards/rejected": -2.199631452560425, "step": 690 }, { "epoch": 0.09906594961788848, "grad_norm": 57.10881663646826, "learning_rate": 9.9009900990099e-08, "logits/chosen": -1.5919325351715088, "logits/rejected": -1.7519042491912842, "logps/chosen": -118.52108001708984, "logps/rejected": -225.95083618164062, "loss": 0.3363, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7822328805923462, "rewards/margins": 1.0894126892089844, "rewards/rejected": -1.8716453313827515, "step": 700 }, { "epoch": 0.10048117746957261, "grad_norm": 51.524573387974115, "learning_rate": 9.995282277087592e-08, "logits/chosen": -1.5440150499343872, "logits/rejected": -1.7541792392730713, "logps/chosen": -111.5463638305664, "logps/rejected": -273.5348205566406, "loss": 0.3322, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7268660664558411, "rewards/margins": 1.5481882095336914, "rewards/rejected": -2.2750542163848877, "step": 710 }, { "epoch": 0.10189640532125673, "grad_norm": 37.667362241714905, "learning_rate": 9.979556534046232e-08, "logits/chosen": -1.5246914625167847, "logits/rejected": -1.733755111694336, "logps/chosen": -112.36112976074219, "logps/rejected": -250.4267120361328, "loss": 0.3394, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.7294378876686096, "rewards/margins": 1.347788691520691, "rewards/rejected": -2.077226400375366, "step": 720 }, { "epoch": 0.10331163317294084, "grad_norm": 35.930319789670904, "learning_rate": 9.963830791004874e-08, "logits/chosen": -1.509680151939392, "logits/rejected": -1.7130193710327148, "logps/chosen": -116.8870620727539, "logps/rejected": -276.93609619140625, "loss": 0.3296, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.7822552919387817, "rewards/margins": 1.5479778051376343, "rewards/rejected": -2.330233097076416, "step": 730 }, { "epoch": 0.10472686102462496, "grad_norm": 43.020946331312025, "learning_rate": 9.948105047963516e-08, "logits/chosen": -1.5065861940383911, "logits/rejected": -1.7142763137817383, "logps/chosen": -125.27330017089844, "logps/rejected": -243.27023315429688, "loss": 0.3263, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.851860523223877, "rewards/margins": 1.1861908435821533, "rewards/rejected": -2.0380513668060303, "step": 740 }, { "epoch": 0.10614208887630909, "grad_norm": 37.06363617900084, "learning_rate": 9.932379304922157e-08, "logits/chosen": -1.5076411962509155, "logits/rejected": -1.7189722061157227, "logps/chosen": -105.21784973144531, "logps/rejected": -262.52386474609375, "loss": 0.3241, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6667578220367432, "rewards/margins": 1.54726243019104, "rewards/rejected": -2.214020252227783, "step": 750 }, { "epoch": 0.10755731672799321, "grad_norm": 36.988704501496386, "learning_rate": 9.916653561880799e-08, "logits/chosen": -1.503108263015747, "logits/rejected": -1.675087332725525, "logps/chosen": -111.3561782836914, "logps/rejected": -284.6456298828125, "loss": 0.2976, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7212788462638855, "rewards/margins": 1.656511664390564, "rewards/rejected": -2.3777904510498047, "step": 760 }, { "epoch": 0.10897254457967732, "grad_norm": 41.496083563517445, "learning_rate": 9.90092781883944e-08, "logits/chosen": -1.4830451011657715, "logits/rejected": -1.6992874145507812, "logps/chosen": -108.13719177246094, "logps/rejected": -269.7738037109375, "loss": 0.3036, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.6950986385345459, "rewards/margins": 1.5848690271377563, "rewards/rejected": -2.279967784881592, "step": 770 }, { "epoch": 0.11038777243136146, "grad_norm": 55.473983042205646, "learning_rate": 9.885202075798081e-08, "logits/chosen": -1.423952579498291, "logits/rejected": -1.6419944763183594, "logps/chosen": -136.60008239746094, "logps/rejected": -297.6886291503906, "loss": 0.3139, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.9484254121780396, "rewards/margins": 1.5715782642364502, "rewards/rejected": -2.520003318786621, "step": 780 }, { "epoch": 0.11180300028304557, "grad_norm": 43.899365271579185, "learning_rate": 9.869476332756723e-08, "logits/chosen": -1.4130809307098389, "logits/rejected": -1.6133434772491455, "logps/chosen": -125.2385025024414, "logps/rejected": -273.32269287109375, "loss": 0.3038, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.8610062599182129, "rewards/margins": 1.4537107944488525, "rewards/rejected": -2.3147170543670654, "step": 790 }, { "epoch": 0.11321822813472969, "grad_norm": 43.52194331471362, "learning_rate": 9.853750589715364e-08, "logits/chosen": -1.4019429683685303, "logits/rejected": -1.6567281484603882, "logps/chosen": -133.5843963623047, "logps/rejected": -260.038818359375, "loss": 0.3122, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9417110681533813, "rewards/margins": 1.2817494869232178, "rewards/rejected": -2.2234606742858887, "step": 800 }, { "epoch": 0.11463345598641381, "grad_norm": 32.3302708346943, "learning_rate": 9.838024846674005e-08, "logits/chosen": -1.3876116275787354, "logits/rejected": -1.5982298851013184, "logps/chosen": -111.20094299316406, "logps/rejected": -269.4052429199219, "loss": 0.2731, "rewards/accuracies": 0.9375, "rewards/chosen": -0.731584906578064, "rewards/margins": 1.5761277675628662, "rewards/rejected": -2.3077127933502197, "step": 810 }, { "epoch": 0.11604868383809794, "grad_norm": 43.34391051724229, "learning_rate": 9.822299103632646e-08, "logits/chosen": -1.379043459892273, "logits/rejected": -1.6137844324111938, "logps/chosen": -124.03498840332031, "logps/rejected": -302.324951171875, "loss": 0.2848, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8370897173881531, "rewards/margins": 1.7524621486663818, "rewards/rejected": -2.5895519256591797, "step": 820 }, { "epoch": 0.11746391168978206, "grad_norm": 40.52553793586795, "learning_rate": 9.806573360591287e-08, "logits/chosen": -1.3064250946044922, "logits/rejected": -1.5332471132278442, "logps/chosen": -126.62493896484375, "logps/rejected": -298.8015441894531, "loss": 0.2594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8739917874336243, "rewards/margins": 1.6985994577407837, "rewards/rejected": -2.5725913047790527, "step": 830 }, { "epoch": 0.11887913954146617, "grad_norm": 30.768330775047023, "learning_rate": 9.790847617549928e-08, "logits/chosen": -1.3012118339538574, "logits/rejected": -1.499026894569397, "logps/chosen": -118.91743469238281, "logps/rejected": -315.68463134765625, "loss": 0.2574, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8141660690307617, "rewards/margins": 1.9301646947860718, "rewards/rejected": -2.744330883026123, "step": 840 }, { "epoch": 0.1202943673931503, "grad_norm": 49.40931049765344, "learning_rate": 9.775121874508571e-08, "logits/chosen": -1.2338948249816895, "logits/rejected": -1.435060739517212, "logps/chosen": -138.16647338867188, "logps/rejected": -326.51019287109375, "loss": 0.2545, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0049320459365845, "rewards/margins": 1.8310034275054932, "rewards/rejected": -2.835935115814209, "step": 850 }, { "epoch": 0.12170959524483442, "grad_norm": 51.64693268922345, "learning_rate": 9.759396131467212e-08, "logits/chosen": -1.1943867206573486, "logits/rejected": -1.394303560256958, "logps/chosen": -127.90135192871094, "logps/rejected": -321.0440979003906, "loss": 0.2813, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9007476568222046, "rewards/margins": 1.9021390676498413, "rewards/rejected": -2.802886724472046, "step": 860 }, { "epoch": 0.12312482309651854, "grad_norm": 59.72591862002992, "learning_rate": 9.743670388425853e-08, "logits/chosen": -1.2120294570922852, "logits/rejected": -1.4069468975067139, "logps/chosen": -121.71861267089844, "logps/rejected": -332.93804931640625, "loss": 0.2527, "rewards/accuracies": 0.9375, "rewards/chosen": -0.840746283531189, "rewards/margins": 2.087014675140381, "rewards/rejected": -2.9277613162994385, "step": 870 }, { "epoch": 0.12454005094820265, "grad_norm": 48.092154777790135, "learning_rate": 9.727944645384494e-08, "logits/chosen": -1.1260106563568115, "logits/rejected": -1.3769376277923584, "logps/chosen": -131.41822814941406, "logps/rejected": -330.30621337890625, "loss": 0.2785, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9293329119682312, "rewards/margins": 1.9485353231430054, "rewards/rejected": -2.877868175506592, "step": 880 }, { "epoch": 0.12595527879988677, "grad_norm": 57.860275055783745, "learning_rate": 9.712218902343135e-08, "logits/chosen": -1.1933033466339111, "logits/rejected": -1.3541946411132812, "logps/chosen": -135.36154174804688, "logps/rejected": -362.2421875, "loss": 0.228, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.963970959186554, "rewards/margins": 2.2067832946777344, "rewards/rejected": -3.1707541942596436, "step": 890 }, { "epoch": 0.1273705066515709, "grad_norm": 62.797643520305144, "learning_rate": 9.696493159301777e-08, "logits/chosen": -1.0914766788482666, "logits/rejected": -1.2933642864227295, "logps/chosen": -156.9756317138672, "logps/rejected": -337.4516906738281, "loss": 0.2645, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.1694700717926025, "rewards/margins": 1.7843453884124756, "rewards/rejected": -2.953815460205078, "step": 900 }, { "epoch": 0.12878573450325503, "grad_norm": 91.10851895918252, "learning_rate": 9.680767416260418e-08, "logits/chosen": -1.0673291683197021, "logits/rejected": -1.2409955263137817, "logps/chosen": -145.54669189453125, "logps/rejected": -405.27923583984375, "loss": 0.221, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0767595767974854, "rewards/margins": 2.54182505607605, "rewards/rejected": -3.6185848712921143, "step": 910 }, { "epoch": 0.13020096235493914, "grad_norm": 69.3940964366668, "learning_rate": 9.665041673219059e-08, "logits/chosen": -1.1059695482254028, "logits/rejected": -1.2793653011322021, "logps/chosen": -143.28659057617188, "logps/rejected": -449.565185546875, "loss": 0.2548, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0457175970077515, "rewards/margins": 2.9862141609191895, "rewards/rejected": -4.031931400299072, "step": 920 }, { "epoch": 0.13161619020662327, "grad_norm": 81.0773707597494, "learning_rate": 9.6493159301777e-08, "logits/chosen": -1.118024468421936, "logits/rejected": -1.28127121925354, "logps/chosen": -155.6415557861328, "logps/rejected": -381.8045959472656, "loss": 0.2392, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1557610034942627, "rewards/margins": 2.2475712299346924, "rewards/rejected": -3.403332233428955, "step": 930 }, { "epoch": 0.1330314180583074, "grad_norm": 46.54393320084798, "learning_rate": 9.633590187136342e-08, "logits/chosen": -1.1994906663894653, "logits/rejected": -1.377077579498291, "logps/chosen": -171.7549591064453, "logps/rejected": -396.52288818359375, "loss": 0.244, "rewards/accuracies": 0.9375, "rewards/chosen": -1.303326964378357, "rewards/margins": 2.242004871368408, "rewards/rejected": -3.5453319549560547, "step": 940 }, { "epoch": 0.1344466459099915, "grad_norm": 71.72853921400007, "learning_rate": 9.617864444094982e-08, "logits/chosen": -1.1210739612579346, "logits/rejected": -1.316470980644226, "logps/chosen": -168.23574829101562, "logps/rejected": -396.61273193359375, "loss": 0.2363, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2759754657745361, "rewards/margins": 2.2978391647338867, "rewards/rejected": -3.573814868927002, "step": 950 }, { "epoch": 0.13586187376167563, "grad_norm": 69.31575796875791, "learning_rate": 9.602138701053625e-08, "logits/chosen": -1.1204301118850708, "logits/rejected": -1.346192717552185, "logps/chosen": -167.46847534179688, "logps/rejected": -423.07525634765625, "loss": 0.2218, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2844123840332031, "rewards/margins": 2.5160374641418457, "rewards/rejected": -3.800450086593628, "step": 960 }, { "epoch": 0.13727710161335976, "grad_norm": 34.31887636045361, "learning_rate": 9.586412958012266e-08, "logits/chosen": -1.1112850904464722, "logits/rejected": -1.3458985090255737, "logps/chosen": -164.8035888671875, "logps/rejected": -403.59930419921875, "loss": 0.2131, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.2485620975494385, "rewards/margins": 2.372694492340088, "rewards/rejected": -3.6212565898895264, "step": 970 }, { "epoch": 0.13869232946504387, "grad_norm": 44.77485721842629, "learning_rate": 9.570687214970907e-08, "logits/chosen": -1.0575435161590576, "logits/rejected": -1.2586482763290405, "logps/chosen": -166.92605590820312, "logps/rejected": -444.9717712402344, "loss": 0.223, "rewards/accuracies": 0.9375, "rewards/chosen": -1.282355785369873, "rewards/margins": 2.7698092460632324, "rewards/rejected": -4.0521650314331055, "step": 980 }, { "epoch": 0.140107557316728, "grad_norm": 98.49191899709453, "learning_rate": 9.554961471929548e-08, "logits/chosen": -1.066416621208191, "logits/rejected": -1.3061909675598145, "logps/chosen": -176.7513885498047, "logps/rejected": -457.9227600097656, "loss": 0.2233, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3662058115005493, "rewards/margins": 2.815774440765381, "rewards/rejected": -4.181980609893799, "step": 990 }, { "epoch": 0.1415227851684121, "grad_norm": 41.03376894733079, "learning_rate": 9.539235728888189e-08, "logits/chosen": -1.0047295093536377, "logits/rejected": -1.1804733276367188, "logps/chosen": -164.66671752929688, "logps/rejected": -391.6124572753906, "loss": 0.2458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.257197618484497, "rewards/margins": 2.2748355865478516, "rewards/rejected": -3.5320332050323486, "step": 1000 }, { "epoch": 0.14293801302009623, "grad_norm": 48.64981658448413, "learning_rate": 9.523509985846831e-08, "logits/chosen": -1.0136408805847168, "logits/rejected": -1.2105745077133179, "logps/chosen": -169.30267333984375, "logps/rejected": -415.40692138671875, "loss": 0.2259, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3027150630950928, "rewards/margins": 2.426024913787842, "rewards/rejected": -3.7287399768829346, "step": 1010 }, { "epoch": 0.14435324087178036, "grad_norm": 71.99375869955506, "learning_rate": 9.507784242805473e-08, "logits/chosen": -0.9875739216804504, "logits/rejected": -1.1862735748291016, "logps/chosen": -164.8997039794922, "logps/rejected": -428.5901794433594, "loss": 0.2017, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2870218753814697, "rewards/margins": 2.596653461456299, "rewards/rejected": -3.8836753368377686, "step": 1020 }, { "epoch": 0.14576846872346447, "grad_norm": 40.09704061902224, "learning_rate": 9.492058499764113e-08, "logits/chosen": -0.9395803213119507, "logits/rejected": -1.0865833759307861, "logps/chosen": -176.72305297851562, "logps/rejected": -483.11614990234375, "loss": 0.2205, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.380408763885498, "rewards/margins": 3.0454001426696777, "rewards/rejected": -4.425808906555176, "step": 1030 }, { "epoch": 0.1471836965751486, "grad_norm": 64.85709722575909, "learning_rate": 9.476332756722755e-08, "logits/chosen": -0.9294945597648621, "logits/rejected": -1.1701762676239014, "logps/chosen": -172.75656127929688, "logps/rejected": -492.38720703125, "loss": 0.1895, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3391387462615967, "rewards/margins": 3.137132167816162, "rewards/rejected": -4.4762701988220215, "step": 1040 }, { "epoch": 0.14859892442683273, "grad_norm": 59.821226877014645, "learning_rate": 9.460607013681396e-08, "logits/chosen": -0.9819586873054504, "logits/rejected": -1.1763591766357422, "logps/chosen": -184.96327209472656, "logps/rejected": -492.9900817871094, "loss": 0.2042, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4658451080322266, "rewards/margins": 3.037954807281494, "rewards/rejected": -4.5037994384765625, "step": 1050 }, { "epoch": 0.15001415227851683, "grad_norm": 44.6619172703806, "learning_rate": 9.444881270640037e-08, "logits/chosen": -0.8724802136421204, "logits/rejected": -1.075822114944458, "logps/chosen": -206.0810089111328, "logps/rejected": -508.6405334472656, "loss": 0.224, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.6609090566635132, "rewards/margins": 2.9831840991973877, "rewards/rejected": -4.6440935134887695, "step": 1060 }, { "epoch": 0.15142938013020096, "grad_norm": 71.29521770276854, "learning_rate": 9.42915552759868e-08, "logits/chosen": -0.8930387496948242, "logits/rejected": -1.0218263864517212, "logps/chosen": -189.92884826660156, "logps/rejected": -454.4833984375, "loss": 0.2481, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.51095449924469, "rewards/margins": 2.6383283138275146, "rewards/rejected": -4.149282932281494, "step": 1070 }, { "epoch": 0.1528446079818851, "grad_norm": 49.69299773798377, "learning_rate": 9.41342978455732e-08, "logits/chosen": -0.9462124705314636, "logits/rejected": -1.128543734550476, "logps/chosen": -197.19908142089844, "logps/rejected": -508.23760986328125, "loss": 0.1944, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.575115442276001, "rewards/margins": 3.061042547225952, "rewards/rejected": -4.636157989501953, "step": 1080 }, { "epoch": 0.1542598358335692, "grad_norm": 51.55692542106136, "learning_rate": 9.397704041515961e-08, "logits/chosen": -0.9031047821044922, "logits/rejected": -1.1464694738388062, "logps/chosen": -201.2833251953125, "logps/rejected": -546.4022216796875, "loss": 0.1944, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.629265546798706, "rewards/margins": 3.449129581451416, "rewards/rejected": -5.078395366668701, "step": 1090 }, { "epoch": 0.15567506368525333, "grad_norm": 57.47601029489839, "learning_rate": 9.381978298474602e-08, "logits/chosen": -0.9093024134635925, "logits/rejected": -1.135372519493103, "logps/chosen": -241.76123046875, "logps/rejected": -573.9232177734375, "loss": 0.2092, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9982526302337646, "rewards/margins": 3.304992198944092, "rewards/rejected": -5.303244590759277, "step": 1100 }, { "epoch": 0.15709029153693746, "grad_norm": 93.16976801970328, "learning_rate": 9.366252555433243e-08, "logits/chosen": -0.9472804069519043, "logits/rejected": -1.1771047115325928, "logps/chosen": -197.79525756835938, "logps/rejected": -533.1463623046875, "loss": 0.1602, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5856291055679321, "rewards/margins": 3.331538438796997, "rewards/rejected": -4.917167663574219, "step": 1110 }, { "epoch": 0.15850551938862156, "grad_norm": 39.0094705916096, "learning_rate": 9.350526812391885e-08, "logits/chosen": -0.9758346676826477, "logits/rejected": -1.2019245624542236, "logps/chosen": -200.5435791015625, "logps/rejected": -563.6744995117188, "loss": 0.1863, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6058034896850586, "rewards/margins": 3.6129508018493652, "rewards/rejected": -5.218754291534424, "step": 1120 }, { "epoch": 0.1599207472403057, "grad_norm": 29.379290869077206, "learning_rate": 9.334801069350527e-08, "logits/chosen": -0.9032111167907715, "logits/rejected": -1.1321079730987549, "logps/chosen": -241.40969848632812, "logps/rejected": -565.4302368164062, "loss": 0.1856, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.0146124362945557, "rewards/margins": 3.25272798538208, "rewards/rejected": -5.267340660095215, "step": 1130 }, { "epoch": 0.1613359750919898, "grad_norm": 49.58085946706586, "learning_rate": 9.319075326309168e-08, "logits/chosen": -0.8407685160636902, "logits/rejected": -1.0584402084350586, "logps/chosen": -185.33547973632812, "logps/rejected": -544.2427978515625, "loss": 0.1855, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4806464910507202, "rewards/margins": 3.538909912109375, "rewards/rejected": -5.019555568695068, "step": 1140 }, { "epoch": 0.16275120294367393, "grad_norm": 85.40713821261605, "learning_rate": 9.303349583267809e-08, "logits/chosen": -0.8300951719284058, "logits/rejected": -1.06429922580719, "logps/chosen": -189.73721313476562, "logps/rejected": -532.9308471679688, "loss": 0.2035, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5178606510162354, "rewards/margins": 3.4086976051330566, "rewards/rejected": -4.926557540893555, "step": 1150 }, { "epoch": 0.16416643079535806, "grad_norm": 111.06423021308854, "learning_rate": 9.28762384022645e-08, "logits/chosen": -0.9224801063537598, "logits/rejected": -1.0644270181655884, "logps/chosen": -214.6028594970703, "logps/rejected": -586.1067504882812, "loss": 0.1619, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7378756999969482, "rewards/margins": 3.697253465652466, "rewards/rejected": -5.435128211975098, "step": 1160 }, { "epoch": 0.16558165864704216, "grad_norm": 50.55946724261199, "learning_rate": 9.271898097185091e-08, "logits/chosen": -0.8685497045516968, "logits/rejected": -1.0648750066757202, "logps/chosen": -185.4715576171875, "logps/rejected": -527.1909790039062, "loss": 0.138, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.483027696609497, "rewards/margins": 3.408876419067383, "rewards/rejected": -4.891903877258301, "step": 1170 }, { "epoch": 0.1669968864987263, "grad_norm": 35.72163701254993, "learning_rate": 9.256172354143734e-08, "logits/chosen": -0.8706784248352051, "logits/rejected": -1.1225640773773193, "logps/chosen": -214.45669555664062, "logps/rejected": -648.7603759765625, "loss": 0.1828, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7496837377548218, "rewards/margins": 4.3353376388549805, "rewards/rejected": -6.085021018981934, "step": 1180 }, { "epoch": 0.16841211435041042, "grad_norm": 66.46270432938285, "learning_rate": 9.240446611102374e-08, "logits/chosen": -0.8314210772514343, "logits/rejected": -1.0304864645004272, "logps/chosen": -218.22445678710938, "logps/rejected": -597.6668701171875, "loss": 0.1678, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8015992641448975, "rewards/margins": 3.7800726890563965, "rewards/rejected": -5.581672668457031, "step": 1190 }, { "epoch": 0.16982734220209453, "grad_norm": 50.268909021549526, "learning_rate": 9.224720868061016e-08, "logits/chosen": -0.9011184573173523, "logits/rejected": -1.1245768070220947, "logps/chosen": -237.99020385742188, "logps/rejected": -616.010498046875, "loss": 0.1864, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.9773225784301758, "rewards/margins": 3.7441768646240234, "rewards/rejected": -5.721499919891357, "step": 1200 }, { "epoch": 0.17124257005377866, "grad_norm": 95.99009977634113, "learning_rate": 9.208995125019656e-08, "logits/chosen": -0.9726642370223999, "logits/rejected": -1.1422998905181885, "logps/chosen": -216.80416870117188, "logps/rejected": -614.715576171875, "loss": 0.1448, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.768630027770996, "rewards/margins": 3.9792449474334717, "rewards/rejected": -5.747874736785889, "step": 1210 }, { "epoch": 0.1726577979054628, "grad_norm": 54.984579049174954, "learning_rate": 9.193269381978298e-08, "logits/chosen": -0.9490026235580444, "logits/rejected": -1.155291199684143, "logps/chosen": -239.46066284179688, "logps/rejected": -620.3589477539062, "loss": 0.1931, "rewards/accuracies": 0.9375, "rewards/chosen": -1.991468071937561, "rewards/margins": 3.7834575176239014, "rewards/rejected": -5.77492618560791, "step": 1220 }, { "epoch": 0.1740730257571469, "grad_norm": 62.72947935856315, "learning_rate": 9.177543638936939e-08, "logits/chosen": -1.0564813613891602, "logits/rejected": -1.2086031436920166, "logps/chosen": -224.183837890625, "logps/rejected": -640.0244750976562, "loss": 0.1654, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.8366531133651733, "rewards/margins": 4.138192176818848, "rewards/rejected": -5.974844932556152, "step": 1230 }, { "epoch": 0.17548825360883102, "grad_norm": 72.82553082881417, "learning_rate": 9.161817895895581e-08, "logits/chosen": -0.9752615094184875, "logits/rejected": -1.1782726049423218, "logps/chosen": -217.14077758789062, "logps/rejected": -710.7528076171875, "loss": 0.1388, "rewards/accuracies": 1.0, "rewards/chosen": -1.7814964056015015, "rewards/margins": 4.8858113288879395, "rewards/rejected": -6.6673078536987305, "step": 1240 }, { "epoch": 0.17690348146051516, "grad_norm": 64.7523211153716, "learning_rate": 9.146092152854223e-08, "logits/chosen": -0.9144706726074219, "logits/rejected": -1.1655237674713135, "logps/chosen": -230.52816772460938, "logps/rejected": -669.08642578125, "loss": 0.1473, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9162490367889404, "rewards/margins": 4.354998588562012, "rewards/rejected": -6.271247863769531, "step": 1250 }, { "epoch": 0.17831870931219926, "grad_norm": 33.1101276717767, "learning_rate": 9.130366409812863e-08, "logits/chosen": -0.9865830540657043, "logits/rejected": -1.1830883026123047, "logps/chosen": -243.4932403564453, "logps/rejected": -633.2340698242188, "loss": 0.1332, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0462841987609863, "rewards/margins": 3.8592209815979004, "rewards/rejected": -5.905505180358887, "step": 1260 }, { "epoch": 0.1797339371638834, "grad_norm": 95.80984998952557, "learning_rate": 9.114640666771504e-08, "logits/chosen": -0.9889016151428223, "logits/rejected": -1.2411140203475952, "logps/chosen": -221.8618927001953, "logps/rejected": -618.4512939453125, "loss": 0.1646, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.825467824935913, "rewards/margins": 3.934112071990967, "rewards/rejected": -5.759579658508301, "step": 1270 }, { "epoch": 0.1811491650155675, "grad_norm": 84.94430446231756, "learning_rate": 9.098914923730146e-08, "logits/chosen": -0.9496995806694031, "logits/rejected": -1.1947873830795288, "logps/chosen": -212.569091796875, "logps/rejected": -644.1823120117188, "loss": 0.1367, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7308467626571655, "rewards/margins": 4.309643745422363, "rewards/rejected": -6.04049015045166, "step": 1280 }, { "epoch": 0.18256439286725162, "grad_norm": 55.221902427280284, "learning_rate": 9.083189180688788e-08, "logits/chosen": -0.9798307418823242, "logits/rejected": -1.1983228921890259, "logps/chosen": -216.7918701171875, "logps/rejected": -648.9885864257812, "loss": 0.1544, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7788000106811523, "rewards/margins": 4.304306983947754, "rewards/rejected": -6.083106517791748, "step": 1290 }, { "epoch": 0.18397962071893575, "grad_norm": 83.13632812128361, "learning_rate": 9.067463437647428e-08, "logits/chosen": -0.9191730618476868, "logits/rejected": -1.1414375305175781, "logps/chosen": -218.7190704345703, "logps/rejected": -683.9268798828125, "loss": 0.148, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7999639511108398, "rewards/margins": 4.59537410736084, "rewards/rejected": -6.395338535308838, "step": 1300 }, { "epoch": 0.18539484857061986, "grad_norm": 105.398099572323, "learning_rate": 9.05173769460607e-08, "logits/chosen": -1.0336997509002686, "logits/rejected": -1.2598718404769897, "logps/chosen": -260.232177734375, "logps/rejected": -680.962890625, "loss": 0.1595, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2135777473449707, "rewards/margins": 4.209969997406006, "rewards/rejected": -6.423547267913818, "step": 1310 }, { "epoch": 0.186810076422304, "grad_norm": 90.82248244026033, "learning_rate": 9.036011951564711e-08, "logits/chosen": -1.0507996082305908, "logits/rejected": -1.219116449356079, "logps/chosen": -232.3948516845703, "logps/rejected": -671.6532592773438, "loss": 0.2234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.922427773475647, "rewards/margins": 4.371121406555176, "rewards/rejected": -6.293549537658691, "step": 1320 }, { "epoch": 0.18822530427398812, "grad_norm": 48.95966019891586, "learning_rate": 9.020286208523352e-08, "logits/chosen": -0.9687641263008118, "logits/rejected": -1.1578855514526367, "logps/chosen": -208.1879425048828, "logps/rejected": -595.1981201171875, "loss": 0.1347, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.6962944269180298, "rewards/margins": 3.8800883293151855, "rewards/rejected": -5.576382637023926, "step": 1330 }, { "epoch": 0.18964053212567222, "grad_norm": 44.655612060716365, "learning_rate": 9.004560465481993e-08, "logits/chosen": -1.037848711013794, "logits/rejected": -1.228533148765564, "logps/chosen": -220.9687042236328, "logps/rejected": -658.784423828125, "loss": 0.1054, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.8116741180419922, "rewards/margins": 4.366452217102051, "rewards/rejected": -6.178126811981201, "step": 1340 }, { "epoch": 0.19105575997735635, "grad_norm": 35.428293433205575, "learning_rate": 8.988834722440635e-08, "logits/chosen": -1.0142807960510254, "logits/rejected": -1.2566931247711182, "logps/chosen": -253.7073211669922, "logps/rejected": -687.623779296875, "loss": 0.1287, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.1584787368774414, "rewards/margins": 4.308650016784668, "rewards/rejected": -6.467129707336426, "step": 1350 }, { "epoch": 0.19247098782904049, "grad_norm": 105.41372295304758, "learning_rate": 8.973108979399277e-08, "logits/chosen": -1.0217695236206055, "logits/rejected": -1.2936155796051025, "logps/chosen": -219.22061157226562, "logps/rejected": -765.1702270507812, "loss": 0.1128, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8252952098846436, "rewards/margins": 5.428370475769043, "rewards/rejected": -7.253665924072266, "step": 1360 }, { "epoch": 0.1938862156807246, "grad_norm": 70.86787438033778, "learning_rate": 8.957383236357917e-08, "logits/chosen": -1.0182026624679565, "logits/rejected": -1.2437446117401123, "logps/chosen": -172.13351440429688, "logps/rejected": -631.3192749023438, "loss": 0.1344, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.3531746864318848, "rewards/margins": 4.559788703918457, "rewards/rejected": -5.912962913513184, "step": 1370 }, { "epoch": 0.19530144353240872, "grad_norm": 76.77180326295682, "learning_rate": 8.941657493316559e-08, "logits/chosen": -0.9426518678665161, "logits/rejected": -1.1705503463745117, "logps/chosen": -263.570556640625, "logps/rejected": -741.5023193359375, "loss": 0.1433, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.2462844848632812, "rewards/margins": 4.756198406219482, "rewards/rejected": -7.0024824142456055, "step": 1380 }, { "epoch": 0.19671667138409285, "grad_norm": 50.7240374853215, "learning_rate": 8.9259317502752e-08, "logits/chosen": -0.9886168241500854, "logits/rejected": -1.2130098342895508, "logps/chosen": -291.92169189453125, "logps/rejected": -776.5833129882812, "loss": 0.1322, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5073370933532715, "rewards/margins": 4.8198137283325195, "rewards/rejected": -7.327150821685791, "step": 1390 }, { "epoch": 0.19813189923577695, "grad_norm": 41.23267013947882, "learning_rate": 8.910206007233842e-08, "logits/chosen": -0.9542155265808105, "logits/rejected": -1.1512991189956665, "logps/chosen": -230.54147338867188, "logps/rejected": -715.4801635742188, "loss": 0.0835, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8985545635223389, "rewards/margins": 4.8432159423828125, "rewards/rejected": -6.7417707443237305, "step": 1400 }, { "epoch": 0.19954712708746108, "grad_norm": 62.97772603191195, "learning_rate": 8.894480264192484e-08, "logits/chosen": -0.9959174990653992, "logits/rejected": -1.272325873374939, "logps/chosen": -234.9702606201172, "logps/rejected": -780.912353515625, "loss": 0.1674, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.952972412109375, "rewards/margins": 5.438794136047363, "rewards/rejected": -7.3917670249938965, "step": 1410 }, { "epoch": 0.20096235493914522, "grad_norm": 67.86896162514005, "learning_rate": 8.878754521151124e-08, "logits/chosen": -0.9580492973327637, "logits/rejected": -1.2175936698913574, "logps/chosen": -223.46896362304688, "logps/rejected": -677.8434448242188, "loss": 0.1054, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8475925922393799, "rewards/margins": 4.52515172958374, "rewards/rejected": -6.372744083404541, "step": 1420 }, { "epoch": 0.20237758279082932, "grad_norm": 77.01068157354722, "learning_rate": 8.863028778109766e-08, "logits/chosen": -0.9134212732315063, "logits/rejected": -1.1295074224472046, "logps/chosen": -257.8914489746094, "logps/rejected": -727.1107788085938, "loss": 0.1777, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.172936201095581, "rewards/margins": 4.672344207763672, "rewards/rejected": -6.84528112411499, "step": 1430 }, { "epoch": 0.20379281064251345, "grad_norm": 155.1218358769449, "learning_rate": 8.847303035068406e-08, "logits/chosen": -1.0430316925048828, "logits/rejected": -1.3026642799377441, "logps/chosen": -262.93316650390625, "logps/rejected": -761.4536743164062, "loss": 0.1341, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.246554374694824, "rewards/margins": 4.968133449554443, "rewards/rejected": -7.214688777923584, "step": 1440 }, { "epoch": 0.20520803849419755, "grad_norm": 35.654732552138626, "learning_rate": 8.831577292027049e-08, "logits/chosen": -1.0949819087982178, "logits/rejected": -1.3082778453826904, "logps/chosen": -242.615234375, "logps/rejected": -775.3088989257812, "loss": 0.1098, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0398976802825928, "rewards/margins": 5.320385456085205, "rewards/rejected": -7.360282897949219, "step": 1450 }, { "epoch": 0.20662326634588168, "grad_norm": 161.27033751263224, "learning_rate": 8.815851548985689e-08, "logits/chosen": -1.0510737895965576, "logits/rejected": -1.3533920049667358, "logps/chosen": -255.3545379638672, "logps/rejected": -744.12158203125, "loss": 0.105, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1622469425201416, "rewards/margins": 4.849689483642578, "rewards/rejected": -7.011935234069824, "step": 1460 }, { "epoch": 0.20803849419756582, "grad_norm": 19.877202748047214, "learning_rate": 8.800125805944331e-08, "logits/chosen": -1.0803146362304688, "logits/rejected": -1.3254854679107666, "logps/chosen": -224.1918182373047, "logps/rejected": -784.4004516601562, "loss": 0.1106, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8500711917877197, "rewards/margins": 5.566481590270996, "rewards/rejected": -7.416552543640137, "step": 1470 }, { "epoch": 0.20945372204924992, "grad_norm": 81.5682804829961, "learning_rate": 8.784400062902971e-08, "logits/chosen": -0.9939289093017578, "logits/rejected": -1.2583261728286743, "logps/chosen": -255.2446746826172, "logps/rejected": -784.6729125976562, "loss": 0.0956, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.16072940826416, "rewards/margins": 5.2793474197387695, "rewards/rejected": -7.440077304840088, "step": 1480 }, { "epoch": 0.21086894990093405, "grad_norm": 44.246742183703134, "learning_rate": 8.768674319861613e-08, "logits/chosen": -1.0049759149551392, "logits/rejected": -1.2374731302261353, "logps/chosen": -232.8585968017578, "logps/rejected": -820.8663940429688, "loss": 0.117, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9336341619491577, "rewards/margins": 5.824029922485352, "rewards/rejected": -7.757663726806641, "step": 1490 }, { "epoch": 0.21228417775261818, "grad_norm": 99.09629702756041, "learning_rate": 8.752948576820254e-08, "logits/chosen": -1.0300439596176147, "logits/rejected": -1.3347264528274536, "logps/chosen": -250.06680297851562, "logps/rejected": -784.7390747070312, "loss": 0.1158, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.110076665878296, "rewards/margins": 5.3244194984436035, "rewards/rejected": -7.4344964027404785, "step": 1500 }, { "epoch": 0.21369940560430228, "grad_norm": 27.00501477009988, "learning_rate": 8.737222833778896e-08, "logits/chosen": -1.0014476776123047, "logits/rejected": -1.316119909286499, "logps/chosen": -236.68740844726562, "logps/rejected": -860.0587768554688, "loss": 0.0856, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9803781509399414, "rewards/margins": 6.201742649078369, "rewards/rejected": -8.182121276855469, "step": 1510 }, { "epoch": 0.21511463345598641, "grad_norm": 16.34141930408971, "learning_rate": 8.721497090737538e-08, "logits/chosen": -1.045770287513733, "logits/rejected": -1.2994716167449951, "logps/chosen": -258.20892333984375, "logps/rejected": -949.69287109375, "loss": 0.0485, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1979806423187256, "rewards/margins": 6.875175476074219, "rewards/rejected": -9.073156356811523, "step": 1520 }, { "epoch": 0.21652986130767055, "grad_norm": 79.31340758667446, "learning_rate": 8.705771347696178e-08, "logits/chosen": -1.0291882753372192, "logits/rejected": -1.3428499698638916, "logps/chosen": -301.4451599121094, "logps/rejected": -942.4006958007812, "loss": 0.1236, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6222176551818848, "rewards/margins": 6.3228373527526855, "rewards/rejected": -8.945055961608887, "step": 1530 }, { "epoch": 0.21794508915935465, "grad_norm": 27.50912263320526, "learning_rate": 8.69004560465482e-08, "logits/chosen": -0.9813095331192017, "logits/rejected": -1.261991262435913, "logps/chosen": -252.53466796875, "logps/rejected": -884.7117919921875, "loss": 0.0796, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1184792518615723, "rewards/margins": 6.3269853591918945, "rewards/rejected": -8.445463180541992, "step": 1540 }, { "epoch": 0.21936031701103878, "grad_norm": 9.435460508984587, "learning_rate": 8.67431986161346e-08, "logits/chosen": -1.0452015399932861, "logits/rejected": -1.3347809314727783, "logps/chosen": -239.9413604736328, "logps/rejected": -985.7352294921875, "loss": 0.0648, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0183427333831787, "rewards/margins": 7.426258087158203, "rewards/rejected": -9.444601058959961, "step": 1550 }, { "epoch": 0.2207755448627229, "grad_norm": 145.35922022845978, "learning_rate": 8.658594118572103e-08, "logits/chosen": -1.0197160243988037, "logits/rejected": -1.31730055809021, "logps/chosen": -315.4419250488281, "logps/rejected": -943.4249267578125, "loss": 0.1525, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7537436485290527, "rewards/margins": 6.288424491882324, "rewards/rejected": -9.042169570922852, "step": 1560 }, { "epoch": 0.22219077271440701, "grad_norm": 167.9401466523237, "learning_rate": 8.642868375530743e-08, "logits/chosen": -1.0365883111953735, "logits/rejected": -1.3355438709259033, "logps/chosen": -261.03070068359375, "logps/rejected": -849.4244995117188, "loss": 0.1118, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2043728828430176, "rewards/margins": 5.877452373504639, "rewards/rejected": -8.081826210021973, "step": 1570 }, { "epoch": 0.22360600056609115, "grad_norm": 64.76944414253013, "learning_rate": 8.627142632489385e-08, "logits/chosen": -1.0419955253601074, "logits/rejected": -1.344757080078125, "logps/chosen": -234.99984741210938, "logps/rejected": -913.7181396484375, "loss": 0.0765, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.977147102355957, "rewards/margins": 6.736481666564941, "rewards/rejected": -8.713628768920898, "step": 1580 }, { "epoch": 0.22502122841777525, "grad_norm": 78.74518748347779, "learning_rate": 8.611416889448027e-08, "logits/chosen": -1.1402156352996826, "logits/rejected": -1.3536899089813232, "logps/chosen": -251.9950408935547, "logps/rejected": -885.5095825195312, "loss": 0.0775, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1237950325012207, "rewards/margins": 6.305428504943848, "rewards/rejected": -8.429224014282227, "step": 1590 }, { "epoch": 0.22643645626945938, "grad_norm": 115.83816265329592, "learning_rate": 8.595691146406667e-08, "logits/chosen": -1.0886294841766357, "logits/rejected": -1.4165984392166138, "logps/chosen": -238.9476776123047, "logps/rejected": -744.6459350585938, "loss": 0.1214, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9994093179702759, "rewards/margins": 5.03980827331543, "rewards/rejected": -7.039217948913574, "step": 1600 }, { "epoch": 0.2278516841211435, "grad_norm": 8.475359715880463, "learning_rate": 8.579965403365309e-08, "logits/chosen": -1.0879220962524414, "logits/rejected": -1.3807090520858765, "logps/chosen": -237.75387573242188, "logps/rejected": -926.9197387695312, "loss": 0.0796, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0023555755615234, "rewards/margins": 6.842606544494629, "rewards/rejected": -8.844962120056152, "step": 1610 }, { "epoch": 0.22926691197282761, "grad_norm": 65.03442406620553, "learning_rate": 8.56423966032395e-08, "logits/chosen": -1.0935876369476318, "logits/rejected": -1.2841249704360962, "logps/chosen": -247.2875213623047, "logps/rejected": -870.8713989257812, "loss": 0.1084, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.0813257694244385, "rewards/margins": 6.209120750427246, "rewards/rejected": -8.290446281433105, "step": 1620 }, { "epoch": 0.23068213982451174, "grad_norm": 120.1225890910222, "learning_rate": 8.548513917282592e-08, "logits/chosen": -1.0898298025131226, "logits/rejected": -1.4127492904663086, "logps/chosen": -304.3668212890625, "logps/rejected": -1062.267578125, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": -2.6380114555358887, "rewards/margins": 7.534548759460449, "rewards/rejected": -10.17255973815918, "step": 1630 }, { "epoch": 0.23209736767619588, "grad_norm": 49.0595923982186, "learning_rate": 8.532788174241232e-08, "logits/chosen": -1.1507256031036377, "logits/rejected": -1.4658862352371216, "logps/chosen": -256.20977783203125, "logps/rejected": -972.4527587890625, "loss": 0.0824, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1857831478118896, "rewards/margins": 7.107316017150879, "rewards/rejected": -9.293098449707031, "step": 1640 }, { "epoch": 0.23351259552787998, "grad_norm": 240.5775376014875, "learning_rate": 8.517062431199874e-08, "logits/chosen": -1.1672465801239014, "logits/rejected": -1.4831058979034424, "logps/chosen": -273.6617736816406, "logps/rejected": -1001.8552856445312, "loss": 0.0686, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.3377811908721924, "rewards/margins": 7.2646660804748535, "rewards/rejected": -9.602446556091309, "step": 1650 }, { "epoch": 0.2349278233795641, "grad_norm": 48.258144982881824, "learning_rate": 8.501336688158514e-08, "logits/chosen": -1.1226476430892944, "logits/rejected": -1.398794412612915, "logps/chosen": -304.51483154296875, "logps/rejected": -980.697265625, "loss": 0.1084, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6522862911224365, "rewards/margins": 6.716960906982422, "rewards/rejected": -9.369247436523438, "step": 1660 }, { "epoch": 0.23634305123124824, "grad_norm": 34.693322692842955, "learning_rate": 8.485610945117157e-08, "logits/chosen": -1.1533557176589966, "logits/rejected": -1.51079261302948, "logps/chosen": -238.3592071533203, "logps/rejected": -925.0869140625, "loss": 0.07, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0108284950256348, "rewards/margins": 6.826669216156006, "rewards/rejected": -8.837496757507324, "step": 1670 }, { "epoch": 0.23775827908293234, "grad_norm": 23.546124397287297, "learning_rate": 8.469885202075797e-08, "logits/chosen": -1.158387541770935, "logits/rejected": -1.4747251272201538, "logps/chosen": -270.23187255859375, "logps/rejected": -1046.08154296875, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -2.313932180404663, "rewards/margins": 7.730319023132324, "rewards/rejected": -10.044252395629883, "step": 1680 }, { "epoch": 0.23917350693461648, "grad_norm": 154.056753586799, "learning_rate": 8.454159459034439e-08, "logits/chosen": -1.2301030158996582, "logits/rejected": -1.5585758686065674, "logps/chosen": -327.29522705078125, "logps/rejected": -1076.8006591796875, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -2.8804945945739746, "rewards/margins": 7.478305816650391, "rewards/rejected": -10.358799934387207, "step": 1690 }, { "epoch": 0.2405887347863006, "grad_norm": 185.92556613747118, "learning_rate": 8.438433715993081e-08, "logits/chosen": -1.321200966835022, "logits/rejected": -1.635229468345642, "logps/chosen": -269.4712219238281, "logps/rejected": -1082.8414306640625, "loss": 0.1156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.2980873584747314, "rewards/margins": 8.125906944274902, "rewards/rejected": -10.423994064331055, "step": 1700 }, { "epoch": 0.2420039626379847, "grad_norm": 29.63198636619727, "learning_rate": 8.422707972951721e-08, "logits/chosen": -1.247776746749878, "logits/rejected": -1.5093656778335571, "logps/chosen": -210.47506713867188, "logps/rejected": -899.1605224609375, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": -1.723257303237915, "rewards/margins": 6.848459720611572, "rewards/rejected": -8.571717262268066, "step": 1710 }, { "epoch": 0.24341919048966884, "grad_norm": 40.12261857303457, "learning_rate": 8.406982229910363e-08, "logits/chosen": -1.2773103713989258, "logits/rejected": -1.5729384422302246, "logps/chosen": -282.3523864746094, "logps/rejected": -1005.9052734375, "loss": 0.1065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4418158531188965, "rewards/margins": 7.220088958740234, "rewards/rejected": -9.661905288696289, "step": 1720 }, { "epoch": 0.24483441834135294, "grad_norm": 36.03582352358788, "learning_rate": 8.391256486869004e-08, "logits/chosen": -1.2736775875091553, "logits/rejected": -1.5491009950637817, "logps/chosen": -266.3653564453125, "logps/rejected": -918.5739135742188, "loss": 0.1591, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.2589383125305176, "rewards/margins": 6.503336429595947, "rewards/rejected": -8.762274742126465, "step": 1730 }, { "epoch": 0.24624964619303708, "grad_norm": 244.8423154274902, "learning_rate": 8.375530743827646e-08, "logits/chosen": -1.286417841911316, "logits/rejected": -1.6887359619140625, "logps/chosen": -208.16073608398438, "logps/rejected": -866.2664794921875, "loss": 0.0599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7032887935638428, "rewards/margins": 6.5796217918396, "rewards/rejected": -8.28291130065918, "step": 1740 }, { "epoch": 0.2476648740447212, "grad_norm": 5.9767081412877, "learning_rate": 8.359805000786286e-08, "logits/chosen": -1.3049333095550537, "logits/rejected": -1.707299828529358, "logps/chosen": -277.3440246582031, "logps/rejected": -1068.82373046875, "loss": 0.0837, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.3840079307556152, "rewards/margins": 7.867032527923584, "rewards/rejected": -10.251039505004883, "step": 1750 }, { "epoch": 0.2490801018964053, "grad_norm": 121.032027148172, "learning_rate": 8.344079257744928e-08, "logits/chosen": -1.2577747106552124, "logits/rejected": -1.5698373317718506, "logps/chosen": -283.487060546875, "logps/rejected": -1089.9752197265625, "loss": 0.0942, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.4383418560028076, "rewards/margins": 8.020307540893555, "rewards/rejected": -10.458650588989258, "step": 1760 }, { "epoch": 0.25049532974808947, "grad_norm": 10.280590562297668, "learning_rate": 8.328353514703568e-08, "logits/chosen": -1.2717167139053345, "logits/rejected": -1.6324609518051147, "logps/chosen": -296.6659851074219, "logps/rejected": -1060.2054443359375, "loss": 0.0861, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.578519344329834, "rewards/margins": 7.635247230529785, "rewards/rejected": -10.213766098022461, "step": 1770 }, { "epoch": 0.25191055759977354, "grad_norm": 39.60631233400898, "learning_rate": 8.312627771662211e-08, "logits/chosen": -1.2342199087142944, "logits/rejected": -1.6208432912826538, "logps/chosen": -290.3011474609375, "logps/rejected": -1114.697998046875, "loss": 0.0776, "rewards/accuracies": 1.0, "rewards/chosen": -2.5060887336730957, "rewards/margins": 8.187765121459961, "rewards/rejected": -10.693853378295898, "step": 1780 }, { "epoch": 0.2533257854514577, "grad_norm": 217.5207140782338, "learning_rate": 8.296902028620853e-08, "logits/chosen": -1.2590113878250122, "logits/rejected": -1.6326417922973633, "logps/chosen": -301.56298828125, "logps/rejected": -1069.114501953125, "loss": 0.0972, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6272332668304443, "rewards/margins": 7.661952018737793, "rewards/rejected": -10.2891845703125, "step": 1790 }, { "epoch": 0.2547410133031418, "grad_norm": 25.766040963917558, "learning_rate": 8.281176285579493e-08, "logits/chosen": -1.2759778499603271, "logits/rejected": -1.5915820598602295, "logps/chosen": -289.8285827636719, "logps/rejected": -1056.68701171875, "loss": 0.0836, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.5121772289276123, "rewards/margins": 7.6444091796875, "rewards/rejected": -10.156585693359375, "step": 1800 }, { "epoch": 0.25615624115482594, "grad_norm": 27.190317625745507, "learning_rate": 8.265450542538135e-08, "logits/chosen": -1.2489023208618164, "logits/rejected": -1.6761865615844727, "logps/chosen": -303.320556640625, "logps/rejected": -1305.0247802734375, "loss": 0.0856, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.647592782974243, "rewards/margins": 9.974966049194336, "rewards/rejected": -12.622559547424316, "step": 1810 }, { "epoch": 0.25757146900651007, "grad_norm": 18.771684679385068, "learning_rate": 8.249724799496775e-08, "logits/chosen": -1.2561554908752441, "logits/rejected": -1.5981924533843994, "logps/chosen": -292.35479736328125, "logps/rejected": -1171.3984375, "loss": 0.0749, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5544931888580322, "rewards/margins": 8.729516983032227, "rewards/rejected": -11.28400993347168, "step": 1820 }, { "epoch": 0.25898669685819414, "grad_norm": 16.96380308498728, "learning_rate": 8.233999056455417e-08, "logits/chosen": -1.2930470705032349, "logits/rejected": -1.7083736658096313, "logps/chosen": -278.38470458984375, "logps/rejected": -1068.476318359375, "loss": 0.0665, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.376704454421997, "rewards/margins": 7.880471229553223, "rewards/rejected": -10.25717544555664, "step": 1830 }, { "epoch": 0.2604019247098783, "grad_norm": 42.06946247564984, "learning_rate": 8.218273313414058e-08, "logits/chosen": -1.291431188583374, "logits/rejected": -1.6678968667984009, "logps/chosen": -283.7499694824219, "logps/rejected": -1060.979248046875, "loss": 0.0708, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.44490385055542, "rewards/margins": 7.7657623291015625, "rewards/rejected": -10.21066665649414, "step": 1840 }, { "epoch": 0.2618171525615624, "grad_norm": 12.756954265328979, "learning_rate": 8.2025475703727e-08, "logits/chosen": -1.3464069366455078, "logits/rejected": -1.6567636728286743, "logps/chosen": -394.8826599121094, "logps/rejected": -1212.0299072265625, "loss": 0.0621, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.5555508136749268, "rewards/margins": 8.148138999938965, "rewards/rejected": -11.703689575195312, "step": 1850 }, { "epoch": 0.26323238041324654, "grad_norm": 26.355764014418103, "learning_rate": 8.18682182733134e-08, "logits/chosen": -1.2956254482269287, "logits/rejected": -1.7453333139419556, "logps/chosen": -330.20849609375, "logps/rejected": -1248.768798828125, "loss": 0.0545, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.9076943397521973, "rewards/margins": 9.144111633300781, "rewards/rejected": -12.05180549621582, "step": 1860 }, { "epoch": 0.26464760826493067, "grad_norm": 181.30294048274453, "learning_rate": 8.171096084289982e-08, "logits/chosen": -1.2601182460784912, "logits/rejected": -1.6800670623779297, "logps/chosen": -359.07244873046875, "logps/rejected": -1318.252197265625, "loss": 0.0625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.1856770515441895, "rewards/margins": 9.553983688354492, "rewards/rejected": -12.73966121673584, "step": 1870 }, { "epoch": 0.2660628361166148, "grad_norm": 16.177347681253504, "learning_rate": 8.155370341248624e-08, "logits/chosen": -1.2959920167922974, "logits/rejected": -1.6526597738265991, "logps/chosen": -362.14361572265625, "logps/rejected": -1116.736083984375, "loss": 0.0759, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2228646278381348, "rewards/margins": 7.545233726501465, "rewards/rejected": -10.768098831176758, "step": 1880 }, { "epoch": 0.2674780639682989, "grad_norm": 27.57857473010759, "learning_rate": 8.139644598207265e-08, "logits/chosen": -1.2660375833511353, "logits/rejected": -1.5997194051742554, "logps/chosen": -318.9806213378906, "logps/rejected": -1131.208251953125, "loss": 0.083, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.805020809173584, "rewards/margins": 8.10765266418457, "rewards/rejected": -10.91267204284668, "step": 1890 }, { "epoch": 0.268893291819983, "grad_norm": 22.86210392047802, "learning_rate": 8.123918855165907e-08, "logits/chosen": -1.2314846515655518, "logits/rejected": -1.662764549255371, "logps/chosen": -282.24176025390625, "logps/rejected": -1029.7113037109375, "loss": 0.0952, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.446249485015869, "rewards/margins": 7.450293064117432, "rewards/rejected": -9.896541595458984, "step": 1900 }, { "epoch": 0.27030851967166714, "grad_norm": 23.77064104990046, "learning_rate": 8.108193112124547e-08, "logits/chosen": -1.2026584148406982, "logits/rejected": -1.5760929584503174, "logps/chosen": -290.0473937988281, "logps/rejected": -1149.2188720703125, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": -2.509028673171997, "rewards/margins": 8.592510223388672, "rewards/rejected": -11.101539611816406, "step": 1910 }, { "epoch": 0.27172374752335127, "grad_norm": 167.1935296887813, "learning_rate": 8.092467369083189e-08, "logits/chosen": -1.273739218711853, "logits/rejected": -1.701664686203003, "logps/chosen": -311.7950134277344, "logps/rejected": -1266.8057861328125, "loss": 0.0786, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7107794284820557, "rewards/margins": 9.501970291137695, "rewards/rejected": -12.212750434875488, "step": 1920 }, { "epoch": 0.2731389753750354, "grad_norm": 217.56098349285244, "learning_rate": 8.076741626041829e-08, "logits/chosen": -1.253987431526184, "logits/rejected": -1.593817949295044, "logps/chosen": -261.87664794921875, "logps/rejected": -1082.9873046875, "loss": 0.0844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.214069128036499, "rewards/margins": 8.202198028564453, "rewards/rejected": -10.416267395019531, "step": 1930 }, { "epoch": 0.27455420322671953, "grad_norm": 13.79772530531688, "learning_rate": 8.061015883000471e-08, "logits/chosen": -1.2009334564208984, "logits/rejected": -1.5788062810897827, "logps/chosen": -259.0775146484375, "logps/rejected": -1229.555908203125, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": -2.2062714099884033, "rewards/margins": 9.647671699523926, "rewards/rejected": -11.85394287109375, "step": 1940 }, { "epoch": 0.2759694310784036, "grad_norm": 131.6999872410733, "learning_rate": 8.045290139959113e-08, "logits/chosen": -1.214005470275879, "logits/rejected": -1.5889233350753784, "logps/chosen": -300.55328369140625, "logps/rejected": -1215.917724609375, "loss": 0.0677, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.62620210647583, "rewards/margins": 9.107135772705078, "rewards/rejected": -11.73333740234375, "step": 1950 }, { "epoch": 0.27738465893008774, "grad_norm": 67.2263592296307, "learning_rate": 8.029564396917754e-08, "logits/chosen": -1.2921075820922852, "logits/rejected": -1.6138296127319336, "logps/chosen": -298.8183898925781, "logps/rejected": -1229.69140625, "loss": 0.093, "rewards/accuracies": 0.9375, "rewards/chosen": -2.603281259536743, "rewards/margins": 9.284937858581543, "rewards/rejected": -11.888218879699707, "step": 1960 }, { "epoch": 0.27879988678177187, "grad_norm": 13.984022147872569, "learning_rate": 8.013838653876396e-08, "logits/chosen": -1.2443221807479858, "logits/rejected": -1.6034612655639648, "logps/chosen": -319.3099670410156, "logps/rejected": -1136.894775390625, "loss": 0.0367, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7922282218933105, "rewards/margins": 8.179818153381348, "rewards/rejected": -10.972046852111816, "step": 1970 }, { "epoch": 0.280215114633456, "grad_norm": 100.63247587342461, "learning_rate": 7.998112910835036e-08, "logits/chosen": -1.2507084608078003, "logits/rejected": -1.5561904907226562, "logps/chosen": -354.53570556640625, "logps/rejected": -1124.539306640625, "loss": 0.054, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1481170654296875, "rewards/margins": 7.6891584396362305, "rewards/rejected": -10.837276458740234, "step": 1980 }, { "epoch": 0.28163034248514013, "grad_norm": 30.011332507433018, "learning_rate": 7.982387167793678e-08, "logits/chosen": -1.314346194267273, "logits/rejected": -1.6933567523956299, "logps/chosen": -275.63873291015625, "logps/rejected": -1207.818115234375, "loss": 0.0794, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.365189790725708, "rewards/margins": 9.308884620666504, "rewards/rejected": -11.67407512664795, "step": 1990 }, { "epoch": 0.2830455703368242, "grad_norm": 109.77055870430452, "learning_rate": 7.96666142475232e-08, "logits/chosen": -1.2771005630493164, "logits/rejected": -1.6732105016708374, "logps/chosen": -290.7798767089844, "logps/rejected": -1269.3388671875, "loss": 0.0603, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4822163581848145, "rewards/margins": 9.75985050201416, "rewards/rejected": -12.242067337036133, "step": 2000 }, { "epoch": 0.28446079818850833, "grad_norm": 104.19634230028646, "learning_rate": 7.950935681710961e-08, "logits/chosen": -1.2529141902923584, "logits/rejected": -1.5660479068756104, "logps/chosen": -287.0341796875, "logps/rejected": -1135.184326171875, "loss": 0.1204, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.4632718563079834, "rewards/margins": 8.458829879760742, "rewards/rejected": -10.922101020812988, "step": 2010 }, { "epoch": 0.28587602604019247, "grad_norm": 63.758824571744455, "learning_rate": 7.935209938669601e-08, "logits/chosen": -1.2067530155181885, "logits/rejected": -1.587576985359192, "logps/chosen": -272.4522705078125, "logps/rejected": -1070.5614013671875, "loss": 0.0459, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.349116086959839, "rewards/margins": 7.980711936950684, "rewards/rejected": -10.329828262329102, "step": 2020 }, { "epoch": 0.2872912538918766, "grad_norm": 80.25562796959109, "learning_rate": 7.919484195628243e-08, "logits/chosen": -1.1856313943862915, "logits/rejected": -1.58025062084198, "logps/chosen": -316.2076110839844, "logps/rejected": -1147.622802734375, "loss": 0.0765, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7704365253448486, "rewards/margins": 8.33271598815918, "rewards/rejected": -11.10315227508545, "step": 2030 }, { "epoch": 0.28870648174356073, "grad_norm": 155.16073705452632, "learning_rate": 7.903758452586883e-08, "logits/chosen": -1.207275390625, "logits/rejected": -1.5621681213378906, "logps/chosen": -245.6144561767578, "logps/rejected": -1140.98681640625, "loss": 0.0769, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0646300315856934, "rewards/margins": 8.914287567138672, "rewards/rejected": -10.978918075561523, "step": 2040 }, { "epoch": 0.29012170959524486, "grad_norm": 78.76771750828243, "learning_rate": 7.888032709545525e-08, "logits/chosen": -1.2309887409210205, "logits/rejected": -1.6365396976470947, "logps/chosen": -274.21185302734375, "logps/rejected": -1226.4423828125, "loss": 0.0553, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3601770401000977, "rewards/margins": 9.486477851867676, "rewards/rejected": -11.846656799316406, "step": 2050 }, { "epoch": 0.29153693744692893, "grad_norm": 24.266427918320534, "learning_rate": 7.872306966504168e-08, "logits/chosen": -1.248676061630249, "logits/rejected": -1.574230432510376, "logps/chosen": -308.7959899902344, "logps/rejected": -1115.482177734375, "loss": 0.0526, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7142086029052734, "rewards/margins": 8.020590782165527, "rewards/rejected": -10.734800338745117, "step": 2060 }, { "epoch": 0.29295216529861307, "grad_norm": 90.04576063317717, "learning_rate": 7.856581223462808e-08, "logits/chosen": -1.2400405406951904, "logits/rejected": -1.5962671041488647, "logps/chosen": -284.86773681640625, "logps/rejected": -1279.018798828125, "loss": 0.0407, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.469104051589966, "rewards/margins": 9.926645278930664, "rewards/rejected": -12.395750045776367, "step": 2070 }, { "epoch": 0.2943673931502972, "grad_norm": 15.300542952803646, "learning_rate": 7.84085548042145e-08, "logits/chosen": -1.2773112058639526, "logits/rejected": -1.6145603656768799, "logps/chosen": -264.6637878417969, "logps/rejected": -1127.946533203125, "loss": 0.0602, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.262174606323242, "rewards/margins": 8.629032135009766, "rewards/rejected": -10.891206741333008, "step": 2080 }, { "epoch": 0.2957826210019813, "grad_norm": 27.855480892027153, "learning_rate": 7.82512973738009e-08, "logits/chosen": -1.287395715713501, "logits/rejected": -1.6918151378631592, "logps/chosen": -304.06805419921875, "logps/rejected": -1227.5576171875, "loss": 0.0512, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.648946762084961, "rewards/margins": 9.228964805603027, "rewards/rejected": -11.877911567687988, "step": 2090 }, { "epoch": 0.29719784885366546, "grad_norm": 7.5461546111448214, "learning_rate": 7.809403994338732e-08, "logits/chosen": -1.2620644569396973, "logits/rejected": -1.6742318868637085, "logps/chosen": -296.4757995605469, "logps/rejected": -1338.3111572265625, "loss": 0.0487, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5868494510650635, "rewards/margins": 10.381125450134277, "rewards/rejected": -12.967976570129395, "step": 2100 }, { "epoch": 0.29861307670534953, "grad_norm": 60.83584449217975, "learning_rate": 7.793678251297374e-08, "logits/chosen": -1.3573805093765259, "logits/rejected": -1.8214982748031616, "logps/chosen": -320.2242736816406, "logps/rejected": -1498.3624267578125, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -2.8052756786346436, "rewards/margins": 11.750875473022461, "rewards/rejected": -14.556150436401367, "step": 2110 }, { "epoch": 0.30002830455703366, "grad_norm": 216.4206403742119, "learning_rate": 7.777952508256015e-08, "logits/chosen": -1.344424843788147, "logits/rejected": -1.7294727563858032, "logps/chosen": -350.6932678222656, "logps/rejected": -1310.095947265625, "loss": 0.0917, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1095123291015625, "rewards/margins": 9.54160213470459, "rewards/rejected": -12.651114463806152, "step": 2120 }, { "epoch": 0.3014435324087178, "grad_norm": 41.57179823577414, "learning_rate": 7.762226765214656e-08, "logits/chosen": -1.358493447303772, "logits/rejected": -1.7193434238433838, "logps/chosen": -262.81170654296875, "logps/rejected": -1269.5130615234375, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": -2.242377996444702, "rewards/margins": 10.045086860656738, "rewards/rejected": -12.28746509552002, "step": 2130 }, { "epoch": 0.3028587602604019, "grad_norm": 37.69201489029228, "learning_rate": 7.746501022173297e-08, "logits/chosen": -1.3357547521591187, "logits/rejected": -1.7983388900756836, "logps/chosen": -325.90130615234375, "logps/rejected": -1452.2110595703125, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": -2.864941120147705, "rewards/margins": 11.226659774780273, "rewards/rejected": -14.09160041809082, "step": 2140 }, { "epoch": 0.30427398811208606, "grad_norm": 205.82334825088913, "learning_rate": 7.730775279131939e-08, "logits/chosen": -1.4062248468399048, "logits/rejected": -1.8384281396865845, "logps/chosen": -276.4056091308594, "logps/rejected": -1311.0147705078125, "loss": 0.1317, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.3662867546081543, "rewards/margins": 10.33594799041748, "rewards/rejected": -12.702235221862793, "step": 2150 }, { "epoch": 0.3056892159637702, "grad_norm": 21.517193283100838, "learning_rate": 7.71504953609058e-08, "logits/chosen": -1.3995798826217651, "logits/rejected": -1.689388632774353, "logps/chosen": -288.53521728515625, "logps/rejected": -1185.4766845703125, "loss": 0.0502, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.4929263591766357, "rewards/margins": 8.971369743347168, "rewards/rejected": -11.464296340942383, "step": 2160 }, { "epoch": 0.30710444381545426, "grad_norm": 236.87979450757004, "learning_rate": 7.699323793049222e-08, "logits/chosen": -1.3971426486968994, "logits/rejected": -1.766861915588379, "logps/chosen": -284.4190368652344, "logps/rejected": -1252.9423828125, "loss": 0.0529, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4633755683898926, "rewards/margins": 9.63781452178955, "rewards/rejected": -12.101190567016602, "step": 2170 }, { "epoch": 0.3085196716671384, "grad_norm": 66.74401001729865, "learning_rate": 7.683598050007862e-08, "logits/chosen": -1.4180428981781006, "logits/rejected": -1.7685191631317139, "logps/chosen": -288.4472351074219, "logps/rejected": -1331.633544921875, "loss": 0.0407, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4807467460632324, "rewards/margins": 10.367265701293945, "rewards/rejected": -12.848012924194336, "step": 2180 }, { "epoch": 0.3099348995188225, "grad_norm": 22.746309229871354, "learning_rate": 7.667872306966504e-08, "logits/chosen": -1.397639513015747, "logits/rejected": -1.7072055339813232, "logps/chosen": -254.79995727539062, "logps/rejected": -1123.602783203125, "loss": 0.0852, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.166815996170044, "rewards/margins": 8.68336296081543, "rewards/rejected": -10.850178718566895, "step": 2190 }, { "epoch": 0.31135012737050666, "grad_norm": 87.8628264261788, "learning_rate": 7.652146563925144e-08, "logits/chosen": -1.4035488367080688, "logits/rejected": -1.7525675296783447, "logps/chosen": -331.1602783203125, "logps/rejected": -1206.490234375, "loss": 0.0431, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.934971809387207, "rewards/margins": 8.741484642028809, "rewards/rejected": -11.676457405090332, "step": 2200 }, { "epoch": 0.3127653552221908, "grad_norm": 26.958760417903573, "learning_rate": 7.636420820883786e-08, "logits/chosen": -1.4322383403778076, "logits/rejected": -1.8511091470718384, "logps/chosen": -371.06768798828125, "logps/rejected": -1365.235107421875, "loss": 0.0327, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3136534690856934, "rewards/margins": 9.920957565307617, "rewards/rejected": -13.234613418579102, "step": 2210 }, { "epoch": 0.3141805830738749, "grad_norm": 35.02655443433355, "learning_rate": 7.620695077842428e-08, "logits/chosen": -1.3680996894836426, "logits/rejected": -1.7797648906707764, "logps/chosen": -314.8093566894531, "logps/rejected": -1378.62158203125, "loss": 0.0551, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.763277769088745, "rewards/margins": 10.599538803100586, "rewards/rejected": -13.362815856933594, "step": 2220 }, { "epoch": 0.315595810925559, "grad_norm": 55.2840733018424, "learning_rate": 7.60496933480107e-08, "logits/chosen": -1.3878624439239502, "logits/rejected": -1.7299811840057373, "logps/chosen": -284.9055480957031, "logps/rejected": -1231.7318115234375, "loss": 0.0442, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.472432851791382, "rewards/margins": 9.47036075592041, "rewards/rejected": -11.942793846130371, "step": 2230 }, { "epoch": 0.3170110387772431, "grad_norm": 87.51644402649218, "learning_rate": 7.589243591759711e-08, "logits/chosen": -1.3961400985717773, "logits/rejected": -1.7693138122558594, "logps/chosen": -383.29742431640625, "logps/rejected": -1335.23388671875, "loss": 0.0641, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.429373264312744, "rewards/margins": 9.518962860107422, "rewards/rejected": -12.948336601257324, "step": 2240 }, { "epoch": 0.31842626662892726, "grad_norm": 35.55977234558315, "learning_rate": 7.573517848718351e-08, "logits/chosen": -1.3850884437561035, "logits/rejected": -1.7548625469207764, "logps/chosen": -248.69949340820312, "logps/rejected": -1213.7552490234375, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -2.1136794090270996, "rewards/margins": 9.61608600616455, "rewards/rejected": -11.729764938354492, "step": 2250 }, { "epoch": 0.3198414944806114, "grad_norm": 10.04678453216828, "learning_rate": 7.557792105676993e-08, "logits/chosen": -1.4487738609313965, "logits/rejected": -1.8587297201156616, "logps/chosen": -323.7012634277344, "logps/rejected": -1641.881591796875, "loss": 0.0279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.840880870819092, "rewards/margins": 13.115617752075195, "rewards/rejected": -15.956497192382812, "step": 2260 }, { "epoch": 0.3212567223322955, "grad_norm": 47.55859373761358, "learning_rate": 7.542066362635635e-08, "logits/chosen": -1.5291962623596191, "logits/rejected": -1.9372308254241943, "logps/chosen": -347.1799011230469, "logps/rejected": -1421.714599609375, "loss": 0.1083, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.066540002822876, "rewards/margins": 10.695998191833496, "rewards/rejected": -13.762537956237793, "step": 2270 }, { "epoch": 0.3226719501839796, "grad_norm": 74.26456030675523, "learning_rate": 7.526340619594276e-08, "logits/chosen": -1.474798560142517, "logits/rejected": -1.8395397663116455, "logps/chosen": -285.46240234375, "logps/rejected": -1384.0584716796875, "loss": 0.0524, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4762847423553467, "rewards/margins": 10.958345413208008, "rewards/rejected": -13.434629440307617, "step": 2280 }, { "epoch": 0.3240871780356637, "grad_norm": 143.15396669254517, "learning_rate": 7.510614876552917e-08, "logits/chosen": -1.4447365999221802, "logits/rejected": -1.9038598537445068, "logps/chosen": -370.3476867675781, "logps/rejected": -1529.310302734375, "loss": 0.0849, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.31262469291687, "rewards/margins": 11.560396194458008, "rewards/rejected": -14.873022079467773, "step": 2290 }, { "epoch": 0.32550240588734786, "grad_norm": 17.085570566676907, "learning_rate": 7.494889133511558e-08, "logits/chosen": -1.477617859840393, "logits/rejected": -1.8695939779281616, "logps/chosen": -275.5816345214844, "logps/rejected": -1394.7835693359375, "loss": 0.1142, "rewards/accuracies": 1.0, "rewards/chosen": -2.3510284423828125, "rewards/margins": 11.168787002563477, "rewards/rejected": -13.519816398620605, "step": 2300 }, { "epoch": 0.326917633739032, "grad_norm": 46.85006256314189, "learning_rate": 7.479163390470199e-08, "logits/chosen": -1.5871881246566772, "logits/rejected": -2.0096652507781982, "logps/chosen": -356.324462890625, "logps/rejected": -1539.2891845703125, "loss": 0.0491, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1692569255828857, "rewards/margins": 11.801499366760254, "rewards/rejected": -14.970754623413086, "step": 2310 }, { "epoch": 0.3283328615907161, "grad_norm": 83.18982073453546, "learning_rate": 7.46343764742884e-08, "logits/chosen": -1.6399648189544678, "logits/rejected": -1.9872640371322632, "logps/chosen": -306.36712646484375, "logps/rejected": -1332.884521484375, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": -2.680358409881592, "rewards/margins": 10.228045463562012, "rewards/rejected": -12.908403396606445, "step": 2320 }, { "epoch": 0.32974808944240025, "grad_norm": 363.0092389145674, "learning_rate": 7.447711904387482e-08, "logits/chosen": -1.5927752256393433, "logits/rejected": -1.996066689491272, "logps/chosen": -301.03619384765625, "logps/rejected": -1345.107177734375, "loss": 0.1248, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6177186965942383, "rewards/margins": 10.415854454040527, "rewards/rejected": -13.033574104309082, "step": 2330 }, { "epoch": 0.3311633172940843, "grad_norm": 38.650409533691864, "learning_rate": 7.431986161346123e-08, "logits/chosen": -1.6282703876495361, "logits/rejected": -2.043545961380005, "logps/chosen": -302.26470947265625, "logps/rejected": -1397.432861328125, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -2.6410791873931885, "rewards/margins": 10.93382740020752, "rewards/rejected": -13.574907302856445, "step": 2340 }, { "epoch": 0.33257854514576846, "grad_norm": 183.18285151110442, "learning_rate": 7.416260418304765e-08, "logits/chosen": -1.6369259357452393, "logits/rejected": -2.089512586593628, "logps/chosen": -345.23883056640625, "logps/rejected": -1528.251953125, "loss": 0.061, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.074882984161377, "rewards/margins": 11.783578872680664, "rewards/rejected": -14.8584623336792, "step": 2350 }, { "epoch": 0.3339937729974526, "grad_norm": 7.129884637914644, "learning_rate": 7.400534675263405e-08, "logits/chosen": -1.6120641231536865, "logits/rejected": -2.0618209838867188, "logps/chosen": -343.13995361328125, "logps/rejected": -1629.648193359375, "loss": 0.0444, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.027924060821533, "rewards/margins": 12.815966606140137, "rewards/rejected": -15.843889236450195, "step": 2360 }, { "epoch": 0.3354090008491367, "grad_norm": 287.4346324603478, "learning_rate": 7.384808932222047e-08, "logits/chosen": -1.6858441829681396, "logits/rejected": -2.038790225982666, "logps/chosen": -323.852294921875, "logps/rejected": -1462.35888671875, "loss": 0.0796, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.853132724761963, "rewards/margins": 11.357009887695312, "rewards/rejected": -14.2101411819458, "step": 2370 }, { "epoch": 0.33682422870082085, "grad_norm": 320.7965474340445, "learning_rate": 7.369083189180689e-08, "logits/chosen": -1.5887577533721924, "logits/rejected": -1.9774423837661743, "logps/chosen": -317.09051513671875, "logps/rejected": -1412.6298828125, "loss": 0.0662, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.786877393722534, "rewards/margins": 10.911337852478027, "rewards/rejected": -13.698216438293457, "step": 2380 }, { "epoch": 0.338239456552505, "grad_norm": 18.340326010002958, "learning_rate": 7.35335744613933e-08, "logits/chosen": -1.5466973781585693, "logits/rejected": -1.9860531091690063, "logps/chosen": -289.51409912109375, "logps/rejected": -1487.199462890625, "loss": 0.0311, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4993820190429688, "rewards/margins": 11.95008373260498, "rewards/rejected": -14.449467658996582, "step": 2390 }, { "epoch": 0.33965468440418906, "grad_norm": 11.514888338214984, "learning_rate": 7.337631703097971e-08, "logits/chosen": -1.6941970586776733, "logits/rejected": -2.1250176429748535, "logps/chosen": -336.9012756347656, "logps/rejected": -1496.2261962890625, "loss": 0.1178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.984144926071167, "rewards/margins": 11.549200057983398, "rewards/rejected": -14.533345222473145, "step": 2400 }, { "epoch": 0.3410699122558732, "grad_norm": 10.542580060448135, "learning_rate": 7.321905960056612e-08, "logits/chosen": -1.5872318744659424, "logits/rejected": -2.0610241889953613, "logps/chosen": -314.43292236328125, "logps/rejected": -1604.426025390625, "loss": 0.0529, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7496137619018555, "rewards/margins": 12.860952377319336, "rewards/rejected": -15.610567092895508, "step": 2410 }, { "epoch": 0.3424851401075573, "grad_norm": 146.46785590172865, "learning_rate": 7.306180217015253e-08, "logits/chosen": -1.6502329111099243, "logits/rejected": -2.0606493949890137, "logps/chosen": -346.0991516113281, "logps/rejected": -1606.683837890625, "loss": 0.0448, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.06435227394104, "rewards/margins": 12.575037002563477, "rewards/rejected": -15.63939094543457, "step": 2420 }, { "epoch": 0.34390036795924145, "grad_norm": 207.3071916279182, "learning_rate": 7.290454473973894e-08, "logits/chosen": -1.6406952142715454, "logits/rejected": -2.013338804244995, "logps/chosen": -295.64862060546875, "logps/rejected": -1421.2301025390625, "loss": 0.1214, "rewards/accuracies": 1.0, "rewards/chosen": -2.579883575439453, "rewards/margins": 11.213606834411621, "rewards/rejected": -13.793490409851074, "step": 2430 }, { "epoch": 0.3453155958109256, "grad_norm": 27.531976834084258, "learning_rate": 7.274728730932537e-08, "logits/chosen": -1.6326968669891357, "logits/rejected": -2.0451548099517822, "logps/chosen": -380.2967834472656, "logps/rejected": -1673.4527587890625, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": -3.390984058380127, "rewards/margins": 12.877812385559082, "rewards/rejected": -16.268795013427734, "step": 2440 }, { "epoch": 0.34673082366260966, "grad_norm": 15.12821070948695, "learning_rate": 7.259002987891178e-08, "logits/chosen": -1.6437921524047852, "logits/rejected": -2.0460140705108643, "logps/chosen": -339.6479187011719, "logps/rejected": -1505.6207275390625, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": -2.999807834625244, "rewards/margins": 11.609813690185547, "rewards/rejected": -14.60962200164795, "step": 2450 }, { "epoch": 0.3481460515142938, "grad_norm": 55.45610065263612, "learning_rate": 7.243277244849819e-08, "logits/chosen": -1.7140308618545532, "logits/rejected": -2.113220691680908, "logps/chosen": -342.88079833984375, "logps/rejected": -1659.638916015625, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -3.014754056930542, "rewards/margins": 13.140100479125977, "rewards/rejected": -16.154855728149414, "step": 2460 }, { "epoch": 0.3495612793659779, "grad_norm": 8.433404454685677, "learning_rate": 7.22755150180846e-08, "logits/chosen": -1.633204460144043, "logits/rejected": -2.0809226036071777, "logps/chosen": -327.14007568359375, "logps/rejected": -1523.533447265625, "loss": 0.056, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8811075687408447, "rewards/margins": 11.979182243347168, "rewards/rejected": -14.860288619995117, "step": 2470 }, { "epoch": 0.35097650721766205, "grad_norm": 18.847014532900165, "learning_rate": 7.211825758767101e-08, "logits/chosen": -1.6106102466583252, "logits/rejected": -2.124812126159668, "logps/chosen": -297.57611083984375, "logps/rejected": -1460.460693359375, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -2.5759716033935547, "rewards/margins": 11.617395401000977, "rewards/rejected": -14.193368911743164, "step": 2480 }, { "epoch": 0.3523917350693462, "grad_norm": 1.116064702187744, "learning_rate": 7.196100015725743e-08, "logits/chosen": -1.620153784751892, "logits/rejected": -2.014458179473877, "logps/chosen": -296.50433349609375, "logps/rejected": -1461.1083984375, "loss": 0.0285, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5817620754241943, "rewards/margins": 11.606282234191895, "rewards/rejected": -14.188043594360352, "step": 2490 }, { "epoch": 0.3538069629210303, "grad_norm": 24.379556953028306, "learning_rate": 7.180374272684385e-08, "logits/chosen": -1.6207551956176758, "logits/rejected": -2.0720067024230957, "logps/chosen": -324.2480163574219, "logps/rejected": -1658.7515869140625, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": -2.8545784950256348, "rewards/margins": 13.314114570617676, "rewards/rejected": -16.16869354248047, "step": 2500 }, { "epoch": 0.3552221907727144, "grad_norm": 12.643483576990194, "learning_rate": 7.164648529643025e-08, "logits/chosen": -1.7241023778915405, "logits/rejected": -2.084317445755005, "logps/chosen": -407.10272216796875, "logps/rejected": -1610.753662109375, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -3.6673145294189453, "rewards/margins": 12.05979061126709, "rewards/rejected": -15.727106094360352, "step": 2510 }, { "epoch": 0.3566374186243985, "grad_norm": 4.948095527403773, "learning_rate": 7.148922786601666e-08, "logits/chosen": -1.6630722284317017, "logits/rejected": -2.0018155574798584, "logps/chosen": -342.88446044921875, "logps/rejected": -1519.061279296875, "loss": 0.026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0446925163269043, "rewards/margins": 11.725151062011719, "rewards/rejected": -14.769845962524414, "step": 2520 }, { "epoch": 0.35805264647608265, "grad_norm": 9.288038011778745, "learning_rate": 7.133197043560308e-08, "logits/chosen": -1.5955145359039307, "logits/rejected": -2.066114902496338, "logps/chosen": -391.6807861328125, "logps/rejected": -1761.83203125, "loss": 0.1023, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5171267986297607, "rewards/margins": 13.652249336242676, "rewards/rejected": -17.169376373291016, "step": 2530 }, { "epoch": 0.3594678743277668, "grad_norm": 56.841681648217445, "learning_rate": 7.117471300518948e-08, "logits/chosen": -1.5839009284973145, "logits/rejected": -2.0502469539642334, "logps/chosen": -294.4014587402344, "logps/rejected": -1643.1888427734375, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": -2.55129337310791, "rewards/margins": 13.45917797088623, "rewards/rejected": -16.01047134399414, "step": 2540 }, { "epoch": 0.3608831021794509, "grad_norm": 1.2292235198074768, "learning_rate": 7.101745557477591e-08, "logits/chosen": -1.5673253536224365, "logits/rejected": -1.9745044708251953, "logps/chosen": -310.2901611328125, "logps/rejected": -1575.437744140625, "loss": 0.0261, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.714529514312744, "rewards/margins": 12.612936019897461, "rewards/rejected": -15.32746410369873, "step": 2550 }, { "epoch": 0.362298330031135, "grad_norm": 75.09717922110721, "learning_rate": 7.086019814436232e-08, "logits/chosen": -1.6256577968597412, "logits/rejected": -2.0356929302215576, "logps/chosen": -430.09552001953125, "logps/rejected": -1724.4287109375, "loss": 0.0271, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.9229702949523926, "rewards/margins": 12.927932739257812, "rewards/rejected": -16.85090446472168, "step": 2560 }, { "epoch": 0.3637135578828191, "grad_norm": 77.69281528004241, "learning_rate": 7.070294071394873e-08, "logits/chosen": -1.5762232542037964, "logits/rejected": -2.0005598068237305, "logps/chosen": -353.38214111328125, "logps/rejected": -1579.438720703125, "loss": 0.04, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.146477460861206, "rewards/margins": 12.215415954589844, "rewards/rejected": -15.361892700195312, "step": 2570 }, { "epoch": 0.36512878573450325, "grad_norm": 40.2322911291011, "learning_rate": 7.054568328353514e-08, "logits/chosen": -1.6111199855804443, "logits/rejected": -2.0533080101013184, "logps/chosen": -346.27825927734375, "logps/rejected": -1635.140869140625, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -3.063382387161255, "rewards/margins": 12.903676986694336, "rewards/rejected": -15.967058181762695, "step": 2580 }, { "epoch": 0.3665440135861874, "grad_norm": 2.5622766288554417, "learning_rate": 7.038842585312155e-08, "logits/chosen": -1.6702229976654053, "logits/rejected": -2.0971198081970215, "logps/chosen": -462.57330322265625, "logps/rejected": -1747.056884765625, "loss": 0.0199, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.2432355880737305, "rewards/margins": 12.835832595825195, "rewards/rejected": -17.079069137573242, "step": 2590 }, { "epoch": 0.3679592414378715, "grad_norm": 27.70274312978273, "learning_rate": 7.023116842270797e-08, "logits/chosen": -1.592308759689331, "logits/rejected": -2.010150909423828, "logps/chosen": -397.7125549316406, "logps/rejected": -1805.203369140625, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -3.591384172439575, "rewards/margins": 14.068499565124512, "rewards/rejected": -17.659881591796875, "step": 2600 }, { "epoch": 0.36937446928955564, "grad_norm": 48.44436222534193, "learning_rate": 7.007391099229439e-08, "logits/chosen": -1.6258445978164673, "logits/rejected": -2.034083366394043, "logps/chosen": -379.82666015625, "logps/rejected": -1708.1614990234375, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -3.400048017501831, "rewards/margins": 13.269001960754395, "rewards/rejected": -16.669048309326172, "step": 2610 }, { "epoch": 0.3707896971412397, "grad_norm": 38.56299741125284, "learning_rate": 6.99166535618808e-08, "logits/chosen": -1.6488759517669678, "logits/rejected": -2.1131227016448975, "logps/chosen": -358.9078674316406, "logps/rejected": -1853.8203125, "loss": 0.0529, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.199666976928711, "rewards/margins": 14.921438217163086, "rewards/rejected": -18.121105194091797, "step": 2620 }, { "epoch": 0.37220492499292385, "grad_norm": 97.94651585170143, "learning_rate": 6.97593961314672e-08, "logits/chosen": -1.6238409280776978, "logits/rejected": -2.026181936264038, "logps/chosen": -408.40301513671875, "logps/rejected": -1750.146728515625, "loss": 0.0417, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.698347806930542, "rewards/margins": 13.394502639770508, "rewards/rejected": -17.092849731445312, "step": 2630 }, { "epoch": 0.373620152844608, "grad_norm": 41.22446053060253, "learning_rate": 6.960213870105362e-08, "logits/chosen": -1.6069132089614868, "logits/rejected": -2.06937313079834, "logps/chosen": -405.0937805175781, "logps/rejected": -1754.7890625, "loss": 0.0491, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6593146324157715, "rewards/margins": 13.450398445129395, "rewards/rejected": -17.109712600708008, "step": 2640 }, { "epoch": 0.3750353806962921, "grad_norm": 129.31289326116178, "learning_rate": 6.944488127064003e-08, "logits/chosen": -1.6329457759857178, "logits/rejected": -2.074740409851074, "logps/chosen": -343.6166687011719, "logps/rejected": -1647.436279296875, "loss": 0.0668, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.055292844772339, "rewards/margins": 12.996803283691406, "rewards/rejected": -16.05209732055664, "step": 2650 }, { "epoch": 0.37645060854797624, "grad_norm": 39.77207375530953, "learning_rate": 6.928762384022646e-08, "logits/chosen": -1.6719210147857666, "logits/rejected": -2.1230344772338867, "logps/chosen": -346.02044677734375, "logps/rejected": -1651.367431640625, "loss": 0.0253, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.075352191925049, "rewards/margins": 13.005976676940918, "rewards/rejected": -16.081329345703125, "step": 2660 }, { "epoch": 0.37786583639966037, "grad_norm": 117.22016308398294, "learning_rate": 6.913036640981286e-08, "logits/chosen": -1.655086874961853, "logits/rejected": -2.1560072898864746, "logps/chosen": -285.994140625, "logps/rejected": -1616.6405029296875, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -2.4853250980377197, "rewards/margins": 13.271791458129883, "rewards/rejected": -15.757116317749023, "step": 2670 }, { "epoch": 0.37928106425134445, "grad_norm": 182.21906912559666, "learning_rate": 6.897310897939928e-08, "logits/chosen": -1.683842420578003, "logits/rejected": -2.1181440353393555, "logps/chosen": -378.0453186035156, "logps/rejected": -1780.839111328125, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": -3.3866989612579346, "rewards/margins": 14.015111923217773, "rewards/rejected": -17.401813507080078, "step": 2680 }, { "epoch": 0.3806962921030286, "grad_norm": 10.043696318057698, "learning_rate": 6.881585154898568e-08, "logits/chosen": -1.6122413873672485, "logits/rejected": -2.0693984031677246, "logps/chosen": -301.2664794921875, "logps/rejected": -1724.064208984375, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -2.619446277618408, "rewards/margins": 14.197431564331055, "rewards/rejected": -16.816875457763672, "step": 2690 }, { "epoch": 0.3821115199547127, "grad_norm": 97.61273272879414, "learning_rate": 6.86585941185721e-08, "logits/chosen": -1.6892757415771484, "logits/rejected": -2.161856174468994, "logps/chosen": -365.36285400390625, "logps/rejected": -1692.2232666015625, "loss": 0.0841, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2694320678710938, "rewards/margins": 13.249957084655762, "rewards/rejected": -16.51938819885254, "step": 2700 }, { "epoch": 0.38352674780639684, "grad_norm": 0.5096447054401555, "learning_rate": 6.850133668815851e-08, "logits/chosen": -1.6148254871368408, "logits/rejected": -2.0306143760681152, "logps/chosen": -296.62115478515625, "logps/rejected": -1714.5933837890625, "loss": 0.0362, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5825157165527344, "rewards/margins": 14.180386543273926, "rewards/rejected": -16.762903213500977, "step": 2710 }, { "epoch": 0.38494197565808097, "grad_norm": 156.0797801196094, "learning_rate": 6.834407925774493e-08, "logits/chosen": -1.539468765258789, "logits/rejected": -2.0323593616485596, "logps/chosen": -281.68280029296875, "logps/rejected": -1816.458984375, "loss": 0.0584, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.42946195602417, "rewards/margins": 15.292485237121582, "rewards/rejected": -17.721948623657227, "step": 2720 }, { "epoch": 0.38635720350976505, "grad_norm": 310.3277771095054, "learning_rate": 6.818682182733134e-08, "logits/chosen": -1.6329898834228516, "logits/rejected": -2.023862361907959, "logps/chosen": -338.9956970214844, "logps/rejected": -1575.6949462890625, "loss": 0.0702, "rewards/accuracies": 1.0, "rewards/chosen": -3.0068275928497314, "rewards/margins": 12.370141983032227, "rewards/rejected": -15.376970291137695, "step": 2730 }, { "epoch": 0.3877724313614492, "grad_norm": 216.83976994050497, "learning_rate": 6.802956439691775e-08, "logits/chosen": -1.5792593955993652, "logits/rejected": -2.0799946784973145, "logps/chosen": -313.86944580078125, "logps/rejected": -1593.2783203125, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -2.7574234008789062, "rewards/margins": 12.7500581741333, "rewards/rejected": -15.507482528686523, "step": 2740 }, { "epoch": 0.3891876592131333, "grad_norm": 5.737341780101278, "learning_rate": 6.787230696650416e-08, "logits/chosen": -1.6299835443496704, "logits/rejected": -2.10296368598938, "logps/chosen": -316.49713134765625, "logps/rejected": -1632.460205078125, "loss": 0.0338, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7659528255462646, "rewards/margins": 13.159126281738281, "rewards/rejected": -15.925079345703125, "step": 2750 }, { "epoch": 0.39060288706481744, "grad_norm": 5.369340986695017, "learning_rate": 6.771504953609058e-08, "logits/chosen": -1.6143462657928467, "logits/rejected": -2.0783729553222656, "logps/chosen": -394.4059143066406, "logps/rejected": -1669.2191162109375, "loss": 0.0196, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5480220317840576, "rewards/margins": 12.741310119628906, "rewards/rejected": -16.289331436157227, "step": 2760 }, { "epoch": 0.39201811491650157, "grad_norm": 57.62293046036734, "learning_rate": 6.7557792105677e-08, "logits/chosen": -1.767592430114746, "logits/rejected": -2.132505416870117, "logps/chosen": -431.6483459472656, "logps/rejected": -1717.332763671875, "loss": 0.0945, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9191672801971436, "rewards/margins": 12.806909561157227, "rewards/rejected": -16.726078033447266, "step": 2770 }, { "epoch": 0.3934333427681857, "grad_norm": 149.06504831821894, "learning_rate": 6.74005346752634e-08, "logits/chosen": -1.6717336177825928, "logits/rejected": -2.0713839530944824, "logps/chosen": -369.3962097167969, "logps/rejected": -1679.397216796875, "loss": 0.0299, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.3114006519317627, "rewards/margins": 13.070971488952637, "rewards/rejected": -16.382373809814453, "step": 2780 }, { "epoch": 0.3948485706198698, "grad_norm": 462.8361496045081, "learning_rate": 6.724327724484982e-08, "logits/chosen": -1.6305053234100342, "logits/rejected": -2.1064248085021973, "logps/chosen": -313.5611572265625, "logps/rejected": -1777.444091796875, "loss": 0.0673, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7836575508117676, "rewards/margins": 14.588735580444336, "rewards/rejected": -17.372392654418945, "step": 2790 }, { "epoch": 0.3962637984715539, "grad_norm": 153.64933412551179, "learning_rate": 6.708601981443623e-08, "logits/chosen": -1.6458349227905273, "logits/rejected": -2.128026008605957, "logps/chosen": -398.8099365234375, "logps/rejected": -1876.7154541015625, "loss": 0.0755, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.603341579437256, "rewards/margins": 14.749521255493164, "rewards/rejected": -18.352863311767578, "step": 2800 }, { "epoch": 0.39767902632323804, "grad_norm": 310.7711418066022, "learning_rate": 6.692876238402264e-08, "logits/chosen": -1.691168189048767, "logits/rejected": -2.1216793060302734, "logps/chosen": -387.11334228515625, "logps/rejected": -1816.487060546875, "loss": 0.0438, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.492159605026245, "rewards/margins": 14.248188972473145, "rewards/rejected": -17.740346908569336, "step": 2810 }, { "epoch": 0.39909425417492217, "grad_norm": 19.758913192859772, "learning_rate": 6.677150495360905e-08, "logits/chosen": -1.6019102334976196, "logits/rejected": -2.0953118801116943, "logps/chosen": -334.22979736328125, "logps/rejected": -1938.86328125, "loss": 0.0558, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.971038818359375, "rewards/margins": 15.959806442260742, "rewards/rejected": -18.930843353271484, "step": 2820 }, { "epoch": 0.4005094820266063, "grad_norm": 151.04511945792152, "learning_rate": 6.661424752319547e-08, "logits/chosen": -1.7038681507110596, "logits/rejected": -2.1113948822021484, "logps/chosen": -424.6219787597656, "logps/rejected": -1713.521484375, "loss": 0.0453, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.8723366260528564, "rewards/margins": 12.867660522460938, "rewards/rejected": -16.7399959564209, "step": 2830 }, { "epoch": 0.40192470987829043, "grad_norm": 38.8147864081314, "learning_rate": 6.645699009278189e-08, "logits/chosen": -1.7412998676300049, "logits/rejected": -2.1640028953552246, "logps/chosen": -418.5709533691406, "logps/rejected": -1747.800048828125, "loss": 0.0572, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8045105934143066, "rewards/margins": 13.289278984069824, "rewards/rejected": -17.093788146972656, "step": 2840 }, { "epoch": 0.4033399377299745, "grad_norm": 3.406974968982036, "learning_rate": 6.629973266236829e-08, "logits/chosen": -1.715972661972046, "logits/rejected": -2.2071011066436768, "logps/chosen": -275.1501770019531, "logps/rejected": -1667.9752197265625, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": -2.3720498085021973, "rewards/margins": 13.860176086425781, "rewards/rejected": -16.232227325439453, "step": 2850 }, { "epoch": 0.40475516558165864, "grad_norm": 232.74829711710385, "learning_rate": 6.61424752319547e-08, "logits/chosen": -1.7281615734100342, "logits/rejected": -2.1951839923858643, "logps/chosen": -389.48846435546875, "logps/rejected": -1785.396240234375, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": -3.4976437091827393, "rewards/margins": 13.916694641113281, "rewards/rejected": -17.41434097290039, "step": 2860 }, { "epoch": 0.40617039343334277, "grad_norm": 2.7721178545921927, "learning_rate": 6.598521780154112e-08, "logits/chosen": -1.760538101196289, "logits/rejected": -2.1716790199279785, "logps/chosen": -299.60845947265625, "logps/rejected": -1886.2747802734375, "loss": 0.0535, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6055760383605957, "rewards/margins": 15.824651718139648, "rewards/rejected": -18.43022918701172, "step": 2870 }, { "epoch": 0.4075856212850269, "grad_norm": 0.6058236676790464, "learning_rate": 6.582796037112754e-08, "logits/chosen": -1.7129313945770264, "logits/rejected": -2.1940369606018066, "logps/chosen": -353.15545654296875, "logps/rejected": -1766.2646484375, "loss": 0.0603, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.145327568054199, "rewards/margins": 14.095637321472168, "rewards/rejected": -17.240962982177734, "step": 2880 }, { "epoch": 0.40900084913671103, "grad_norm": 19.106828031047016, "learning_rate": 6.567070294071395e-08, "logits/chosen": -1.7730369567871094, "logits/rejected": -2.333508014678955, "logps/chosen": -441.4756774902344, "logps/rejected": -2073.084228515625, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -4.016941070556641, "rewards/margins": 16.273906707763672, "rewards/rejected": -20.290847778320312, "step": 2890 }, { "epoch": 0.4104160769883951, "grad_norm": 54.87264846140475, "learning_rate": 6.551344551030036e-08, "logits/chosen": -1.7715944051742554, "logits/rejected": -2.2663934230804443, "logps/chosen": -369.23455810546875, "logps/rejected": -1771.80859375, "loss": 0.0207, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.305203914642334, "rewards/margins": 14.014638900756836, "rewards/rejected": -17.31984519958496, "step": 2900 }, { "epoch": 0.41183130484007924, "grad_norm": 21.79924467663445, "learning_rate": 6.535618807988677e-08, "logits/chosen": -1.8207908868789673, "logits/rejected": -2.328901529312134, "logps/chosen": -406.04058837890625, "logps/rejected": -2009.9713134765625, "loss": 0.0572, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6757571697235107, "rewards/margins": 16.028261184692383, "rewards/rejected": -19.704021453857422, "step": 2910 }, { "epoch": 0.41324653269176337, "grad_norm": 290.1428591590642, "learning_rate": 6.519893064947318e-08, "logits/chosen": -1.8095754384994507, "logits/rejected": -2.3101630210876465, "logps/chosen": -434.2896423339844, "logps/rejected": -2011.3304443359375, "loss": 0.0474, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9313278198242188, "rewards/margins": 15.760790824890137, "rewards/rejected": -19.692119598388672, "step": 2920 }, { "epoch": 0.4146617605434475, "grad_norm": 339.7692735072591, "learning_rate": 6.504167321905961e-08, "logits/chosen": -1.8212589025497437, "logits/rejected": -2.2574362754821777, "logps/chosen": -413.346923828125, "logps/rejected": -1975.060302734375, "loss": 0.0472, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.730653762817383, "rewards/margins": 15.605514526367188, "rewards/rejected": -19.336166381835938, "step": 2930 }, { "epoch": 0.41607698839513163, "grad_norm": 421.32204765603603, "learning_rate": 6.488441578864601e-08, "logits/chosen": -1.763830542564392, "logits/rejected": -2.2216010093688965, "logps/chosen": -347.43804931640625, "logps/rejected": -1668.090087890625, "loss": 0.0595, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.088451623916626, "rewards/margins": 13.197721481323242, "rewards/rejected": -16.28617286682129, "step": 2940 }, { "epoch": 0.41749221624681576, "grad_norm": 354.6374309011303, "learning_rate": 6.472715835823243e-08, "logits/chosen": -1.6916707754135132, "logits/rejected": -2.1287922859191895, "logps/chosen": -348.3275451660156, "logps/rejected": -1654.610595703125, "loss": 0.0605, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0832135677337646, "rewards/margins": 13.050596237182617, "rewards/rejected": -16.13381004333496, "step": 2950 }, { "epoch": 0.41890744409849984, "grad_norm": 0.429949622422265, "learning_rate": 6.456990092781883e-08, "logits/chosen": -1.7463035583496094, "logits/rejected": -2.1836342811584473, "logps/chosen": -288.4531555175781, "logps/rejected": -1681.282958984375, "loss": 0.0109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5127227306365967, "rewards/margins": 13.888616561889648, "rewards/rejected": -16.401338577270508, "step": 2960 }, { "epoch": 0.42032267195018397, "grad_norm": 2.05345662710077, "learning_rate": 6.441264349740525e-08, "logits/chosen": -1.673840880393982, "logits/rejected": -2.2059547901153564, "logps/chosen": -288.7801208496094, "logps/rejected": -1884.7113037109375, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": -2.521371841430664, "rewards/margins": 15.905367851257324, "rewards/rejected": -18.426740646362305, "step": 2970 }, { "epoch": 0.4217378998018681, "grad_norm": 117.73729781525974, "learning_rate": 6.425538606699166e-08, "logits/chosen": -1.715338110923767, "logits/rejected": -2.233445644378662, "logps/chosen": -330.02490234375, "logps/rejected": -1673.373046875, "loss": 0.0491, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9169726371765137, "rewards/margins": 13.40394115447998, "rewards/rejected": -16.320913314819336, "step": 2980 }, { "epoch": 0.42315312765355223, "grad_norm": 75.07354752592072, "learning_rate": 6.409812863657808e-08, "logits/chosen": -1.7055187225341797, "logits/rejected": -2.1617043018341064, "logps/chosen": -302.3951416015625, "logps/rejected": -1654.751708984375, "loss": 0.0315, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6326980590820312, "rewards/margins": 13.501760482788086, "rewards/rejected": -16.13446044921875, "step": 2990 }, { "epoch": 0.42456835550523636, "grad_norm": 180.22631115133086, "learning_rate": 6.39408712061645e-08, "logits/chosen": -1.7161242961883545, "logits/rejected": -2.232123851776123, "logps/chosen": -399.74237060546875, "logps/rejected": -1792.8804931640625, "loss": 0.0516, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5985159873962402, "rewards/margins": 13.90559196472168, "rewards/rejected": -17.50411033630371, "step": 3000 }, { "epoch": 0.42598358335692044, "grad_norm": 8.971520556170214, "learning_rate": 6.37836137757509e-08, "logits/chosen": -1.6589568853378296, "logits/rejected": -2.1844029426574707, "logps/chosen": -331.3720703125, "logps/rejected": -1839.558837890625, "loss": 0.0504, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.909336805343628, "rewards/margins": 15.056086540222168, "rewards/rejected": -17.965425491333008, "step": 3010 }, { "epoch": 0.42739881120860457, "grad_norm": 131.96668383296117, "learning_rate": 6.362635634533732e-08, "logits/chosen": -1.7280495166778564, "logits/rejected": -2.18296217918396, "logps/chosen": -368.10205078125, "logps/rejected": -1769.5787353515625, "loss": 0.0316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2928664684295654, "rewards/margins": 13.979486465454102, "rewards/rejected": -17.27235221862793, "step": 3020 }, { "epoch": 0.4288140390602887, "grad_norm": 3.7013958360888526, "learning_rate": 6.346909891492372e-08, "logits/chosen": -1.7062219381332397, "logits/rejected": -2.2822248935699463, "logps/chosen": -346.65631103515625, "logps/rejected": -2074.977783203125, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -3.0666332244873047, "rewards/margins": 17.234237670898438, "rewards/rejected": -20.300870895385742, "step": 3030 }, { "epoch": 0.43022926691197283, "grad_norm": 11.540017519976166, "learning_rate": 6.331184148451015e-08, "logits/chosen": -1.765067458152771, "logits/rejected": -2.292782783508301, "logps/chosen": -402.9501647949219, "logps/rejected": -2130.71875, "loss": 0.0947, "rewards/accuracies": 1.0, "rewards/chosen": -3.639850616455078, "rewards/margins": 17.243175506591797, "rewards/rejected": -20.883028030395508, "step": 3040 }, { "epoch": 0.43164449476365696, "grad_norm": 0.6503874733524095, "learning_rate": 6.315458405409655e-08, "logits/chosen": -1.7863109111785889, "logits/rejected": -2.2801694869995117, "logps/chosen": -477.6725158691406, "logps/rejected": -1948.598876953125, "loss": 0.023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.368145942687988, "rewards/margins": 14.700187683105469, "rewards/rejected": -19.068334579467773, "step": 3050 }, { "epoch": 0.4330597226153411, "grad_norm": 369.53568556479377, "learning_rate": 6.299732662368297e-08, "logits/chosen": -1.785314917564392, "logits/rejected": -2.2671291828155518, "logps/chosen": -412.31622314453125, "logps/rejected": -1947.394775390625, "loss": 0.0346, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.7201523780822754, "rewards/margins": 15.349873542785645, "rewards/rejected": -19.070026397705078, "step": 3060 }, { "epoch": 0.43447495046702517, "grad_norm": 162.28953218598429, "learning_rate": 6.284006919326937e-08, "logits/chosen": -1.843496561050415, "logits/rejected": -2.370943546295166, "logps/chosen": -410.10186767578125, "logps/rejected": -2083.18212890625, "loss": 0.0162, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.703716993331909, "rewards/margins": 16.70279312133789, "rewards/rejected": -20.406511306762695, "step": 3070 }, { "epoch": 0.4358901783187093, "grad_norm": 49.94159033796179, "learning_rate": 6.268281176285579e-08, "logits/chosen": -1.8512115478515625, "logits/rejected": -2.3238143920898438, "logps/chosen": -504.066162109375, "logps/rejected": -2276.795654296875, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -4.645949363708496, "rewards/margins": 17.661602020263672, "rewards/rejected": -22.307552337646484, "step": 3080 }, { "epoch": 0.43730540617039343, "grad_norm": 12.504046900010039, "learning_rate": 6.25255543324422e-08, "logits/chosen": -1.8256876468658447, "logits/rejected": -2.3837506771087646, "logps/chosen": -514.2593383789062, "logps/rejected": -2254.97705078125, "loss": 0.029, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.75529146194458, "rewards/margins": 17.377225875854492, "rewards/rejected": -22.132516860961914, "step": 3090 }, { "epoch": 0.43872063402207756, "grad_norm": 412.74148377977554, "learning_rate": 6.236829690202862e-08, "logits/chosen": -1.8341659307479858, "logits/rejected": -2.2606141567230225, "logps/chosen": -393.8352355957031, "logps/rejected": -1949.8687744140625, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": -3.5665855407714844, "rewards/margins": 15.500185012817383, "rewards/rejected": -19.0667724609375, "step": 3100 }, { "epoch": 0.4401358618737617, "grad_norm": 12.59915722308121, "learning_rate": 6.221103947161504e-08, "logits/chosen": -1.8221811056137085, "logits/rejected": -2.333721876144409, "logps/chosen": -431.2979431152344, "logps/rejected": -2015.99609375, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -3.9143710136413574, "rewards/margins": 15.83216667175293, "rewards/rejected": -19.746536254882812, "step": 3110 }, { "epoch": 0.4415510897254458, "grad_norm": 394.89022229674197, "learning_rate": 6.205378204120144e-08, "logits/chosen": -1.815194845199585, "logits/rejected": -2.2697837352752686, "logps/chosen": -411.0352478027344, "logps/rejected": -2009.853515625, "loss": 0.055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7213547229766846, "rewards/margins": 15.965069770812988, "rewards/rejected": -19.68642234802246, "step": 3120 }, { "epoch": 0.4429663175771299, "grad_norm": 0.501987278113392, "learning_rate": 6.189652461078786e-08, "logits/chosen": -1.8822548389434814, "logits/rejected": -2.319658041000366, "logps/chosen": -448.5939025878906, "logps/rejected": -2018.6402587890625, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -4.084114074707031, "rewards/margins": 15.658414840698242, "rewards/rejected": -19.74252700805664, "step": 3130 }, { "epoch": 0.44438154542881403, "grad_norm": 246.18804911176667, "learning_rate": 6.173926718037426e-08, "logits/chosen": -1.8160037994384766, "logits/rejected": -2.35274076461792, "logps/chosen": -379.2480163574219, "logps/rejected": -1970.660400390625, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -3.3902201652526855, "rewards/margins": 15.905115127563477, "rewards/rejected": -19.295337677001953, "step": 3140 }, { "epoch": 0.44579677328049816, "grad_norm": 12.252434715517323, "learning_rate": 6.158200974996069e-08, "logits/chosen": -1.7822126150131226, "logits/rejected": -2.2919201850891113, "logps/chosen": -405.60235595703125, "logps/rejected": -2177.811279296875, "loss": 0.0325, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.659182071685791, "rewards/margins": 17.660125732421875, "rewards/rejected": -21.31930923461914, "step": 3150 }, { "epoch": 0.4472120011321823, "grad_norm": 130.26110455746746, "learning_rate": 6.142475231954709e-08, "logits/chosen": -1.7888820171356201, "logits/rejected": -2.328519582748413, "logps/chosen": -469.3672790527344, "logps/rejected": -2259.90283203125, "loss": 0.0628, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.299753665924072, "rewards/margins": 17.865779876708984, "rewards/rejected": -22.1655330657959, "step": 3160 }, { "epoch": 0.4486272289838664, "grad_norm": 15.486072965893953, "learning_rate": 6.126749488913351e-08, "logits/chosen": -1.8065866231918335, "logits/rejected": -2.2963669300079346, "logps/chosen": -347.935546875, "logps/rejected": -2085.42236328125, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -3.086002826690674, "rewards/margins": 17.31363296508789, "rewards/rejected": -20.399633407592773, "step": 3170 }, { "epoch": 0.4500424568355505, "grad_norm": 12.01223544611477, "learning_rate": 6.111023745871993e-08, "logits/chosen": -1.8585693836212158, "logits/rejected": -2.277873992919922, "logps/chosen": -437.9632263183594, "logps/rejected": -2062.424560546875, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.9937655925750732, "rewards/margins": 16.219470977783203, "rewards/rejected": -20.213237762451172, "step": 3180 }, { "epoch": 0.45145768468723463, "grad_norm": 23.429245881175067, "learning_rate": 6.095298002830633e-08, "logits/chosen": -1.8341306447982788, "logits/rejected": -2.4010422229766846, "logps/chosen": -454.07940673828125, "logps/rejected": -2081.429931640625, "loss": 0.0575, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.137267112731934, "rewards/margins": 16.256786346435547, "rewards/rejected": -20.394052505493164, "step": 3190 }, { "epoch": 0.45287291253891876, "grad_norm": 68.94968559652548, "learning_rate": 6.079572259789275e-08, "logits/chosen": -1.7813432216644287, "logits/rejected": -2.2879951000213623, "logps/chosen": -375.6089782714844, "logps/rejected": -2440.23291015625, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -3.37333607673645, "rewards/margins": 20.567264556884766, "rewards/rejected": -23.94059944152832, "step": 3200 }, { "epoch": 0.4542881403906029, "grad_norm": 169.95022233275458, "learning_rate": 6.063846516747916e-08, "logits/chosen": -1.7860438823699951, "logits/rejected": -2.2503819465637207, "logps/chosen": -428.6514587402344, "logps/rejected": -2200.8701171875, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": -3.8957550525665283, "rewards/margins": 17.689456939697266, "rewards/rejected": -21.58521270751953, "step": 3210 }, { "epoch": 0.455703368242287, "grad_norm": 13.396810730683102, "learning_rate": 6.048120773706558e-08, "logits/chosen": -1.8093593120574951, "logits/rejected": -2.269320487976074, "logps/chosen": -455.52734375, "logps/rejected": -2094.930419921875, "loss": 0.0664, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.162112236022949, "rewards/margins": 16.37947654724121, "rewards/rejected": -20.541589736938477, "step": 3220 }, { "epoch": 0.45711859609397115, "grad_norm": 291.768867871934, "learning_rate": 6.032395030665198e-08, "logits/chosen": -1.8449703454971313, "logits/rejected": -2.340804100036621, "logps/chosen": -412.7440490722656, "logps/rejected": -2094.7314453125, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -3.749612808227539, "rewards/margins": 16.798900604248047, "rewards/rejected": -20.548513412475586, "step": 3230 }, { "epoch": 0.45853382394565523, "grad_norm": 5.868081835308945, "learning_rate": 6.01666928762384e-08, "logits/chosen": -1.8716949224472046, "logits/rejected": -2.395799160003662, "logps/chosen": -462.8614807128906, "logps/rejected": -2383.872802734375, "loss": 0.0432, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.238659858703613, "rewards/margins": 19.159006118774414, "rewards/rejected": -23.39766502380371, "step": 3240 }, { "epoch": 0.45994905179733936, "grad_norm": 0.8127577264506353, "learning_rate": 6.00094354458248e-08, "logits/chosen": -1.9061635732650757, "logits/rejected": -2.3319504261016846, "logps/chosen": -422.4518127441406, "logps/rejected": -2276.231201171875, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -3.829763889312744, "rewards/margins": 18.472150802612305, "rewards/rejected": -22.30191421508789, "step": 3250 }, { "epoch": 0.4613642796490235, "grad_norm": 57.24128199839525, "learning_rate": 5.985217801541123e-08, "logits/chosen": -1.8994219303131104, "logits/rejected": -2.3576340675354004, "logps/chosen": -465.5694885253906, "logps/rejected": -2125.767822265625, "loss": 0.0159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.2480244636535645, "rewards/margins": 16.599397659301758, "rewards/rejected": -20.8474178314209, "step": 3260 }, { "epoch": 0.4627795075007076, "grad_norm": 44.930151543772084, "learning_rate": 5.969492058499765e-08, "logits/chosen": -1.866918921470642, "logits/rejected": -2.3719286918640137, "logps/chosen": -472.20574951171875, "logps/rejected": -2111.079833984375, "loss": 0.0654, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.340450286865234, "rewards/margins": 16.348058700561523, "rewards/rejected": -20.68851089477539, "step": 3270 }, { "epoch": 0.46419473535239175, "grad_norm": 144.18910802031593, "learning_rate": 5.953766315458405e-08, "logits/chosen": -1.8395946025848389, "logits/rejected": -2.2954721450805664, "logps/chosen": -371.288818359375, "logps/rejected": -1911.0491943359375, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -3.328839063644409, "rewards/margins": 15.39087963104248, "rewards/rejected": -18.7197208404541, "step": 3280 }, { "epoch": 0.4656099632040759, "grad_norm": 42.663285748343824, "learning_rate": 5.938040572417047e-08, "logits/chosen": -1.868486762046814, "logits/rejected": -2.2948148250579834, "logps/chosen": -353.94775390625, "logps/rejected": -2002.547119140625, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -3.16620135307312, "rewards/margins": 16.448184967041016, "rewards/rejected": -19.6143856048584, "step": 3290 }, { "epoch": 0.46702519105575996, "grad_norm": 39.190285869519236, "learning_rate": 5.922314829375688e-08, "logits/chosen": -1.851377248764038, "logits/rejected": -2.315969228744507, "logps/chosen": -318.1407165527344, "logps/rejected": -2049.31298828125, "loss": 0.0883, "rewards/accuracies": 1.0, "rewards/chosen": -2.810586929321289, "rewards/margins": 17.273977279663086, "rewards/rejected": -20.084564208984375, "step": 3300 }, { "epoch": 0.4684404189074441, "grad_norm": 0.2857884880157783, "learning_rate": 5.9065890863343294e-08, "logits/chosen": -1.8264890909194946, "logits/rejected": -2.247379779815674, "logps/chosen": -343.6062316894531, "logps/rejected": -1898.0634765625, "loss": 0.058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.05055832862854, "rewards/margins": 15.545207023620605, "rewards/rejected": -18.59576416015625, "step": 3310 }, { "epoch": 0.4698556467591282, "grad_norm": 2.816819610470867, "learning_rate": 5.89086334329297e-08, "logits/chosen": -1.8392388820648193, "logits/rejected": -2.2676749229431152, "logps/chosen": -385.3665466308594, "logps/rejected": -1890.410400390625, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -3.476350784301758, "rewards/margins": 15.027132034301758, "rewards/rejected": -18.503482818603516, "step": 3320 }, { "epoch": 0.47127087461081235, "grad_norm": 2.8211592862830233, "learning_rate": 5.875137600251612e-08, "logits/chosen": -1.8047840595245361, "logits/rejected": -2.3769490718841553, "logps/chosen": -352.0120544433594, "logps/rejected": -2222.458984375, "loss": 0.0457, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1342532634735107, "rewards/margins": 18.662059783935547, "rewards/rejected": -21.796316146850586, "step": 3330 }, { "epoch": 0.4726861024624965, "grad_norm": 664.348302321289, "learning_rate": 5.859411857210252e-08, "logits/chosen": -1.8269994258880615, "logits/rejected": -2.408629894256592, "logps/chosen": -395.7481689453125, "logps/rejected": -2106.75390625, "loss": 0.033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.567420244216919, "rewards/margins": 17.099597930908203, "rewards/rejected": -20.66701889038086, "step": 3340 }, { "epoch": 0.47410133031418056, "grad_norm": 48.81622586463172, "learning_rate": 5.843686114168894e-08, "logits/chosen": -1.9566017389297485, "logits/rejected": -2.3282978534698486, "logps/chosen": -532.4284057617188, "logps/rejected": -2077.0, "loss": 0.0699, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.937098503112793, "rewards/margins": 15.424572944641113, "rewards/rejected": -20.361671447753906, "step": 3350 }, { "epoch": 0.4755165581658647, "grad_norm": 0.20155689427866041, "learning_rate": 5.827960371127536e-08, "logits/chosen": -1.8653078079223633, "logits/rejected": -2.3440439701080322, "logps/chosen": -418.0625915527344, "logps/rejected": -2042.7808837890625, "loss": 0.0126, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.8021717071533203, "rewards/margins": 16.23184585571289, "rewards/rejected": -20.03401756286621, "step": 3360 }, { "epoch": 0.4769317860175488, "grad_norm": 48.243236879538884, "learning_rate": 5.8122346280861766e-08, "logits/chosen": -1.8719675540924072, "logits/rejected": -2.3867926597595215, "logps/chosen": -463.9722595214844, "logps/rejected": -2239.607666015625, "loss": 0.0234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.237305641174316, "rewards/margins": 17.760501861572266, "rewards/rejected": -21.99781036376953, "step": 3370 }, { "epoch": 0.47834701386923295, "grad_norm": 128.50153060078745, "learning_rate": 5.796508885044818e-08, "logits/chosen": -1.8609254360198975, "logits/rejected": -2.3479647636413574, "logps/chosen": -406.599853515625, "logps/rejected": -2228.214111328125, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -3.6765244007110596, "rewards/margins": 18.181293487548828, "rewards/rejected": -21.857818603515625, "step": 3380 }, { "epoch": 0.4797622417209171, "grad_norm": 6.153379496454454, "learning_rate": 5.780783142003459e-08, "logits/chosen": -1.9142709970474243, "logits/rejected": -2.3390250205993652, "logps/chosen": -472.4739685058594, "logps/rejected": -2108.2236328125, "loss": 0.0403, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.322788715362549, "rewards/margins": 16.340526580810547, "rewards/rejected": -20.663314819335938, "step": 3390 }, { "epoch": 0.4811774695726012, "grad_norm": 119.1503830621879, "learning_rate": 5.765057398962101e-08, "logits/chosen": -1.936768889427185, "logits/rejected": -2.421146869659424, "logps/chosen": -474.7774353027344, "logps/rejected": -2321.90771484375, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -4.3404741287231445, "rewards/margins": 18.4345645904541, "rewards/rejected": -22.775035858154297, "step": 3400 }, { "epoch": 0.4825926974242853, "grad_norm": 231.89141323172277, "learning_rate": 5.749331655920742e-08, "logits/chosen": -1.8889204263687134, "logits/rejected": -2.4186294078826904, "logps/chosen": -456.69622802734375, "logps/rejected": -2300.83251953125, "loss": 0.0177, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.176141262054443, "rewards/margins": 18.387182235717773, "rewards/rejected": -22.563322067260742, "step": 3410 }, { "epoch": 0.4840079252759694, "grad_norm": 0.8440831675306427, "learning_rate": 5.7336059128793835e-08, "logits/chosen": -1.9790607690811157, "logits/rejected": -2.4660239219665527, "logps/chosen": -468.88330078125, "logps/rejected": -2240.77099609375, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": -4.313877582550049, "rewards/margins": 17.639280319213867, "rewards/rejected": -21.95315933227539, "step": 3420 }, { "epoch": 0.48542315312765355, "grad_norm": 3.2177991673292223, "learning_rate": 5.717880169838024e-08, "logits/chosen": -1.8856712579727173, "logits/rejected": -2.392756462097168, "logps/chosen": -356.53082275390625, "logps/rejected": -2046.959716796875, "loss": 0.0173, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1780455112457275, "rewards/margins": 16.902135848999023, "rewards/rejected": -20.080181121826172, "step": 3430 }, { "epoch": 0.4868383809793377, "grad_norm": 63.37548900139259, "learning_rate": 5.702154426796666e-08, "logits/chosen": -1.8908859491348267, "logits/rejected": -2.3704209327697754, "logps/chosen": -430.9766540527344, "logps/rejected": -2210.27001953125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -3.9146552085876465, "rewards/margins": 17.75827407836914, "rewards/rejected": -21.672927856445312, "step": 3440 }, { "epoch": 0.4882536088310218, "grad_norm": 0.39922069353328193, "learning_rate": 5.686428683755308e-08, "logits/chosen": -1.8536736965179443, "logits/rejected": -2.3330180644989014, "logps/chosen": -432.0633239746094, "logps/rejected": -2140.265380859375, "loss": 0.0572, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.928187131881714, "rewards/margins": 17.031620025634766, "rewards/rejected": -20.95980453491211, "step": 3450 }, { "epoch": 0.4896688366827059, "grad_norm": 2.096424456858992, "learning_rate": 5.670702940713948e-08, "logits/chosen": -1.8270671367645264, "logits/rejected": -2.3372256755828857, "logps/chosen": -357.6448974609375, "logps/rejected": -2092.04736328125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -3.1824300289154053, "rewards/margins": 17.332561492919922, "rewards/rejected": -20.514991760253906, "step": 3460 }, { "epoch": 0.49108406453439, "grad_norm": 294.45275012413407, "learning_rate": 5.6549771976725904e-08, "logits/chosen": -1.8691561222076416, "logits/rejected": -2.3665847778320312, "logps/chosen": -315.14569091796875, "logps/rejected": -2140.925048828125, "loss": 0.0327, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7693703174591064, "rewards/margins": 18.208545684814453, "rewards/rejected": -20.977914810180664, "step": 3470 }, { "epoch": 0.49249929238607415, "grad_norm": 12.631202171623604, "learning_rate": 5.639251454631231e-08, "logits/chosen": -1.7663415670394897, "logits/rejected": -2.254941463470459, "logps/chosen": -298.6046447753906, "logps/rejected": -1868.725830078125, "loss": 0.0976, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.605868339538574, "rewards/margins": 15.696454048156738, "rewards/rejected": -18.30232048034668, "step": 3480 }, { "epoch": 0.4939145202377583, "grad_norm": 0.23049839834124244, "learning_rate": 5.6235257115898724e-08, "logits/chosen": -1.700356125831604, "logits/rejected": -2.184948444366455, "logps/chosen": -326.0857849121094, "logps/rejected": -1945.898681640625, "loss": 0.0374, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8953030109405518, "rewards/margins": 16.160375595092773, "rewards/rejected": -19.05567741394043, "step": 3490 }, { "epoch": 0.4953297480894424, "grad_norm": 276.39763928333053, "learning_rate": 5.6077999685485133e-08, "logits/chosen": -1.7571609020233154, "logits/rejected": -2.2587265968322754, "logps/chosen": -351.3701171875, "logps/rejected": -2032.0718994140625, "loss": 0.0311, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1390910148620605, "rewards/margins": 16.779420852661133, "rewards/rejected": -19.91851043701172, "step": 3500 }, { "epoch": 0.49674497594112654, "grad_norm": 0.35742786328853976, "learning_rate": 5.592074225507155e-08, "logits/chosen": -1.8700401782989502, "logits/rejected": -2.2900614738464355, "logps/chosen": -431.62811279296875, "logps/rejected": -2177.2646484375, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -3.912644624710083, "rewards/margins": 17.43561363220215, "rewards/rejected": -21.34825897216797, "step": 3510 }, { "epoch": 0.4981602037928106, "grad_norm": 12.401286577941935, "learning_rate": 5.576348482465796e-08, "logits/chosen": -1.8618144989013672, "logits/rejected": -2.325762987136841, "logps/chosen": -346.45074462890625, "logps/rejected": -1973.8203125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -3.0893683433532715, "rewards/margins": 16.246410369873047, "rewards/rejected": -19.33577537536621, "step": 3520 }, { "epoch": 0.49957543164449475, "grad_norm": 0.0966342890809797, "learning_rate": 5.5606227394244376e-08, "logits/chosen": -1.7886426448822021, "logits/rejected": -2.363109588623047, "logps/chosen": -425.83514404296875, "logps/rejected": -2153.822265625, "loss": 0.0545, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.852980136871338, "rewards/margins": 17.252971649169922, "rewards/rejected": -21.1059513092041, "step": 3530 }, { "epoch": 0.5009906594961789, "grad_norm": 13.83738424283277, "learning_rate": 5.544896996383079e-08, "logits/chosen": -1.9178205728530884, "logits/rejected": -2.2537736892700195, "logps/chosen": -477.43817138671875, "logps/rejected": -2155.73876953125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -4.391613483428955, "rewards/margins": 16.76104736328125, "rewards/rejected": -21.152660369873047, "step": 3540 }, { "epoch": 0.502405887347863, "grad_norm": 5.313174503651349, "learning_rate": 5.52917125334172e-08, "logits/chosen": -1.9439064264297485, "logits/rejected": -2.405547618865967, "logps/chosen": -426.7547302246094, "logps/rejected": -2279.05029296875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -3.88254976272583, "rewards/margins": 18.48524284362793, "rewards/rejected": -22.3677921295166, "step": 3550 }, { "epoch": 0.5038211151995471, "grad_norm": 0.3087347966007789, "learning_rate": 5.513445510300362e-08, "logits/chosen": -1.9050893783569336, "logits/rejected": -2.356266975402832, "logps/chosen": -453.75469970703125, "logps/rejected": -2106.497314453125, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -4.145519733428955, "rewards/margins": 16.53099822998047, "rewards/rejected": -20.676517486572266, "step": 3560 }, { "epoch": 0.5052363430512312, "grad_norm": 28.263037712435267, "learning_rate": 5.497719767259003e-08, "logits/chosen": -1.9059789180755615, "logits/rejected": -2.424800395965576, "logps/chosen": -389.2085266113281, "logps/rejected": -2367.781982421875, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": -3.5166549682617188, "rewards/margins": 19.724050521850586, "rewards/rejected": -23.240707397460938, "step": 3570 }, { "epoch": 0.5066515709029153, "grad_norm": 0.17791508637887724, "learning_rate": 5.4819940242176445e-08, "logits/chosen": -1.8341060876846313, "logits/rejected": -2.302833318710327, "logps/chosen": -376.6608581542969, "logps/rejected": -2051.04443359375, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.3743796348571777, "rewards/margins": 16.705020904541016, "rewards/rejected": -20.07940101623535, "step": 3580 }, { "epoch": 0.5080667987545995, "grad_norm": 823.28907815375, "learning_rate": 5.466268281176285e-08, "logits/chosen": -1.7829341888427734, "logits/rejected": -2.279573678970337, "logps/chosen": -318.4897766113281, "logps/rejected": -2006.225830078125, "loss": 0.0585, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7995870113372803, "rewards/margins": 16.838542938232422, "rewards/rejected": -19.63812828063965, "step": 3590 }, { "epoch": 0.5094820266062836, "grad_norm": 11.437484314610414, "learning_rate": 5.4505425381349265e-08, "logits/chosen": -1.8359096050262451, "logits/rejected": -2.3126494884490967, "logps/chosen": -321.8591613769531, "logps/rejected": -1931.738525390625, "loss": 0.0485, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.824387788772583, "rewards/margins": 16.070606231689453, "rewards/rejected": -18.89499282836914, "step": 3600 }, { "epoch": 0.5108972544579677, "grad_norm": 81.69712196637859, "learning_rate": 5.4348167950935675e-08, "logits/chosen": -1.8515913486480713, "logits/rejected": -2.2430260181427, "logps/chosen": -322.1767272949219, "logps/rejected": -1970.490966796875, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -2.832655429840088, "rewards/margins": 16.442304611206055, "rewards/rejected": -19.274959564208984, "step": 3610 }, { "epoch": 0.5123124823096519, "grad_norm": 2.3113529653384264, "learning_rate": 5.419091052052209e-08, "logits/chosen": -1.9192806482315063, "logits/rejected": -2.3421473503112793, "logps/chosen": -413.50341796875, "logps/rejected": -2361.686767578125, "loss": 0.0164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7424347400665283, "rewards/margins": 19.463428497314453, "rewards/rejected": -23.20586395263672, "step": 3620 }, { "epoch": 0.513727710161336, "grad_norm": 212.58052474610946, "learning_rate": 5.40336530901085e-08, "logits/chosen": -1.8247449398040771, "logits/rejected": -2.2847695350646973, "logps/chosen": -362.20050048828125, "logps/rejected": -2197.05517578125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.2262187004089355, "rewards/margins": 18.297203063964844, "rewards/rejected": -21.523422241210938, "step": 3630 }, { "epoch": 0.5151429380130201, "grad_norm": 0.06614313673867636, "learning_rate": 5.387639565969492e-08, "logits/chosen": -1.8412069082260132, "logits/rejected": -2.3112998008728027, "logps/chosen": -428.9342346191406, "logps/rejected": -2204.323974609375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -3.9182682037353516, "rewards/margins": 17.714258193969727, "rewards/rejected": -21.632526397705078, "step": 3640 }, { "epoch": 0.5165581658647043, "grad_norm": 0.7992816355952512, "learning_rate": 5.3719138229281334e-08, "logits/chosen": -1.8472694158554077, "logits/rejected": -2.2803525924682617, "logps/chosen": -408.6439514160156, "logps/rejected": -2287.32666015625, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -3.7037353515625, "rewards/margins": 18.74980354309082, "rewards/rejected": -22.453540802001953, "step": 3650 }, { "epoch": 0.5179733937163883, "grad_norm": 2.4758390243226462, "learning_rate": 5.3561880798867744e-08, "logits/chosen": -1.9844038486480713, "logits/rejected": -2.466768264770508, "logps/chosen": -415.12420654296875, "logps/rejected": -2382.98193359375, "loss": 0.0342, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.764296054840088, "rewards/margins": 19.63140106201172, "rewards/rejected": -23.39569664001465, "step": 3660 }, { "epoch": 0.5193886215680724, "grad_norm": 0.022264946795882385, "learning_rate": 5.340462336845416e-08, "logits/chosen": -1.8750797510147095, "logits/rejected": -2.3622257709503174, "logps/chosen": -480.6292419433594, "logps/rejected": -2268.67333984375, "loss": 0.023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.409084796905518, "rewards/margins": 17.86863136291504, "rewards/rejected": -22.27771759033203, "step": 3670 }, { "epoch": 0.5208038494197565, "grad_norm": 67.32671821057119, "learning_rate": 5.324736593804057e-08, "logits/chosen": -1.9281885623931885, "logits/rejected": -2.386291980743408, "logps/chosen": -489.1935119628906, "logps/rejected": -2352.52978515625, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -4.512565612792969, "rewards/margins": 18.59969711303711, "rewards/rejected": -23.112260818481445, "step": 3680 }, { "epoch": 0.5222190772714407, "grad_norm": 0.1399080314731751, "learning_rate": 5.3090108507626987e-08, "logits/chosen": -1.8778225183486938, "logits/rejected": -2.3837532997131348, "logps/chosen": -458.46728515625, "logps/rejected": -2322.582275390625, "loss": 0.0584, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.203193664550781, "rewards/margins": 18.595497131347656, "rewards/rejected": -22.798694610595703, "step": 3690 }, { "epoch": 0.5236343051231248, "grad_norm": 170.97309767513477, "learning_rate": 5.293285107721339e-08, "logits/chosen": -1.882636308670044, "logits/rejected": -2.3739171028137207, "logps/chosen": -431.23687744140625, "logps/rejected": -2283.082275390625, "loss": 0.0128, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.935701847076416, "rewards/margins": 18.50697135925293, "rewards/rejected": -22.442672729492188, "step": 3700 }, { "epoch": 0.5250495329748089, "grad_norm": 3.92923003859712, "learning_rate": 5.277559364679981e-08, "logits/chosen": -1.9527801275253296, "logits/rejected": -2.343839168548584, "logps/chosen": -520.0038452148438, "logps/rejected": -2568.54296875, "loss": 0.0697, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.787331581115723, "rewards/margins": 20.42181968688965, "rewards/rejected": -25.209148406982422, "step": 3710 }, { "epoch": 0.5264647608264931, "grad_norm": 110.93476062439768, "learning_rate": 5.2618336216386216e-08, "logits/chosen": -1.8612161874771118, "logits/rejected": -2.361910343170166, "logps/chosen": -530.6429443359375, "logps/rejected": -2494.368896484375, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -4.900094509124756, "rewards/margins": 19.603832244873047, "rewards/rejected": -24.503929138183594, "step": 3720 }, { "epoch": 0.5278799886781772, "grad_norm": 0.9209765154244541, "learning_rate": 5.246107878597263e-08, "logits/chosen": -1.7482706308364868, "logits/rejected": -2.205848217010498, "logps/chosen": -403.29302978515625, "logps/rejected": -2405.90283203125, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -3.6266791820526123, "rewards/margins": 19.99715232849121, "rewards/rejected": -23.623828887939453, "step": 3730 }, { "epoch": 0.5292952165298613, "grad_norm": 19.255388476805308, "learning_rate": 5.2303821355559056e-08, "logits/chosen": -1.8358147144317627, "logits/rejected": -2.2994394302368164, "logps/chosen": -414.51202392578125, "logps/rejected": -2079.67236328125, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -3.7490944862365723, "rewards/margins": 16.617252349853516, "rewards/rejected": -20.36634635925293, "step": 3740 }, { "epoch": 0.5307104443815455, "grad_norm": 33.38770108294165, "learning_rate": 5.214656392514546e-08, "logits/chosen": -1.8439114093780518, "logits/rejected": -2.38391375541687, "logps/chosen": -425.5856018066406, "logps/rejected": -2341.14794921875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.8664588928222656, "rewards/margins": 19.135740280151367, "rewards/rejected": -23.002201080322266, "step": 3750 }, { "epoch": 0.5321256722332296, "grad_norm": 1.469723442631867, "learning_rate": 5.1989306494731875e-08, "logits/chosen": -1.8605003356933594, "logits/rejected": -2.3654274940490723, "logps/chosen": -577.1530151367188, "logps/rejected": -2359.676513671875, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -5.350250244140625, "rewards/margins": 17.81048011779785, "rewards/rejected": -23.16073226928711, "step": 3760 }, { "epoch": 0.5335409000849136, "grad_norm": 356.7502454680932, "learning_rate": 5.1832049064318285e-08, "logits/chosen": -1.8751665353775024, "logits/rejected": -2.34668231010437, "logps/chosen": -458.57049560546875, "logps/rejected": -2238.554443359375, "loss": 0.0803, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.183899879455566, "rewards/margins": 17.79031753540039, "rewards/rejected": -21.974220275878906, "step": 3770 }, { "epoch": 0.5349561279365977, "grad_norm": 1.3804492765300094, "learning_rate": 5.16747916339047e-08, "logits/chosen": -1.8365122079849243, "logits/rejected": -2.400254964828491, "logps/chosen": -451.94036865234375, "logps/rejected": -2528.48828125, "loss": 0.0395, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.143370628356934, "rewards/margins": 20.727176666259766, "rewards/rejected": -24.87054443359375, "step": 3780 }, { "epoch": 0.5363713557882819, "grad_norm": 7.881298170147535, "learning_rate": 5.151753420349111e-08, "logits/chosen": -1.857984185218811, "logits/rejected": -2.424626111984253, "logps/chosen": -517.6944580078125, "logps/rejected": -2432.59033203125, "loss": 0.0332, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.791257381439209, "rewards/margins": 19.13583755493164, "rewards/rejected": -23.92709732055664, "step": 3790 }, { "epoch": 0.537786583639966, "grad_norm": 3.6433140584555606, "learning_rate": 5.136027677307753e-08, "logits/chosen": -1.9091355800628662, "logits/rejected": -2.4264349937438965, "logps/chosen": -520.4908447265625, "logps/rejected": -2560.353515625, "loss": 0.0188, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.8187336921691895, "rewards/margins": 20.3769588470459, "rewards/rejected": -25.195693969726562, "step": 3800 }, { "epoch": 0.5392018114916501, "grad_norm": 438.25728865987764, "learning_rate": 5.120301934266393e-08, "logits/chosen": -1.835095763206482, "logits/rejected": -2.34812331199646, "logps/chosen": -456.94952392578125, "logps/rejected": -2544.940673828125, "loss": 0.0231, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.16293478012085, "rewards/margins": 20.853389739990234, "rewards/rejected": -25.016324996948242, "step": 3810 }, { "epoch": 0.5406170393433343, "grad_norm": 22.458379079150568, "learning_rate": 5.1045761912250354e-08, "logits/chosen": -1.8962056636810303, "logits/rejected": -2.3125247955322266, "logps/chosen": -399.58795166015625, "logps/rejected": -2149.83447265625, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.612574338912964, "rewards/margins": 17.484859466552734, "rewards/rejected": -21.097436904907227, "step": 3820 }, { "epoch": 0.5420322671950184, "grad_norm": 503.0881792331743, "learning_rate": 5.088850448183677e-08, "logits/chosen": -1.8447914123535156, "logits/rejected": -2.309113025665283, "logps/chosen": -404.8933410644531, "logps/rejected": -2221.142578125, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": -3.670100688934326, "rewards/margins": 18.106487274169922, "rewards/rejected": -21.776586532592773, "step": 3830 }, { "epoch": 0.5434474950467025, "grad_norm": 0.5769444599188445, "learning_rate": 5.0731247051423174e-08, "logits/chosen": -1.8871301412582397, "logits/rejected": -2.3481833934783936, "logps/chosen": -393.566650390625, "logps/rejected": -2049.52685546875, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -3.5453922748565674, "rewards/margins": 16.53521728515625, "rewards/rejected": -20.080608367919922, "step": 3840 }, { "epoch": 0.5448627228983867, "grad_norm": 111.21273887430876, "learning_rate": 5.05739896210096e-08, "logits/chosen": -1.8945280313491821, "logits/rejected": -2.363098621368408, "logps/chosen": -413.6650390625, "logps/rejected": -2223.688720703125, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -3.7550182342529297, "rewards/margins": 18.10459327697754, "rewards/rejected": -21.85961151123047, "step": 3850 }, { "epoch": 0.5462779507500708, "grad_norm": 4.232618061102017, "learning_rate": 5.0416732190596e-08, "logits/chosen": -1.893121361732483, "logits/rejected": -2.3533928394317627, "logps/chosen": -363.67138671875, "logps/rejected": -2312.82177734375, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -3.238542079925537, "rewards/margins": 19.44230079650879, "rewards/rejected": -22.680843353271484, "step": 3860 }, { "epoch": 0.5476931786017549, "grad_norm": 184.6915442578083, "learning_rate": 5.0259474760182417e-08, "logits/chosen": -1.868242621421814, "logits/rejected": -2.423828601837158, "logps/chosen": -406.3858947753906, "logps/rejected": -2365.56298828125, "loss": 0.0347, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6605803966522217, "rewards/margins": 19.577213287353516, "rewards/rejected": -23.237794876098633, "step": 3870 }, { "epoch": 0.5491084064534391, "grad_norm": 19.069276029201344, "learning_rate": 5.0102217329768826e-08, "logits/chosen": -1.8565229177474976, "logits/rejected": -2.4172401428222656, "logps/chosen": -388.0845031738281, "logps/rejected": -2431.721435546875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -3.4752254486083984, "rewards/margins": 20.419158935546875, "rewards/rejected": -23.89438247680664, "step": 3880 }, { "epoch": 0.5505236343051231, "grad_norm": 1.751790348822255, "learning_rate": 4.994495989935524e-08, "logits/chosen": -1.8492801189422607, "logits/rejected": -2.3796420097351074, "logps/chosen": -409.97943115234375, "logps/rejected": -2539.96142578125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.721935987472534, "rewards/margins": 21.238689422607422, "rewards/rejected": -24.960622787475586, "step": 3890 }, { "epoch": 0.5519388621568072, "grad_norm": 0.3033920625156282, "learning_rate": 4.978770246894165e-08, "logits/chosen": -1.8679769039154053, "logits/rejected": -2.3982651233673096, "logps/chosen": -422.11199951171875, "logps/rejected": -2403.30029296875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.83569598197937, "rewards/margins": 19.784456253051758, "rewards/rejected": -23.620153427124023, "step": 3900 }, { "epoch": 0.5533540900084913, "grad_norm": 12.866691217314886, "learning_rate": 4.963044503852807e-08, "logits/chosen": -1.8358609676361084, "logits/rejected": -2.437335252761841, "logps/chosen": -375.9891662597656, "logps/rejected": -2422.718505859375, "loss": 0.0682, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.372537612915039, "rewards/margins": 20.427194595336914, "rewards/rejected": -23.799732208251953, "step": 3910 }, { "epoch": 0.5547693178601755, "grad_norm": 0.044293634753242384, "learning_rate": 4.947318760811448e-08, "logits/chosen": -1.849368691444397, "logits/rejected": -2.4594030380249023, "logps/chosen": -381.52191162109375, "logps/rejected": -2456.17138671875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.415750503540039, "rewards/margins": 20.705904006958008, "rewards/rejected": -24.121654510498047, "step": 3920 }, { "epoch": 0.5561845457118596, "grad_norm": 0.1467327183067927, "learning_rate": 4.9315930177700895e-08, "logits/chosen": -1.873440146446228, "logits/rejected": -2.392845630645752, "logps/chosen": -382.8828125, "logps/rejected": -2437.91650390625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -3.4529194831848145, "rewards/margins": 20.4814453125, "rewards/rejected": -23.934368133544922, "step": 3930 }, { "epoch": 0.5575997735635437, "grad_norm": 17.71297008160105, "learning_rate": 4.9158672747287305e-08, "logits/chosen": -1.8013103008270264, "logits/rejected": -2.3737854957580566, "logps/chosen": -397.6031188964844, "logps/rejected": -2315.77783203125, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -3.5982868671417236, "rewards/margins": 19.163944244384766, "rewards/rejected": -22.76223373413086, "step": 3940 }, { "epoch": 0.5590150014152279, "grad_norm": 0.07391310706009434, "learning_rate": 4.9001415316873715e-08, "logits/chosen": -1.8835313320159912, "logits/rejected": -2.3951385021209717, "logps/chosen": -439.5311584472656, "logps/rejected": -2333.31005859375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.00670862197876, "rewards/margins": 18.904666900634766, "rewards/rejected": -22.911375045776367, "step": 3950 }, { "epoch": 0.560430229266912, "grad_norm": 10.726537874429253, "learning_rate": 4.884415788646014e-08, "logits/chosen": -1.8874704837799072, "logits/rejected": -2.3653159141540527, "logps/chosen": -451.7537536621094, "logps/rejected": -2547.031005859375, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -4.114022254943848, "rewards/margins": 20.921409606933594, "rewards/rejected": -25.035432815551758, "step": 3960 }, { "epoch": 0.5618454571185961, "grad_norm": 278.8661769116574, "learning_rate": 4.868690045604655e-08, "logits/chosen": -1.8364877700805664, "logits/rejected": -2.409538984298706, "logps/chosen": -416.8857421875, "logps/rejected": -2406.404052734375, "loss": 0.0242, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.779447078704834, "rewards/margins": 19.881492614746094, "rewards/rejected": -23.660938262939453, "step": 3970 }, { "epoch": 0.5632606849702803, "grad_norm": 511.81977361373356, "learning_rate": 4.852964302563296e-08, "logits/chosen": -1.7998443841934204, "logits/rejected": -2.3145909309387207, "logps/chosen": -441.7867736816406, "logps/rejected": -2476.16064453125, "loss": 0.0759, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.029289245605469, "rewards/margins": 20.3055419921875, "rewards/rejected": -24.33483123779297, "step": 3980 }, { "epoch": 0.5646759128219644, "grad_norm": 318.6442919783841, "learning_rate": 4.8372385595219374e-08, "logits/chosen": -1.8704845905303955, "logits/rejected": -2.3830671310424805, "logps/chosen": -423.968017578125, "logps/rejected": -2613.470703125, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -3.842798948287964, "rewards/margins": 21.851581573486328, "rewards/rejected": -25.694377899169922, "step": 3990 }, { "epoch": 0.5660911406736484, "grad_norm": 1.7887363162884409, "learning_rate": 4.8215128164805784e-08, "logits/chosen": -1.930311918258667, "logits/rejected": -2.4574711322784424, "logps/chosen": -457.2897033691406, "logps/rejected": -2442.10107421875, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -4.167903900146484, "rewards/margins": 19.823707580566406, "rewards/rejected": -23.99161148071289, "step": 4000 }, { "epoch": 0.5675063685253325, "grad_norm": 15.766968017879428, "learning_rate": 4.80578707343922e-08, "logits/chosen": -1.94171941280365, "logits/rejected": -2.4514975547790527, "logps/chosen": -429.9342346191406, "logps/rejected": -2359.202880859375, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -3.9002552032470703, "rewards/margins": 19.288837432861328, "rewards/rejected": -23.1890926361084, "step": 4010 }, { "epoch": 0.5689215963770167, "grad_norm": 62.969100086355986, "learning_rate": 4.790061330397861e-08, "logits/chosen": -1.9510552883148193, "logits/rejected": -2.474898338317871, "logps/chosen": -537.9578247070312, "logps/rejected": -2553.19091796875, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -4.981211185455322, "rewards/margins": 20.092832565307617, "rewards/rejected": -25.074045181274414, "step": 4020 }, { "epoch": 0.5703368242287008, "grad_norm": 238.27207806076908, "learning_rate": 4.774335587356502e-08, "logits/chosen": -1.8977460861206055, "logits/rejected": -2.456779956817627, "logps/chosen": -383.25164794921875, "logps/rejected": -2385.893310546875, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -3.4705588817596436, "rewards/margins": 19.983476638793945, "rewards/rejected": -23.45403289794922, "step": 4030 }, { "epoch": 0.5717520520803849, "grad_norm": 0.10809545341997194, "learning_rate": 4.758609844315144e-08, "logits/chosen": -1.9524784088134766, "logits/rejected": -2.4793827533721924, "logps/chosen": -398.23004150390625, "logps/rejected": -2446.16162109375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.592801570892334, "rewards/margins": 20.42794418334961, "rewards/rejected": -24.0207462310791, "step": 4040 }, { "epoch": 0.5731672799320691, "grad_norm": 193.92594690740927, "learning_rate": 4.742884101273785e-08, "logits/chosen": -1.9636119604110718, "logits/rejected": -2.585470676422119, "logps/chosen": -447.9283752441406, "logps/rejected": -2739.507568359375, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.090723037719727, "rewards/margins": 22.863323211669922, "rewards/rejected": -26.95404624938965, "step": 4050 }, { "epoch": 0.5745825077837532, "grad_norm": 12.852259972266632, "learning_rate": 4.727158358232426e-08, "logits/chosen": -2.0185599327087402, "logits/rejected": -2.6129539012908936, "logps/chosen": -481.4539489746094, "logps/rejected": -2681.25537109375, "loss": 0.018, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.411688804626465, "rewards/margins": 21.96710777282715, "rewards/rejected": -26.378799438476562, "step": 4060 }, { "epoch": 0.5759977356354373, "grad_norm": 44.42913109177225, "learning_rate": 4.711432615191068e-08, "logits/chosen": -2.0175576210021973, "logits/rejected": -2.527292490005493, "logps/chosen": -528.8375244140625, "logps/rejected": -2603.54150390625, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": -4.897642135620117, "rewards/margins": 20.746618270874023, "rewards/rejected": -25.644262313842773, "step": 4070 }, { "epoch": 0.5774129634871215, "grad_norm": 0.08336654272325125, "learning_rate": 4.695706872149709e-08, "logits/chosen": -2.032123327255249, "logits/rejected": -2.5663557052612305, "logps/chosen": -409.2356262207031, "logps/rejected": -2483.84423828125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.709333896636963, "rewards/margins": 20.736385345458984, "rewards/rejected": -24.44571876525879, "step": 4080 }, { "epoch": 0.5788281913388056, "grad_norm": 6.360108107315178, "learning_rate": 4.67998112910835e-08, "logits/chosen": -1.9449317455291748, "logits/rejected": -2.5333988666534424, "logps/chosen": -550.9464111328125, "logps/rejected": -2713.607421875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -5.109865665435791, "rewards/margins": 21.622282028198242, "rewards/rejected": -26.732147216796875, "step": 4090 }, { "epoch": 0.5802434191904897, "grad_norm": 0.1526997183561209, "learning_rate": 4.6642553860669916e-08, "logits/chosen": -1.9800151586532593, "logits/rejected": -2.549889087677002, "logps/chosen": -445.9917907714844, "logps/rejected": -2758.393310546875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -4.0784125328063965, "rewards/margins": 23.073436737060547, "rewards/rejected": -27.151851654052734, "step": 4100 }, { "epoch": 0.5816586470421737, "grad_norm": 0.14956717785007403, "learning_rate": 4.6485296430256325e-08, "logits/chosen": -1.9462106227874756, "logits/rejected": -2.4485926628112793, "logps/chosen": -476.6650390625, "logps/rejected": -2640.75830078125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.371711730957031, "rewards/margins": 21.605037689208984, "rewards/rejected": -25.976749420166016, "step": 4110 }, { "epoch": 0.5830738748938579, "grad_norm": 9.944658381443123, "learning_rate": 4.632803899984274e-08, "logits/chosen": -1.9750303030014038, "logits/rejected": -2.539966344833374, "logps/chosen": -409.3119201660156, "logps/rejected": -2678.294921875, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -3.709354877471924, "rewards/margins": 22.664581298828125, "rewards/rejected": -26.37393569946289, "step": 4120 }, { "epoch": 0.584489102745542, "grad_norm": 145.06333690818022, "learning_rate": 4.617078156942915e-08, "logits/chosen": -1.9301687479019165, "logits/rejected": -2.514472484588623, "logps/chosen": -493.65911865234375, "logps/rejected": -2783.908203125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -4.549119472503662, "rewards/margins": 22.864065170288086, "rewards/rejected": -27.41318702697754, "step": 4130 }, { "epoch": 0.5859043305972261, "grad_norm": 1.9218367026482133, "learning_rate": 4.601352413901556e-08, "logits/chosen": -1.950254201889038, "logits/rejected": -2.5109610557556152, "logps/chosen": -513.0213623046875, "logps/rejected": -2833.514892578125, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": -4.739321708679199, "rewards/margins": 23.141786575317383, "rewards/rejected": -27.8811092376709, "step": 4140 }, { "epoch": 0.5873195584489103, "grad_norm": 0.3857422748125449, "learning_rate": 4.5856266708601985e-08, "logits/chosen": -1.9515314102172852, "logits/rejected": -2.428866147994995, "logps/chosen": -493.66680908203125, "logps/rejected": -2504.67529296875, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -4.556714057922363, "rewards/margins": 20.08037757873535, "rewards/rejected": -24.63709259033203, "step": 4150 }, { "epoch": 0.5887347863005944, "grad_norm": 21.412764901004646, "learning_rate": 4.5699009278188394e-08, "logits/chosen": -1.9629024267196655, "logits/rejected": -2.56807279586792, "logps/chosen": -400.377197265625, "logps/rejected": -2545.56884765625, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": -3.6328835487365723, "rewards/margins": 21.376094818115234, "rewards/rejected": -25.008975982666016, "step": 4160 }, { "epoch": 0.5901500141522785, "grad_norm": 0.01815481765340601, "learning_rate": 4.5541751847774804e-08, "logits/chosen": -2.029883861541748, "logits/rejected": -2.5390877723693848, "logps/chosen": -432.388671875, "logps/rejected": -2414.32177734375, "loss": 0.0513, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9360718727111816, "rewards/margins": 19.825641632080078, "rewards/rejected": -23.761714935302734, "step": 4170 }, { "epoch": 0.5915652420039627, "grad_norm": 0.022006772472067136, "learning_rate": 4.538449441736122e-08, "logits/chosen": -1.993638277053833, "logits/rejected": -2.5649757385253906, "logps/chosen": -418.63116455078125, "logps/rejected": -2429.84375, "loss": 0.0594, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8003921508789062, "rewards/margins": 20.101177215576172, "rewards/rejected": -23.901573181152344, "step": 4180 }, { "epoch": 0.5929804698556468, "grad_norm": 2.7881002233728047, "learning_rate": 4.522723698694763e-08, "logits/chosen": -1.9868329763412476, "logits/rejected": -2.472369432449341, "logps/chosen": -380.75958251953125, "logps/rejected": -2200.09423828125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.409226179122925, "rewards/margins": 18.20730209350586, "rewards/rejected": -21.61652946472168, "step": 4190 }, { "epoch": 0.5943956977073309, "grad_norm": 1.1433416631122706, "learning_rate": 4.506997955653404e-08, "logits/chosen": -1.9939043521881104, "logits/rejected": -2.5214357376098633, "logps/chosen": -357.73773193359375, "logps/rejected": -2319.41455078125, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -3.1927247047424316, "rewards/margins": 19.613414764404297, "rewards/rejected": -22.806140899658203, "step": 4200 }, { "epoch": 0.595810925559015, "grad_norm": 10.480137356872097, "learning_rate": 4.491272212612046e-08, "logits/chosen": -1.9711675643920898, "logits/rejected": -2.543412446975708, "logps/chosen": -389.94061279296875, "logps/rejected": -2317.544189453125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.5230331420898438, "rewards/margins": 19.234691619873047, "rewards/rejected": -22.757720947265625, "step": 4210 }, { "epoch": 0.5972261534106991, "grad_norm": 1.4796805813075022, "learning_rate": 4.475546469570687e-08, "logits/chosen": -2.060458183288574, "logits/rejected": -2.6311755180358887, "logps/chosen": -378.7228088378906, "logps/rejected": -2323.20458984375, "loss": 0.0111, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3834781646728516, "rewards/margins": 19.42484474182129, "rewards/rejected": -22.808320999145508, "step": 4220 }, { "epoch": 0.5986413812623832, "grad_norm": 8.385601183397693, "learning_rate": 4.459820726529328e-08, "logits/chosen": -2.1007328033447266, "logits/rejected": -2.62593412399292, "logps/chosen": -396.68487548828125, "logps/rejected": -2628.248046875, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": -3.5762991905212402, "rewards/margins": 22.294382095336914, "rewards/rejected": -25.870681762695312, "step": 4230 }, { "epoch": 0.6000566091140673, "grad_norm": 202.43601025425727, "learning_rate": 4.44409498348797e-08, "logits/chosen": -2.0188963413238525, "logits/rejected": -2.581577777862549, "logps/chosen": -399.34002685546875, "logps/rejected": -2442.0107421875, "loss": 0.0335, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.612683057785034, "rewards/margins": 20.421228408813477, "rewards/rejected": -24.033910751342773, "step": 4240 }, { "epoch": 0.6014718369657515, "grad_norm": 0.15996517931240298, "learning_rate": 4.428369240446611e-08, "logits/chosen": -2.130614757537842, "logits/rejected": -2.5980238914489746, "logps/chosen": -524.40380859375, "logps/rejected": -2554.58935546875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -4.852993488311768, "rewards/margins": 20.271297454833984, "rewards/rejected": -25.124292373657227, "step": 4250 }, { "epoch": 0.6028870648174356, "grad_norm": 3.7366877898525, "learning_rate": 4.4126434974052526e-08, "logits/chosen": -2.0863800048828125, "logits/rejected": -2.6363143920898438, "logps/chosen": -440.59039306640625, "logps/rejected": -2677.32666015625, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -4.0341973304748535, "rewards/margins": 22.329402923583984, "rewards/rejected": -26.363597869873047, "step": 4260 }, { "epoch": 0.6043022926691197, "grad_norm": 10.40754552304518, "learning_rate": 4.3969177543638936e-08, "logits/chosen": -2.0854134559631348, "logits/rejected": -2.759720802307129, "logps/chosen": -425.31378173828125, "logps/rejected": -2697.869140625, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -3.8750414848327637, "rewards/margins": 22.662187576293945, "rewards/rejected": -26.537227630615234, "step": 4270 }, { "epoch": 0.6057175205208039, "grad_norm": 14.82256598096003, "learning_rate": 4.3811920113225346e-08, "logits/chosen": -2.1191301345825195, "logits/rejected": -2.6962432861328125, "logps/chosen": -415.6065979003906, "logps/rejected": -2428.18701171875, "loss": 0.0284, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7916436195373535, "rewards/margins": 20.074899673461914, "rewards/rejected": -23.86654281616211, "step": 4280 }, { "epoch": 0.607132748372488, "grad_norm": 3.533227083417911, "learning_rate": 4.365466268281176e-08, "logits/chosen": -2.0340840816497803, "logits/rejected": -2.624359130859375, "logps/chosen": -425.6732482910156, "logps/rejected": -2528.190185546875, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -3.86889386177063, "rewards/margins": 21.00031089782715, "rewards/rejected": -24.869205474853516, "step": 4290 }, { "epoch": 0.6085479762241721, "grad_norm": 0.936751779893379, "learning_rate": 4.349740525239817e-08, "logits/chosen": -2.0071263313293457, "logits/rejected": -2.6219122409820557, "logps/chosen": -473.16278076171875, "logps/rejected": -2646.717041015625, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -4.34275484085083, "rewards/margins": 21.70262336730957, "rewards/rejected": -26.04537582397461, "step": 4300 }, { "epoch": 0.6099632040758562, "grad_norm": 3.5464839726531814, "learning_rate": 4.334014782198459e-08, "logits/chosen": -2.025193929672241, "logits/rejected": -2.5818653106689453, "logps/chosen": -351.2984619140625, "logps/rejected": -2376.37548828125, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -3.129291534423828, "rewards/margins": 20.211658477783203, "rewards/rejected": -23.34095001220703, "step": 4310 }, { "epoch": 0.6113784319275404, "grad_norm": 9.377483779919254, "learning_rate": 4.3182890391571e-08, "logits/chosen": -2.0745902061462402, "logits/rejected": -2.6512856483459473, "logps/chosen": -413.64312744140625, "logps/rejected": -2577.5859375, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7463860511779785, "rewards/margins": 21.61615753173828, "rewards/rejected": -25.3625431060791, "step": 4320 }, { "epoch": 0.6127936597792245, "grad_norm": 21.938622397355967, "learning_rate": 4.3025632961157415e-08, "logits/chosen": -2.101130723953247, "logits/rejected": -2.5422861576080322, "logps/chosen": -494.890625, "logps/rejected": -2364.568359375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -4.557198524475098, "rewards/margins": 18.672292709350586, "rewards/rejected": -23.229490280151367, "step": 4330 }, { "epoch": 0.6142088876309085, "grad_norm": 6.304473203798426, "learning_rate": 4.2868375530743825e-08, "logits/chosen": -2.1849138736724854, "logits/rejected": -2.797102928161621, "logps/chosen": -586.622802734375, "logps/rejected": -2697.25146484375, "loss": 0.0336, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.466094970703125, "rewards/margins": 21.038188934326172, "rewards/rejected": -26.504281997680664, "step": 4340 }, { "epoch": 0.6156241154825927, "grad_norm": 293.5498232964777, "learning_rate": 4.271111810033024e-08, "logits/chosen": -2.107938289642334, "logits/rejected": -2.659456968307495, "logps/chosen": -379.8336181640625, "logps/rejected": -2573.326904296875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.3935189247131348, "rewards/margins": 21.92513656616211, "rewards/rejected": -25.31865692138672, "step": 4350 }, { "epoch": 0.6170393433342768, "grad_norm": 1.2723619423608143, "learning_rate": 4.255386066991665e-08, "logits/chosen": -2.1578469276428223, "logits/rejected": -2.682788133621216, "logps/chosen": -483.5526428222656, "logps/rejected": -2484.32666015625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -4.455209732055664, "rewards/margins": 19.97170639038086, "rewards/rejected": -24.42691421508789, "step": 4360 }, { "epoch": 0.6184545711859609, "grad_norm": 0.0032676851411970512, "learning_rate": 4.239660323950307e-08, "logits/chosen": -2.0844788551330566, "logits/rejected": -2.634392738342285, "logps/chosen": -454.9593811035156, "logps/rejected": -2532.85400390625, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -4.161436557769775, "rewards/margins": 20.77223014831543, "rewards/rejected": -24.933666229248047, "step": 4370 }, { "epoch": 0.619869799037645, "grad_norm": 89.82036490561123, "learning_rate": 4.223934580908948e-08, "logits/chosen": -2.105895519256592, "logits/rejected": -2.7100675106048584, "logps/chosen": -398.8532409667969, "logps/rejected": -2650.884765625, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -3.594710111618042, "rewards/margins": 22.490280151367188, "rewards/rejected": -26.08498764038086, "step": 4380 }, { "epoch": 0.6212850268893292, "grad_norm": 212.477934171152, "learning_rate": 4.208208837867589e-08, "logits/chosen": -2.165888547897339, "logits/rejected": -2.739067316055298, "logps/chosen": -417.73663330078125, "logps/rejected": -2701.78271484375, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": -3.7796027660369873, "rewards/margins": 22.80507469177246, "rewards/rejected": -26.58467674255371, "step": 4390 }, { "epoch": 0.6227002547410133, "grad_norm": 310.8556017794845, "learning_rate": 4.1924830948262303e-08, "logits/chosen": -2.2332944869995117, "logits/rejected": -2.7395036220550537, "logps/chosen": -435.93426513671875, "logps/rejected": -2532.94580078125, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -3.9594740867614746, "rewards/margins": 20.949642181396484, "rewards/rejected": -24.90911293029785, "step": 4400 }, { "epoch": 0.6241154825926974, "grad_norm": 0.3393969950772514, "learning_rate": 4.176757351784871e-08, "logits/chosen": -2.175503730773926, "logits/rejected": -2.7957568168640137, "logps/chosen": -409.7478942871094, "logps/rejected": -2340.3310546875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.690502882003784, "rewards/margins": 19.304031372070312, "rewards/rejected": -22.994531631469727, "step": 4410 }, { "epoch": 0.6255307104443816, "grad_norm": 42.15510695472822, "learning_rate": 4.161031608743513e-08, "logits/chosen": -2.229233503341675, "logits/rejected": -2.7968392372131348, "logps/chosen": -499.3866271972656, "logps/rejected": -2735.408935546875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.6182541847229, "rewards/margins": 22.342121124267578, "rewards/rejected": -26.960376739501953, "step": 4420 }, { "epoch": 0.6269459382960657, "grad_norm": 618.2717240834519, "learning_rate": 4.1453058657021546e-08, "logits/chosen": -2.1718642711639404, "logits/rejected": -2.7798149585723877, "logps/chosen": -398.8190612792969, "logps/rejected": -2509.0341796875, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": -3.5974178314208984, "rewards/margins": 21.087202072143555, "rewards/rejected": -24.684619903564453, "step": 4430 }, { "epoch": 0.6283611661477498, "grad_norm": 0.7971095245357385, "learning_rate": 4.1295801226607956e-08, "logits/chosen": -2.2299039363861084, "logits/rejected": -2.776775598526001, "logps/chosen": -455.97271728515625, "logps/rejected": -2428.891357421875, "loss": 0.0303, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.159205436706543, "rewards/margins": 19.699779510498047, "rewards/rejected": -23.858983993530273, "step": 4440 }, { "epoch": 0.6297763939994339, "grad_norm": 0.0730654545180052, "learning_rate": 4.113854379619437e-08, "logits/chosen": -2.132262706756592, "logits/rejected": -2.7880513668060303, "logps/chosen": -389.67828369140625, "logps/rejected": -2444.65966796875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.5257956981658936, "rewards/margins": 20.52001190185547, "rewards/rejected": -24.045808792114258, "step": 4450 }, { "epoch": 0.631191621851118, "grad_norm": 32.803442209085894, "learning_rate": 4.098128636578078e-08, "logits/chosen": -2.189816951751709, "logits/rejected": -2.7988622188568115, "logps/chosen": -419.96929931640625, "logps/rejected": -2539.849609375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -3.813459873199463, "rewards/margins": 21.191980361938477, "rewards/rejected": -25.005441665649414, "step": 4460 }, { "epoch": 0.6326068497028021, "grad_norm": 273.5640124715191, "learning_rate": 4.082402893536719e-08, "logits/chosen": -2.1716790199279785, "logits/rejected": -2.7869367599487305, "logps/chosen": -490.68212890625, "logps/rejected": -2622.0029296875, "loss": 0.046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.511258602142334, "rewards/margins": 21.29242515563965, "rewards/rejected": -25.803680419921875, "step": 4470 }, { "epoch": 0.6340220775544863, "grad_norm": 199.29342629115118, "learning_rate": 4.066677150495361e-08, "logits/chosen": -2.179112672805786, "logits/rejected": -2.766667127609253, "logps/chosen": -446.0816955566406, "logps/rejected": -2572.841796875, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -4.047817707061768, "rewards/margins": 21.262977600097656, "rewards/rejected": -25.310794830322266, "step": 4480 }, { "epoch": 0.6354373054061704, "grad_norm": 217.0682755230799, "learning_rate": 4.050951407454002e-08, "logits/chosen": -2.2415144443511963, "logits/rejected": -2.833223581314087, "logps/chosen": -515.5640869140625, "logps/rejected": -2781.831787109375, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -4.759807109832764, "rewards/margins": 22.642742156982422, "rewards/rejected": -27.40254783630371, "step": 4490 }, { "epoch": 0.6368525332578545, "grad_norm": 4.201961978920339, "learning_rate": 4.035225664412643e-08, "logits/chosen": -2.163879632949829, "logits/rejected": -2.770552158355713, "logps/chosen": -386.10882568359375, "logps/rejected": -2679.5087890625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.4777755737304688, "rewards/margins": 22.869714736938477, "rewards/rejected": -26.347488403320312, "step": 4500 }, { "epoch": 0.6382677611095386, "grad_norm": 75.40674656993185, "learning_rate": 4.0194999213712845e-08, "logits/chosen": -2.273860216140747, "logits/rejected": -2.8366010189056396, "logps/chosen": -453.8173828125, "logps/rejected": -2437.501953125, "loss": 0.0527, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.143465995788574, "rewards/margins": 19.829713821411133, "rewards/rejected": -23.97317886352539, "step": 4510 }, { "epoch": 0.6396829889612228, "grad_norm": 17.185056795644353, "learning_rate": 4.003774178329926e-08, "logits/chosen": -2.1906943321228027, "logits/rejected": -2.7707178592681885, "logps/chosen": -427.0856018066406, "logps/rejected": -2349.642578125, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8678207397460938, "rewards/margins": 19.2106990814209, "rewards/rejected": -23.07851791381836, "step": 4520 }, { "epoch": 0.6410982168129069, "grad_norm": 0.002893202197821212, "learning_rate": 3.988048435288567e-08, "logits/chosen": -2.248884677886963, "logits/rejected": -2.81706166267395, "logps/chosen": -415.12701416015625, "logps/rejected": -2666.792724609375, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -3.7561233043670654, "rewards/margins": 22.499940872192383, "rewards/rejected": -26.25606346130371, "step": 4530 }, { "epoch": 0.642513444664591, "grad_norm": 3.317324124564132, "learning_rate": 3.972322692247209e-08, "logits/chosen": -2.190171718597412, "logits/rejected": -2.7163100242614746, "logps/chosen": -454.122802734375, "logps/rejected": -2432.881591796875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.157841682434082, "rewards/margins": 19.74380874633789, "rewards/rejected": -23.90165138244629, "step": 4540 }, { "epoch": 0.6439286725162752, "grad_norm": 2.259370734817384, "learning_rate": 3.95659694920585e-08, "logits/chosen": -2.233025312423706, "logits/rejected": -2.7819464206695557, "logps/chosen": -504.3277893066406, "logps/rejected": -2747.923828125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.651813983917236, "rewards/margins": 22.392444610595703, "rewards/rejected": -27.04425621032715, "step": 4550 }, { "epoch": 0.6453439003679592, "grad_norm": 11.238835735111142, "learning_rate": 3.9408712061644914e-08, "logits/chosen": -2.146369218826294, "logits/rejected": -2.789036512374878, "logps/chosen": -444.6117248535156, "logps/rejected": -2598.60888671875, "loss": 0.0338, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.076054096221924, "rewards/margins": 21.517114639282227, "rewards/rejected": -25.593168258666992, "step": 4560 }, { "epoch": 0.6467591282196433, "grad_norm": 515.2906295017784, "learning_rate": 3.9251454631231324e-08, "logits/chosen": -2.2083237171173096, "logits/rejected": -2.7664268016815186, "logps/chosen": -522.0889892578125, "logps/rejected": -2571.46923828125, "loss": 0.0489, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.827948093414307, "rewards/margins": 20.43856430053711, "rewards/rejected": -25.26651382446289, "step": 4570 }, { "epoch": 0.6481743560713275, "grad_norm": 3.8442209430165857, "learning_rate": 3.9094197200817733e-08, "logits/chosen": -2.133240222930908, "logits/rejected": -2.749157667160034, "logps/chosen": -351.77471923828125, "logps/rejected": -2372.89208984375, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1229968070983887, "rewards/margins": 20.204957962036133, "rewards/rejected": -23.327953338623047, "step": 4580 }, { "epoch": 0.6495895839230116, "grad_norm": 0.047243275053731416, "learning_rate": 3.893693977040415e-08, "logits/chosen": -2.189591646194458, "logits/rejected": -2.7078516483306885, "logps/chosen": -369.40667724609375, "logps/rejected": -2418.27587890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.304399013519287, "rewards/margins": 20.47203826904297, "rewards/rejected": -23.776439666748047, "step": 4590 }, { "epoch": 0.6510048117746957, "grad_norm": 17.362707590325936, "learning_rate": 3.877968233999056e-08, "logits/chosen": -2.1642465591430664, "logits/rejected": -2.753502607345581, "logps/chosen": -468.5284118652344, "logps/rejected": -2702.982666015625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -4.280177116394043, "rewards/margins": 22.320446014404297, "rewards/rejected": -26.60062599182129, "step": 4600 }, { "epoch": 0.6524200396263798, "grad_norm": 20.0979147419079, "learning_rate": 3.8622424909576976e-08, "logits/chosen": -2.1345951557159424, "logits/rejected": -2.784156322479248, "logps/chosen": -413.3033142089844, "logps/rejected": -2424.0, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -3.758263111114502, "rewards/margins": 20.101428985595703, "rewards/rejected": -23.859691619873047, "step": 4610 }, { "epoch": 0.653835267478064, "grad_norm": 67.1256820773158, "learning_rate": 3.846516747916339e-08, "logits/chosen": -2.183737277984619, "logits/rejected": -2.75423002243042, "logps/chosen": -549.2440185546875, "logps/rejected": -2891.803466796875, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -5.100428581237793, "rewards/margins": 23.362957000732422, "rewards/rejected": -28.4633846282959, "step": 4620 }, { "epoch": 0.6552504953297481, "grad_norm": 0.24014663694352925, "learning_rate": 3.83079100487498e-08, "logits/chosen": -2.0867128372192383, "logits/rejected": -2.675804853439331, "logps/chosen": -428.4165954589844, "logps/rejected": -2482.215087890625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -3.8920810222625732, "rewards/margins": 20.529285430908203, "rewards/rejected": -24.421367645263672, "step": 4630 }, { "epoch": 0.6566657231814322, "grad_norm": 0.33654880958530003, "learning_rate": 3.815065261833621e-08, "logits/chosen": -2.2239558696746826, "logits/rejected": -2.8264288902282715, "logps/chosen": -553.8880615234375, "logps/rejected": -2763.525390625, "loss": 0.0175, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.139135360717773, "rewards/margins": 22.087474822998047, "rewards/rejected": -27.226612091064453, "step": 4640 }, { "epoch": 0.6580809510331164, "grad_norm": 0.05369733297264345, "learning_rate": 3.799339518792263e-08, "logits/chosen": -2.1674022674560547, "logits/rejected": -2.816742420196533, "logps/chosen": -471.7744140625, "logps/rejected": -2664.063720703125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -4.327274322509766, "rewards/margins": 21.92813491821289, "rewards/rejected": -26.25541114807129, "step": 4650 }, { "epoch": 0.6594961788848005, "grad_norm": 2.223413577757982, "learning_rate": 3.783613775750904e-08, "logits/chosen": -2.225989818572998, "logits/rejected": -2.789524555206299, "logps/chosen": -483.2339782714844, "logps/rejected": -2837.98583984375, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -4.442988395690918, "rewards/margins": 23.50494956970215, "rewards/rejected": -27.94793701171875, "step": 4660 }, { "epoch": 0.6609114067364845, "grad_norm": 35.019206871171306, "learning_rate": 3.7678880327095455e-08, "logits/chosen": -2.15388822555542, "logits/rejected": -2.781710147857666, "logps/chosen": -357.7759704589844, "logps/rejected": -2566.62548828125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.2016372680664062, "rewards/margins": 22.029022216796875, "rewards/rejected": -25.230661392211914, "step": 4670 }, { "epoch": 0.6623266345881687, "grad_norm": 2.4300569227843836, "learning_rate": 3.7521622896681865e-08, "logits/chosen": -2.1381940841674805, "logits/rejected": -2.7122230529785156, "logps/chosen": -484.00262451171875, "logps/rejected": -2472.81591796875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -4.437803745269775, "rewards/margins": 19.878238677978516, "rewards/rejected": -24.3160400390625, "step": 4680 }, { "epoch": 0.6637418624398528, "grad_norm": 0.7337332718667464, "learning_rate": 3.7364365466268275e-08, "logits/chosen": -2.1421360969543457, "logits/rejected": -2.7459487915039062, "logps/chosen": -445.8492126464844, "logps/rejected": -2538.99169921875, "loss": 0.0174, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.050168037414551, "rewards/margins": 20.896228790283203, "rewards/rejected": -24.946399688720703, "step": 4690 }, { "epoch": 0.6651570902915369, "grad_norm": 0.039295710768818534, "learning_rate": 3.72071080358547e-08, "logits/chosen": -2.1227359771728516, "logits/rejected": -2.6986260414123535, "logps/chosen": -451.9212951660156, "logps/rejected": -2827.37353515625, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -4.116551399230957, "rewards/margins": 23.708566665649414, "rewards/rejected": -27.825115203857422, "step": 4700 }, { "epoch": 0.666572318143221, "grad_norm": 6.474318762420991, "learning_rate": 3.704985060544111e-08, "logits/chosen": -2.1248042583465576, "logits/rejected": -2.763118028640747, "logps/chosen": -473.68865966796875, "logps/rejected": -2511.98291015625, "loss": 0.0219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.352753639221191, "rewards/margins": 20.367938995361328, "rewards/rejected": -24.720691680908203, "step": 4710 }, { "epoch": 0.6679875459949052, "grad_norm": 0.17416756884428744, "learning_rate": 3.689259317502752e-08, "logits/chosen": -2.034395456314087, "logits/rejected": -2.6922085285186768, "logps/chosen": -427.30419921875, "logps/rejected": -2532.0712890625, "loss": 0.0248, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8897666931152344, "rewards/margins": 21.01900291442871, "rewards/rejected": -24.908771514892578, "step": 4720 }, { "epoch": 0.6694027738465893, "grad_norm": 15.520462221551007, "learning_rate": 3.6735335744613934e-08, "logits/chosen": -2.159655809402466, "logits/rejected": -2.8050224781036377, "logps/chosen": -510.55029296875, "logps/rejected": -2697.89794921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.696112155914307, "rewards/margins": 21.85150146484375, "rewards/rejected": -26.5476131439209, "step": 4730 }, { "epoch": 0.6708180016982734, "grad_norm": 0.8437152079394965, "learning_rate": 3.6578078314200344e-08, "logits/chosen": -2.1397390365600586, "logits/rejected": -2.807523250579834, "logps/chosen": -414.84027099609375, "logps/rejected": -2670.23583984375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.7671523094177246, "rewards/margins": 22.497682571411133, "rewards/rejected": -26.264835357666016, "step": 4740 }, { "epoch": 0.6722332295499576, "grad_norm": 1.5823544329655026, "learning_rate": 3.642082088378676e-08, "logits/chosen": -2.193114757537842, "logits/rejected": -2.779843807220459, "logps/chosen": -472.98614501953125, "logps/rejected": -2829.98876953125, "loss": 0.0203, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.3104658126831055, "rewards/margins": 23.5538330078125, "rewards/rejected": -27.864299774169922, "step": 4750 }, { "epoch": 0.6736484574016417, "grad_norm": 0.27871245340077777, "learning_rate": 3.626356345337317e-08, "logits/chosen": -2.0928502082824707, "logits/rejected": -2.688436508178711, "logps/chosen": -449.1346130371094, "logps/rejected": -2529.044189453125, "loss": 0.0215, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.082625865936279, "rewards/margins": 20.764589309692383, "rewards/rejected": -24.847217559814453, "step": 4760 }, { "epoch": 0.6750636852533258, "grad_norm": 0.11450793318637835, "learning_rate": 3.610630602295958e-08, "logits/chosen": -2.1711480617523193, "logits/rejected": -2.8058762550354004, "logps/chosen": -500.1576232910156, "logps/rejected": -2831.246826171875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.621743202209473, "rewards/margins": 23.2445011138916, "rewards/rejected": -27.86624526977539, "step": 4770 }, { "epoch": 0.67647891310501, "grad_norm": 232.81315392780363, "learning_rate": 3.5949048592545996e-08, "logits/chosen": -2.173741102218628, "logits/rejected": -2.774247646331787, "logps/chosen": -517.552978515625, "logps/rejected": -2631.00537109375, "loss": 0.0369, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.792543411254883, "rewards/margins": 21.09988021850586, "rewards/rejected": -25.892425537109375, "step": 4780 }, { "epoch": 0.677894140956694, "grad_norm": 3.3686729330178573, "learning_rate": 3.5791791162132406e-08, "logits/chosen": -2.146901845932007, "logits/rejected": -2.833010196685791, "logps/chosen": -439.4197692871094, "logps/rejected": -2608.911376953125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -3.994070053100586, "rewards/margins": 21.642189025878906, "rewards/rejected": -25.63625717163086, "step": 4790 }, { "epoch": 0.6793093688083781, "grad_norm": 246.71439145905674, "learning_rate": 3.563453373171882e-08, "logits/chosen": -2.076444149017334, "logits/rejected": -2.6905746459960938, "logps/chosen": -418.1537170410156, "logps/rejected": -2462.598388671875, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": -3.7866413593292236, "rewards/margins": 20.4066219329834, "rewards/rejected": -24.193262100219727, "step": 4800 }, { "epoch": 0.6807245966600622, "grad_norm": 0.09889712557321303, "learning_rate": 3.547727630130524e-08, "logits/chosen": -2.0740907192230225, "logits/rejected": -2.6846237182617188, "logps/chosen": -342.1187744140625, "logps/rejected": -2232.548583984375, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.037769317626953, "rewards/margins": 18.865291595458984, "rewards/rejected": -21.903060913085938, "step": 4810 }, { "epoch": 0.6821398245117464, "grad_norm": 0.42088792860438684, "learning_rate": 3.532001887089165e-08, "logits/chosen": -2.1514015197753906, "logits/rejected": -2.763117790222168, "logps/chosen": -407.44073486328125, "logps/rejected": -2591.51123046875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.689124345779419, "rewards/margins": 21.77523422241211, "rewards/rejected": -25.464359283447266, "step": 4820 }, { "epoch": 0.6835550523634305, "grad_norm": 4.22156873175335, "learning_rate": 3.516276144047806e-08, "logits/chosen": -2.188339948654175, "logits/rejected": -2.681401491165161, "logps/chosen": -436.4667053222656, "logps/rejected": -2428.57568359375, "loss": 0.0029, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9834511280059814, "rewards/margins": 19.90400505065918, "rewards/rejected": -23.887454986572266, "step": 4830 }, { "epoch": 0.6849702802151146, "grad_norm": 0.10268673535196886, "learning_rate": 3.5005504010064475e-08, "logits/chosen": -2.1399319171905518, "logits/rejected": -2.7415764331817627, "logps/chosen": -400.07928466796875, "logps/rejected": -2593.196044921875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.6077914237976074, "rewards/margins": 21.92772674560547, "rewards/rejected": -25.5355167388916, "step": 4840 }, { "epoch": 0.6863855080667988, "grad_norm": 0.7911071115567283, "learning_rate": 3.4848246579650885e-08, "logits/chosen": -2.1802406311035156, "logits/rejected": -2.748117685317993, "logps/chosen": -401.6334533691406, "logps/rejected": -2424.3251953125, "loss": 0.0143, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6180896759033203, "rewards/margins": 20.2027587890625, "rewards/rejected": -23.820846557617188, "step": 4850 }, { "epoch": 0.6878007359184829, "grad_norm": 6.777319566021177, "learning_rate": 3.46909891492373e-08, "logits/chosen": -2.1977226734161377, "logits/rejected": -2.801760196685791, "logps/chosen": -463.61212158203125, "logps/rejected": -2696.5869140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.245264530181885, "rewards/margins": 22.273788452148438, "rewards/rejected": -26.519052505493164, "step": 4860 }, { "epoch": 0.689215963770167, "grad_norm": 4.173383676070576, "learning_rate": 3.453373171882371e-08, "logits/chosen": -2.273566484451294, "logits/rejected": -2.7987303733825684, "logps/chosen": -571.9710693359375, "logps/rejected": -2758.0205078125, "loss": 0.0137, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.293512344360352, "rewards/margins": 21.874406814575195, "rewards/rejected": -27.167919158935547, "step": 4870 }, { "epoch": 0.6906311916218512, "grad_norm": 8.523892947161983, "learning_rate": 3.437647428841012e-08, "logits/chosen": -2.192385673522949, "logits/rejected": -2.8354320526123047, "logps/chosen": -423.11761474609375, "logps/rejected": -2705.10205078125, "loss": 0.0075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.855440616607666, "rewards/margins": 22.770709991455078, "rewards/rejected": -26.62615394592285, "step": 4880 }, { "epoch": 0.6920464194735353, "grad_norm": 3.2227019099592766, "learning_rate": 3.4219216857996544e-08, "logits/chosen": -2.1337101459503174, "logits/rejected": -2.7762322425842285, "logps/chosen": -401.63433837890625, "logps/rejected": -2602.624755859375, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -3.630293607711792, "rewards/margins": 21.95364761352539, "rewards/rejected": -25.583940505981445, "step": 4890 }, { "epoch": 0.6934616473252193, "grad_norm": 460.66968001360397, "learning_rate": 3.4061959427582954e-08, "logits/chosen": -2.244415283203125, "logits/rejected": -2.803194761276245, "logps/chosen": -427.8316955566406, "logps/rejected": -2595.56298828125, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -3.868039608001709, "rewards/margins": 21.684207916259766, "rewards/rejected": -25.55224609375, "step": 4900 }, { "epoch": 0.6948768751769034, "grad_norm": 11.465932587260157, "learning_rate": 3.3904701997169364e-08, "logits/chosen": -2.222407102584839, "logits/rejected": -2.894246816635132, "logps/chosen": -445.0523376464844, "logps/rejected": -2674.949951171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.067269325256348, "rewards/margins": 22.26643943786621, "rewards/rejected": -26.33370590209961, "step": 4910 }, { "epoch": 0.6962921030285876, "grad_norm": 0.8808469863846483, "learning_rate": 3.374744456675578e-08, "logits/chosen": -2.2150371074676514, "logits/rejected": -2.791086196899414, "logps/chosen": -431.4893493652344, "logps/rejected": -2578.240234375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -3.9218688011169434, "rewards/margins": 21.470722198486328, "rewards/rejected": -25.39259147644043, "step": 4920 }, { "epoch": 0.6977073308802717, "grad_norm": 0.35014222694281705, "learning_rate": 3.359018713634219e-08, "logits/chosen": -2.222144842147827, "logits/rejected": -2.8590171337127686, "logps/chosen": -367.0585632324219, "logps/rejected": -2589.51806640625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.273340940475464, "rewards/margins": 22.200210571289062, "rewards/rejected": -25.473552703857422, "step": 4930 }, { "epoch": 0.6991225587319558, "grad_norm": 20.93237576995447, "learning_rate": 3.34329297059286e-08, "logits/chosen": -2.2206740379333496, "logits/rejected": -2.9194743633270264, "logps/chosen": -529.857421875, "logps/rejected": -2812.49853515625, "loss": 0.0216, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.902166843414307, "rewards/margins": 22.82303237915039, "rewards/rejected": -27.72519874572754, "step": 4940 }, { "epoch": 0.70053778658364, "grad_norm": 0.8136443282483564, "learning_rate": 3.3275672275515017e-08, "logits/chosen": -2.2503151893615723, "logits/rejected": -2.880488395690918, "logps/chosen": -407.0754699707031, "logps/rejected": -2730.633056640625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -3.686631679534912, "rewards/margins": 23.180862426757812, "rewards/rejected": -26.867496490478516, "step": 4950 }, { "epoch": 0.7019530144353241, "grad_norm": 4.248392901272918, "learning_rate": 3.3118414845101426e-08, "logits/chosen": -2.224091053009033, "logits/rejected": -2.882352113723755, "logps/chosen": -377.22381591796875, "logps/rejected": -2651.075439453125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.396514892578125, "rewards/margins": 22.707164764404297, "rewards/rejected": -26.10367774963379, "step": 4960 }, { "epoch": 0.7033682422870082, "grad_norm": 0.16861531910149458, "learning_rate": 3.296115741468784e-08, "logits/chosen": -2.334170341491699, "logits/rejected": -2.9889726638793945, "logps/chosen": -451.17352294921875, "logps/rejected": -2813.073486328125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.107717037200928, "rewards/margins": 23.58891487121582, "rewards/rejected": -27.69663429260254, "step": 4970 }, { "epoch": 0.7047834701386924, "grad_norm": 0.04724597142258426, "learning_rate": 3.280389998427426e-08, "logits/chosen": -2.3433706760406494, "logits/rejected": -2.967660903930664, "logps/chosen": -540.9269409179688, "logps/rejected": -2780.67626953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.013993740081787, "rewards/margins": 22.393985748291016, "rewards/rejected": -27.407978057861328, "step": 4980 }, { "epoch": 0.7061986979903765, "grad_norm": 28.10882474467723, "learning_rate": 3.264664255386067e-08, "logits/chosen": -2.251415967941284, "logits/rejected": -2.899024486541748, "logps/chosen": -352.07232666015625, "logps/rejected": -2425.483154296875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.1426894664764404, "rewards/margins": 20.714780807495117, "rewards/rejected": -23.857471466064453, "step": 4990 }, { "epoch": 0.7076139258420606, "grad_norm": 466.89885559310466, "learning_rate": 3.2489385123447086e-08, "logits/chosen": -2.351824998855591, "logits/rejected": -2.9308762550354004, "logps/chosen": -360.3626403808594, "logps/rejected": -2535.707763671875, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -3.2161495685577393, "rewards/margins": 21.756999969482422, "rewards/rejected": -24.9731502532959, "step": 5000 }, { "epoch": 0.7090291536937446, "grad_norm": 16.59909790375792, "learning_rate": 3.2332127693033495e-08, "logits/chosen": -2.3165431022644043, "logits/rejected": -2.9376912117004395, "logps/chosen": -426.4134216308594, "logps/rejected": -2904.040771484375, "loss": 0.0192, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.864319324493408, "rewards/margins": 24.709056854248047, "rewards/rejected": -28.573373794555664, "step": 5010 }, { "epoch": 0.7104443815454288, "grad_norm": 0.16073277151899285, "learning_rate": 3.2174870262619905e-08, "logits/chosen": -2.3099637031555176, "logits/rejected": -2.924008369445801, "logps/chosen": -422.72393798828125, "logps/rejected": -2612.21142578125, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -3.8287925720214844, "rewards/margins": 21.85521697998047, "rewards/rejected": -25.684009552001953, "step": 5020 }, { "epoch": 0.7118596093971129, "grad_norm": 10.014767496226812, "learning_rate": 3.201761283220632e-08, "logits/chosen": -2.3482778072357178, "logits/rejected": -2.9404869079589844, "logps/chosen": -427.9723205566406, "logps/rejected": -2819.8427734375, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -3.8759055137634277, "rewards/margins": 23.8785343170166, "rewards/rejected": -27.754436492919922, "step": 5030 }, { "epoch": 0.713274837248797, "grad_norm": 1.2705831335581697, "learning_rate": 3.186035540179273e-08, "logits/chosen": -2.3337202072143555, "logits/rejected": -2.906454086303711, "logps/chosen": -492.93658447265625, "logps/rejected": -2744.57958984375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.53847599029541, "rewards/margins": 22.481287002563477, "rewards/rejected": -27.019763946533203, "step": 5040 }, { "epoch": 0.7146900651004812, "grad_norm": 0.012358507044096592, "learning_rate": 3.170309797137914e-08, "logits/chosen": -2.3666563034057617, "logits/rejected": -3.0064234733581543, "logps/chosen": -492.58319091796875, "logps/rejected": -2882.227783203125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -4.5360541343688965, "rewards/margins": 23.848506927490234, "rewards/rejected": -28.384563446044922, "step": 5050 }, { "epoch": 0.7161052929521653, "grad_norm": 68.96952891456655, "learning_rate": 3.154584054096556e-08, "logits/chosen": -2.2814993858337402, "logits/rejected": -2.9624009132385254, "logps/chosen": -403.7789001464844, "logps/rejected": -2879.290283203125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -3.6549384593963623, "rewards/margins": 24.7034854888916, "rewards/rejected": -28.358423233032227, "step": 5060 }, { "epoch": 0.7175205208038494, "grad_norm": 64.14977877413924, "learning_rate": 3.138858311055197e-08, "logits/chosen": -2.306459665298462, "logits/rejected": -2.9638750553131104, "logps/chosen": -404.8154296875, "logps/rejected": -2726.466796875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -3.6637699604034424, "rewards/margins": 23.223220825195312, "rewards/rejected": -26.88698959350586, "step": 5070 }, { "epoch": 0.7189357486555336, "grad_norm": 190.3823008760319, "learning_rate": 3.1231325680138384e-08, "logits/chosen": -2.2904200553894043, "logits/rejected": -2.992730140686035, "logps/chosen": -410.12939453125, "logps/rejected": -3051.2236328125, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -3.712137222290039, "rewards/margins": 26.349899291992188, "rewards/rejected": -30.06203269958496, "step": 5080 }, { "epoch": 0.7203509765072177, "grad_norm": 1.8666653272072644, "learning_rate": 3.10740682497248e-08, "logits/chosen": -2.349071979522705, "logits/rejected": -2.924166440963745, "logps/chosen": -386.9004821777344, "logps/rejected": -2505.014404296875, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -3.4915542602539062, "rewards/margins": 21.148162841796875, "rewards/rejected": -24.63971519470215, "step": 5090 }, { "epoch": 0.7217662043589018, "grad_norm": 735.5761595040998, "learning_rate": 3.091681081931121e-08, "logits/chosen": -2.366877317428589, "logits/rejected": -2.9681787490844727, "logps/chosen": -378.09698486328125, "logps/rejected": -2730.702880859375, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -3.391653537750244, "rewards/margins": 23.516271591186523, "rewards/rejected": -26.90792465209961, "step": 5100 }, { "epoch": 0.723181432210586, "grad_norm": 0.544661320306397, "learning_rate": 3.075955338889763e-08, "logits/chosen": -2.30947208404541, "logits/rejected": -2.9332175254821777, "logps/chosen": -377.777587890625, "logps/rejected": -2614.44580078125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.407829999923706, "rewards/margins": 22.31915855407715, "rewards/rejected": -25.726985931396484, "step": 5110 }, { "epoch": 0.72459666006227, "grad_norm": 14.95515025443874, "learning_rate": 3.060229595848404e-08, "logits/chosen": -2.3666181564331055, "logits/rejected": -2.939488410949707, "logps/chosen": -450.01910400390625, "logps/rejected": -2968.380126953125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -4.127093315124512, "rewards/margins": 25.092926025390625, "rewards/rejected": -29.220016479492188, "step": 5120 }, { "epoch": 0.7260118879139541, "grad_norm": 25.863039343973615, "learning_rate": 3.0445038528070447e-08, "logits/chosen": -2.3827500343322754, "logits/rejected": -2.9907479286193848, "logps/chosen": -465.19183349609375, "logps/rejected": -2745.38525390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.2607831954956055, "rewards/margins": 22.808359146118164, "rewards/rejected": -27.069141387939453, "step": 5130 }, { "epoch": 0.7274271157656382, "grad_norm": 0.07832888167536525, "learning_rate": 3.028778109765686e-08, "logits/chosen": -2.3784501552581787, "logits/rejected": -3.0328023433685303, "logps/chosen": -464.7734375, "logps/rejected": -2991.91259765625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.249230861663818, "rewards/margins": 25.255619049072266, "rewards/rejected": -29.50484848022461, "step": 5140 }, { "epoch": 0.7288423436173224, "grad_norm": 0.14633145436970119, "learning_rate": 3.013052366724327e-08, "logits/chosen": -2.3322761058807373, "logits/rejected": -2.9551894664764404, "logps/chosen": -402.8583984375, "logps/rejected": -2730.374755859375, "loss": 0.0381, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.635976791381836, "rewards/margins": 23.253164291381836, "rewards/rejected": -26.889141082763672, "step": 5150 }, { "epoch": 0.7302575714690065, "grad_norm": 3.3929394278397313, "learning_rate": 2.997326623682969e-08, "logits/chosen": -2.380861759185791, "logits/rejected": -3.0350773334503174, "logps/chosen": -358.6297912597656, "logps/rejected": -2586.890869140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.195507049560547, "rewards/margins": 22.26183319091797, "rewards/rejected": -25.45734214782715, "step": 5160 }, { "epoch": 0.7316727993206906, "grad_norm": 93.3363740670622, "learning_rate": 2.9816008806416106e-08, "logits/chosen": -2.3878884315490723, "logits/rejected": -2.967102527618408, "logps/chosen": -456.681640625, "logps/rejected": -2759.332763671875, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.183333873748779, "rewards/margins": 22.987262725830078, "rewards/rejected": -27.170597076416016, "step": 5170 }, { "epoch": 0.7330880271723748, "grad_norm": 48.31758104524267, "learning_rate": 2.9658751376002516e-08, "logits/chosen": -2.425957202911377, "logits/rejected": -3.1886610984802246, "logps/chosen": -393.4801330566406, "logps/rejected": -2761.158203125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.556442975997925, "rewards/margins": 23.658037185668945, "rewards/rejected": -27.2144775390625, "step": 5180 }, { "epoch": 0.7345032550240589, "grad_norm": 39.15673643770772, "learning_rate": 2.950149394558893e-08, "logits/chosen": -2.4541964530944824, "logits/rejected": -3.05249285697937, "logps/chosen": -511.9840393066406, "logps/rejected": -2844.85693359375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.724320411682129, "rewards/margins": 23.323705673217773, "rewards/rejected": -28.048025131225586, "step": 5190 }, { "epoch": 0.735918482875743, "grad_norm": 8.440665039096011, "learning_rate": 2.9344236515175342e-08, "logits/chosen": -2.4323296546936035, "logits/rejected": -3.0311174392700195, "logps/chosen": -463.8216247558594, "logps/rejected": -2971.464111328125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -4.234476566314697, "rewards/margins": 25.034326553344727, "rewards/rejected": -29.2688045501709, "step": 5200 }, { "epoch": 0.7373337107274271, "grad_norm": 410.58134536700277, "learning_rate": 2.9186979084761755e-08, "logits/chosen": -2.4116714000701904, "logits/rejected": -2.993337392807007, "logps/chosen": -348.43182373046875, "logps/rejected": -2656.684326171875, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -3.0938401222229004, "rewards/margins": 23.056177139282227, "rewards/rejected": -26.150014877319336, "step": 5210 }, { "epoch": 0.7387489385791113, "grad_norm": 0.5158216647429005, "learning_rate": 2.9029721654348165e-08, "logits/chosen": -2.380267858505249, "logits/rejected": -3.1247010231018066, "logps/chosen": -478.3251953125, "logps/rejected": -3221.777099609375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.383706092834473, "rewards/margins": 27.388015747070312, "rewards/rejected": -31.771724700927734, "step": 5220 }, { "epoch": 0.7401641664307954, "grad_norm": 2.449866599789229, "learning_rate": 2.8872464223934578e-08, "logits/chosen": -2.418041467666626, "logits/rejected": -3.11387038230896, "logps/chosen": -453.64459228515625, "logps/rejected": -3168.603271484375, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -4.1503753662109375, "rewards/margins": 27.1077823638916, "rewards/rejected": -31.258153915405273, "step": 5230 }, { "epoch": 0.7415793942824794, "grad_norm": 0.005879788026140351, "learning_rate": 2.871520679352099e-08, "logits/chosen": -2.394517421722412, "logits/rejected": -3.100220203399658, "logps/chosen": -472.73797607421875, "logps/rejected": -2855.99560546875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -4.338586807250977, "rewards/margins": 23.789546966552734, "rewards/rejected": -28.12813377380371, "step": 5240 }, { "epoch": 0.7429946221341636, "grad_norm": 0.4861176988951499, "learning_rate": 2.8557949363107404e-08, "logits/chosen": -2.415278911590576, "logits/rejected": -3.1278369426727295, "logps/chosen": -466.6405334472656, "logps/rejected": -2990.319091796875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.274433135986328, "rewards/margins": 25.200428009033203, "rewards/rejected": -29.474863052368164, "step": 5250 }, { "epoch": 0.7444098499858477, "grad_norm": 15.105582546766108, "learning_rate": 2.840069193269382e-08, "logits/chosen": -2.4212241172790527, "logits/rejected": -3.1449739933013916, "logps/chosen": -464.44049072265625, "logps/rejected": -3099.40234375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.245061874389648, "rewards/margins": 26.344348907470703, "rewards/rejected": -30.58941078186035, "step": 5260 }, { "epoch": 0.7458250778375318, "grad_norm": 11.051442576055877, "learning_rate": 2.8243434502280234e-08, "logits/chosen": -2.416017770767212, "logits/rejected": -3.0751802921295166, "logps/chosen": -472.03277587890625, "logps/rejected": -2979.28271484375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -4.341922283172607, "rewards/margins": 25.03268814086914, "rewards/rejected": -29.374608993530273, "step": 5270 }, { "epoch": 0.747240305689216, "grad_norm": 0.15751300482131067, "learning_rate": 2.8086177071866647e-08, "logits/chosen": -2.358640670776367, "logits/rejected": -3.074953317642212, "logps/chosen": -442.60955810546875, "logps/rejected": -3074.942138671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.056474208831787, "rewards/margins": 26.2812442779541, "rewards/rejected": -30.337717056274414, "step": 5280 }, { "epoch": 0.7486555335409001, "grad_norm": 0.022120832208476195, "learning_rate": 2.7928919641453057e-08, "logits/chosen": -2.4503417015075684, "logits/rejected": -3.1402549743652344, "logps/chosen": -453.72442626953125, "logps/rejected": -2933.64208984375, "loss": 0.005, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.1535563468933105, "rewards/margins": 24.779117584228516, "rewards/rejected": -28.932674407958984, "step": 5290 }, { "epoch": 0.7500707613925842, "grad_norm": 1.5683811279166215, "learning_rate": 2.777166221103947e-08, "logits/chosen": -2.4732229709625244, "logits/rejected": -3.043804168701172, "logps/chosen": -625.1907958984375, "logps/rejected": -3260.33251953125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -5.857964515686035, "rewards/margins": 26.329784393310547, "rewards/rejected": -32.187747955322266, "step": 5300 }, { "epoch": 0.7514859892442683, "grad_norm": 1.4231804744063978, "learning_rate": 2.7614404780625883e-08, "logits/chosen": -2.3933157920837402, "logits/rejected": -3.0878701210021973, "logps/chosen": -498.7466735839844, "logps/rejected": -2857.7822265625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.599503993988037, "rewards/margins": 23.54153823852539, "rewards/rejected": -28.141040802001953, "step": 5310 }, { "epoch": 0.7529012170959525, "grad_norm": 0.07392163813122665, "learning_rate": 2.7457147350212296e-08, "logits/chosen": -2.4756124019622803, "logits/rejected": -3.119912624359131, "logps/chosen": -393.8204040527344, "logps/rejected": -2821.778076171875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -3.545227527618408, "rewards/margins": 24.281009674072266, "rewards/rejected": -27.82623863220215, "step": 5320 }, { "epoch": 0.7543164449476366, "grad_norm": 10.334838567675025, "learning_rate": 2.7299889919798706e-08, "logits/chosen": -2.3837931156158447, "logits/rejected": -3.011298179626465, "logps/chosen": -560.7975463867188, "logps/rejected": -2958.85498046875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -5.201260566711426, "rewards/margins": 23.962474822998047, "rewards/rejected": -29.16373634338379, "step": 5330 }, { "epoch": 0.7557316727993207, "grad_norm": 0.40310141408692696, "learning_rate": 2.714263248938512e-08, "logits/chosen": -2.395378828048706, "logits/rejected": -3.029428720474243, "logps/chosen": -416.8080139160156, "logps/rejected": -2885.09716796875, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -3.789524793624878, "rewards/margins": 24.629037857055664, "rewards/rejected": -28.418563842773438, "step": 5340 }, { "epoch": 0.7571469006510048, "grad_norm": 5.031936488880312, "learning_rate": 2.698537505897154e-08, "logits/chosen": -2.42699933052063, "logits/rejected": -3.0644140243530273, "logps/chosen": -418.76123046875, "logps/rejected": -2736.560791015625, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7783782482147217, "rewards/margins": 23.15431022644043, "rewards/rejected": -26.932687759399414, "step": 5350 }, { "epoch": 0.7585621285026889, "grad_norm": 0.013688550489522739, "learning_rate": 2.682811762855795e-08, "logits/chosen": -2.3755650520324707, "logits/rejected": -3.043818473815918, "logps/chosen": -379.75225830078125, "logps/rejected": -2800.75048828125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.397148847579956, "rewards/margins": 24.180435180664062, "rewards/rejected": -27.57758140563965, "step": 5360 }, { "epoch": 0.759977356354373, "grad_norm": 0.9443458557152, "learning_rate": 2.6670860198144362e-08, "logits/chosen": -2.389136552810669, "logits/rejected": -2.993366003036499, "logps/chosen": -346.0116271972656, "logps/rejected": -2508.891845703125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -3.0821425914764404, "rewards/margins": 21.593486785888672, "rewards/rejected": -24.675628662109375, "step": 5370 }, { "epoch": 0.7613925842060572, "grad_norm": 0.16072812672400852, "learning_rate": 2.6513602767730775e-08, "logits/chosen": -2.4827589988708496, "logits/rejected": -3.0742974281311035, "logps/chosen": -393.43255615234375, "logps/rejected": -2707.121826171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.5374832153320312, "rewards/margins": 23.08511734008789, "rewards/rejected": -26.62259864807129, "step": 5380 }, { "epoch": 0.7628078120577413, "grad_norm": 1.537685246652549, "learning_rate": 2.635634533731719e-08, "logits/chosen": -2.542954921722412, "logits/rejected": -3.0591468811035156, "logps/chosen": -393.53936767578125, "logps/rejected": -2737.49267578125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -3.5219109058380127, "rewards/margins": 23.4456729888916, "rewards/rejected": -26.967580795288086, "step": 5390 }, { "epoch": 0.7642230399094254, "grad_norm": 0.7102216440935567, "learning_rate": 2.6199087906903598e-08, "logits/chosen": -2.3727755546569824, "logits/rejected": -3.0708107948303223, "logps/chosen": -391.15374755859375, "logps/rejected": -2723.08349609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.5080723762512207, "rewards/margins": 23.307172775268555, "rewards/rejected": -26.815242767333984, "step": 5400 }, { "epoch": 0.7656382677611095, "grad_norm": 1.7087574492203268, "learning_rate": 2.604183047649001e-08, "logits/chosen": -2.389887809753418, "logits/rejected": -2.957799196243286, "logps/chosen": -442.97943115234375, "logps/rejected": -2480.927001953125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -4.030056476593018, "rewards/margins": 20.394256591796875, "rewards/rejected": -24.424314498901367, "step": 5410 }, { "epoch": 0.7670534956127937, "grad_norm": 339.48154967976086, "learning_rate": 2.5884573046076425e-08, "logits/chosen": -2.403367042541504, "logits/rejected": -3.024543285369873, "logps/chosen": -370.25701904296875, "logps/rejected": -2426.62939453125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.324413776397705, "rewards/margins": 20.54647445678711, "rewards/rejected": -23.87088966369629, "step": 5420 }, { "epoch": 0.7684687234644778, "grad_norm": 0.0038543186079105347, "learning_rate": 2.5727315615662838e-08, "logits/chosen": -2.361445665359497, "logits/rejected": -2.9870285987854004, "logps/chosen": -423.522216796875, "logps/rejected": -2807.713134765625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.8397057056427, "rewards/margins": 23.793851852416992, "rewards/rejected": -27.633556365966797, "step": 5430 }, { "epoch": 0.7698839513161619, "grad_norm": 0.07061559790134765, "learning_rate": 2.557005818524925e-08, "logits/chosen": -2.4181323051452637, "logits/rejected": -3.0983800888061523, "logps/chosen": -386.98236083984375, "logps/rejected": -2714.745849609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.48588228225708, "rewards/margins": 23.244705200195312, "rewards/rejected": -26.7305850982666, "step": 5440 }, { "epoch": 0.7712991791678461, "grad_norm": 1.0542782227344882, "learning_rate": 2.5412800754835667e-08, "logits/chosen": -2.402047634124756, "logits/rejected": -3.139467239379883, "logps/chosen": -418.95050048828125, "logps/rejected": -2894.5771484375, "loss": 0.003, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7945194244384766, "rewards/margins": 24.730417251586914, "rewards/rejected": -28.52493667602539, "step": 5450 }, { "epoch": 0.7727144070195301, "grad_norm": 0.2779243845249537, "learning_rate": 2.525554332442208e-08, "logits/chosen": -2.4830567836761475, "logits/rejected": -3.0577304363250732, "logps/chosen": -481.2237243652344, "logps/rejected": -2799.615234375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.424283504486084, "rewards/margins": 23.140228271484375, "rewards/rejected": -27.56451416015625, "step": 5460 }, { "epoch": 0.7741296348712142, "grad_norm": 0.001614903853639071, "learning_rate": 2.509828589400849e-08, "logits/chosen": -2.4319307804107666, "logits/rejected": -3.0982625484466553, "logps/chosen": -420.0403747558594, "logps/rejected": -2957.39990234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.809900999069214, "rewards/margins": 25.35227394104004, "rewards/rejected": -29.162174224853516, "step": 5470 }, { "epoch": 0.7755448627228984, "grad_norm": 0.11287990475571039, "learning_rate": 2.4941028463594903e-08, "logits/chosen": -2.4724059104919434, "logits/rejected": -3.143155336380005, "logps/chosen": -499.76751708984375, "logps/rejected": -3009.125244140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.603152275085449, "rewards/margins": 25.053966522216797, "rewards/rejected": -29.657119750976562, "step": 5480 }, { "epoch": 0.7769600905745825, "grad_norm": 0.0031792887249063773, "learning_rate": 2.4783771033181317e-08, "logits/chosen": -2.4502196311950684, "logits/rejected": -3.106163501739502, "logps/chosen": -452.1151428222656, "logps/rejected": -3046.9921875, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -4.125353813171387, "rewards/margins": 25.910852432250977, "rewards/rejected": -30.036209106445312, "step": 5490 }, { "epoch": 0.7783753184262666, "grad_norm": 21.144600658073347, "learning_rate": 2.462651360276773e-08, "logits/chosen": -2.53572678565979, "logits/rejected": -3.1394474506378174, "logps/chosen": -492.87261962890625, "logps/rejected": -2974.0009765625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.559457302093506, "rewards/margins": 24.780838012695312, "rewards/rejected": -29.34029197692871, "step": 5500 }, { "epoch": 0.7797905462779507, "grad_norm": 688.5132436888534, "learning_rate": 2.4469256172354143e-08, "logits/chosen": -2.505772113800049, "logits/rejected": -3.1205708980560303, "logps/chosen": -438.775146484375, "logps/rejected": -2844.891845703125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -4.0045342445373535, "rewards/margins": 24.01961326599121, "rewards/rejected": -28.024145126342773, "step": 5510 }, { "epoch": 0.7812057741296349, "grad_norm": 2.04527432515602, "learning_rate": 2.4311998741940556e-08, "logits/chosen": -2.5155529975891113, "logits/rejected": -3.146347761154175, "logps/chosen": -471.2928161621094, "logps/rejected": -2901.266357421875, "loss": 0.0215, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.325685977935791, "rewards/margins": 24.312963485717773, "rewards/rejected": -28.63865089416504, "step": 5520 }, { "epoch": 0.782621001981319, "grad_norm": 0.1954249156272008, "learning_rate": 2.415474131152697e-08, "logits/chosen": -2.505908250808716, "logits/rejected": -3.2104477882385254, "logps/chosen": -464.07666015625, "logps/rejected": -2787.186767578125, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.261427879333496, "rewards/margins": 23.21707534790039, "rewards/rejected": -27.478504180908203, "step": 5530 }, { "epoch": 0.7840362298330031, "grad_norm": 0.08130444494358188, "learning_rate": 2.3997483881113382e-08, "logits/chosen": -2.493338108062744, "logits/rejected": -3.1987690925598145, "logps/chosen": -478.85968017578125, "logps/rejected": -2890.828125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -4.397215843200684, "rewards/margins": 24.09793472290039, "rewards/rejected": -28.495153427124023, "step": 5540 }, { "epoch": 0.7854514576846873, "grad_norm": 0.11412751848084568, "learning_rate": 2.3840226450699792e-08, "logits/chosen": -2.473738193511963, "logits/rejected": -3.1367814540863037, "logps/chosen": -501.6346130371094, "logps/rejected": -3343.792236328125, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": -4.605276107788086, "rewards/margins": 28.37369728088379, "rewards/rejected": -32.978973388671875, "step": 5550 }, { "epoch": 0.7868666855363714, "grad_norm": 18.813609380451286, "learning_rate": 2.368296902028621e-08, "logits/chosen": -2.4499051570892334, "logits/rejected": -3.1544482707977295, "logps/chosen": -392.2893981933594, "logps/rejected": -2904.77783203125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.5480456352233887, "rewards/margins": 25.07615852355957, "rewards/rejected": -28.62420654296875, "step": 5560 }, { "epoch": 0.7882819133880554, "grad_norm": 0.04874665021082881, "learning_rate": 2.3525711589872622e-08, "logits/chosen": -2.5106797218322754, "logits/rejected": -3.1626343727111816, "logps/chosen": -476.0611267089844, "logps/rejected": -2882.09521484375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.363162040710449, "rewards/margins": 24.023862838745117, "rewards/rejected": -28.38702392578125, "step": 5570 }, { "epoch": 0.7896971412397396, "grad_norm": 158.82011189909218, "learning_rate": 2.3368454159459035e-08, "logits/chosen": -2.5690979957580566, "logits/rejected": -3.245887279510498, "logps/chosen": -596.3643798828125, "logps/rejected": -3153.947998046875, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.582499980926514, "rewards/margins": 25.578554153442383, "rewards/rejected": -31.161056518554688, "step": 5580 }, { "epoch": 0.7911123690914237, "grad_norm": 17.905996992896082, "learning_rate": 2.3211196729045445e-08, "logits/chosen": -2.501009225845337, "logits/rejected": -3.169318914413452, "logps/chosen": -413.26190185546875, "logps/rejected": -2909.63134765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.7543578147888184, "rewards/margins": 24.922134399414062, "rewards/rejected": -28.676494598388672, "step": 5590 }, { "epoch": 0.7925275969431078, "grad_norm": 195.96545539757918, "learning_rate": 2.3053939298631858e-08, "logits/chosen": -2.505743980407715, "logits/rejected": -3.2110257148742676, "logps/chosen": -520.4005737304688, "logps/rejected": -3019.17333984375, "loss": 0.0174, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.82213830947876, "rewards/margins": 24.970016479492188, "rewards/rejected": -29.79215431213379, "step": 5600 }, { "epoch": 0.793942824794792, "grad_norm": 0.02881136065851408, "learning_rate": 2.2896681868218274e-08, "logits/chosen": -2.4442343711853027, "logits/rejected": -3.12615704536438, "logps/chosen": -418.211669921875, "logps/rejected": -2906.785400390625, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -3.789506196975708, "rewards/margins": 24.857540130615234, "rewards/rejected": -28.647043228149414, "step": 5610 }, { "epoch": 0.7953580526464761, "grad_norm": 0.5762745883464914, "learning_rate": 2.2739424437804684e-08, "logits/chosen": -2.439141035079956, "logits/rejected": -3.053330183029175, "logps/chosen": -378.9190368652344, "logps/rejected": -2538.710693359375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.414896011352539, "rewards/margins": 21.583683013916016, "rewards/rejected": -24.998579025268555, "step": 5620 }, { "epoch": 0.7967732804981602, "grad_norm": 0.0965333727820031, "learning_rate": 2.2582167007391097e-08, "logits/chosen": -2.3790204524993896, "logits/rejected": -3.0169568061828613, "logps/chosen": -377.258056640625, "logps/rejected": -2615.1083984375, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -3.3781723976135254, "rewards/margins": 22.410722732543945, "rewards/rejected": -25.788898468017578, "step": 5630 }, { "epoch": 0.7981885083498443, "grad_norm": 0.012707512235150618, "learning_rate": 2.242490957697751e-08, "logits/chosen": -2.464611530303955, "logits/rejected": -3.105639934539795, "logps/chosen": -389.56231689453125, "logps/rejected": -2530.92138671875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -3.5222811698913574, "rewards/margins": 21.39410972595215, "rewards/rejected": -24.916391372680664, "step": 5640 }, { "epoch": 0.7996037362015285, "grad_norm": 0.0290351496827346, "learning_rate": 2.2267652146563924e-08, "logits/chosen": -2.3964805603027344, "logits/rejected": -3.0239510536193848, "logps/chosen": -363.98175048828125, "logps/rejected": -2758.244384765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2528693675994873, "rewards/margins": 23.903902053833008, "rewards/rejected": -27.15677261352539, "step": 5650 }, { "epoch": 0.8010189640532126, "grad_norm": 0.003854155360336278, "learning_rate": 2.2110394716150337e-08, "logits/chosen": -2.5319371223449707, "logits/rejected": -3.1688613891601562, "logps/chosen": -470.25079345703125, "logps/rejected": -2738.374267578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.316301345825195, "rewards/margins": 22.627601623535156, "rewards/rejected": -26.943904876708984, "step": 5660 }, { "epoch": 0.8024341919048967, "grad_norm": 0.036103804174889814, "learning_rate": 2.195313728573675e-08, "logits/chosen": -2.3937039375305176, "logits/rejected": -3.057990550994873, "logps/chosen": -389.3491516113281, "logps/rejected": -2879.369873046875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.523488998413086, "rewards/margins": 24.839693069458008, "rewards/rejected": -28.363183975219727, "step": 5670 }, { "epoch": 0.8038494197565809, "grad_norm": 421.5675384776544, "learning_rate": 2.1795879855323163e-08, "logits/chosen": -2.508765697479248, "logits/rejected": -3.0653510093688965, "logps/chosen": -421.3018493652344, "logps/rejected": -2614.12841796875, "loss": 0.0077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8311495780944824, "rewards/margins": 21.900224685668945, "rewards/rejected": -25.731372833251953, "step": 5680 }, { "epoch": 0.8052646476082649, "grad_norm": 24.683168028547023, "learning_rate": 2.1638622424909576e-08, "logits/chosen": -2.424180269241333, "logits/rejected": -3.0357260704040527, "logps/chosen": -395.1703796386719, "logps/rejected": -2466.227783203125, "loss": 0.013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.558955669403076, "rewards/margins": 20.702594757080078, "rewards/rejected": -24.261550903320312, "step": 5690 }, { "epoch": 0.806679875459949, "grad_norm": 15.973860215521084, "learning_rate": 2.148136499449599e-08, "logits/chosen": -2.4732117652893066, "logits/rejected": -3.1132616996765137, "logps/chosen": -455.1783142089844, "logps/rejected": -2672.98388671875, "loss": 0.0152, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.170393943786621, "rewards/margins": 22.153480529785156, "rewards/rejected": -26.32387351989746, "step": 5700 }, { "epoch": 0.8080951033116331, "grad_norm": 0.0038859652658068326, "learning_rate": 2.1324107564082402e-08, "logits/chosen": -2.4391837120056152, "logits/rejected": -3.081395387649536, "logps/chosen": -451.08599853515625, "logps/rejected": -2868.46728515625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -4.118927955627441, "rewards/margins": 24.170429229736328, "rewards/rejected": -28.289358139038086, "step": 5710 }, { "epoch": 0.8095103311633173, "grad_norm": 161.13121671525792, "learning_rate": 2.1166850133668816e-08, "logits/chosen": -2.4893500804901123, "logits/rejected": -3.1070961952209473, "logps/chosen": -398.16680908203125, "logps/rejected": -2891.61474609375, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": -3.6010947227478027, "rewards/margins": 24.86907958984375, "rewards/rejected": -28.470172882080078, "step": 5720 }, { "epoch": 0.8109255590150014, "grad_norm": 296.5326536729743, "learning_rate": 2.100959270325523e-08, "logits/chosen": -2.565605640411377, "logits/rejected": -3.1608805656433105, "logps/chosen": -474.08062744140625, "logps/rejected": -2889.138671875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -4.348505020141602, "rewards/margins": 24.11379051208496, "rewards/rejected": -28.462299346923828, "step": 5730 }, { "epoch": 0.8123407868666855, "grad_norm": 0.02794779335085734, "learning_rate": 2.085233527284164e-08, "logits/chosen": -2.5279369354248047, "logits/rejected": -3.209749221801758, "logps/chosen": -592.97265625, "logps/rejected": -2969.0517578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.509655475616455, "rewards/margins": 23.72345542907715, "rewards/rejected": -29.23311424255371, "step": 5740 }, { "epoch": 0.8137560147183697, "grad_norm": 61.12529225816948, "learning_rate": 2.0695077842428055e-08, "logits/chosen": -2.5233638286590576, "logits/rejected": -3.22310209274292, "logps/chosen": -391.7742919921875, "logps/rejected": -2939.45263671875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.545985698699951, "rewards/margins": 25.421306610107422, "rewards/rejected": -28.967296600341797, "step": 5750 }, { "epoch": 0.8151712425700538, "grad_norm": 0.4111655026083906, "learning_rate": 2.0537820412014468e-08, "logits/chosen": -2.5101444721221924, "logits/rejected": -3.064378499984741, "logps/chosen": -514.0364990234375, "logps/rejected": -2821.5634765625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.75023078918457, "rewards/margins": 23.043582916259766, "rewards/rejected": -27.793813705444336, "step": 5760 }, { "epoch": 0.8165864704217379, "grad_norm": 0.23272255276155093, "learning_rate": 2.0380562981600878e-08, "logits/chosen": -2.406623363494873, "logits/rejected": -3.167402505874634, "logps/chosen": -488.1158142089844, "logps/rejected": -3147.432861328125, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -4.491507053375244, "rewards/margins": 26.56463050842285, "rewards/rejected": -31.056137084960938, "step": 5770 }, { "epoch": 0.8180016982734221, "grad_norm": 0.01166569832459685, "learning_rate": 2.022330555118729e-08, "logits/chosen": -2.447815418243408, "logits/rejected": -3.133197546005249, "logps/chosen": -392.048828125, "logps/rejected": -2658.923095703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.543788433074951, "rewards/margins": 22.633228302001953, "rewards/rejected": -26.177013397216797, "step": 5780 }, { "epoch": 0.8194169261251062, "grad_norm": 0.8182444662563602, "learning_rate": 2.0066048120773708e-08, "logits/chosen": -2.485495090484619, "logits/rejected": -3.162827968597412, "logps/chosen": -444.2049255371094, "logps/rejected": -3034.143310546875, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.055840969085693, "rewards/margins": 25.862545013427734, "rewards/rejected": -29.918386459350586, "step": 5790 }, { "epoch": 0.8208321539767902, "grad_norm": 129.72866615306538, "learning_rate": 1.990879069036012e-08, "logits/chosen": -2.392277479171753, "logits/rejected": -3.0537219047546387, "logps/chosen": -412.60003662109375, "logps/rejected": -2774.33544921875, "loss": 0.0162, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.73881196975708, "rewards/margins": 23.60390853881836, "rewards/rejected": -27.34271812438965, "step": 5800 }, { "epoch": 0.8222473818284743, "grad_norm": 0.03596951854733009, "learning_rate": 1.975153325994653e-08, "logits/chosen": -2.4932284355163574, "logits/rejected": -3.1999614238739014, "logps/chosen": -395.8096923828125, "logps/rejected": -2700.17578125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.5629985332489014, "rewards/margins": 23.048913955688477, "rewards/rejected": -26.61191177368164, "step": 5810 }, { "epoch": 0.8236626096801585, "grad_norm": 0.582228234691383, "learning_rate": 1.9594275829532944e-08, "logits/chosen": -2.500483274459839, "logits/rejected": -3.214937210083008, "logps/chosen": -515.9754028320312, "logps/rejected": -2871.27783203125, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -4.760280609130859, "rewards/margins": 23.524572372436523, "rewards/rejected": -28.28485107421875, "step": 5820 }, { "epoch": 0.8250778375318426, "grad_norm": 0.03794942458885882, "learning_rate": 1.9437018399119357e-08, "logits/chosen": -2.4820868968963623, "logits/rejected": -3.182788133621216, "logps/chosen": -440.0284729003906, "logps/rejected": -2730.35009765625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.994266986846924, "rewards/margins": 22.876903533935547, "rewards/rejected": -26.871173858642578, "step": 5830 }, { "epoch": 0.8264930653835267, "grad_norm": 0.05255701998657121, "learning_rate": 1.927976096870577e-08, "logits/chosen": -2.511559247970581, "logits/rejected": -3.1529412269592285, "logps/chosen": -483.10711669921875, "logps/rejected": -3016.70654296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.406891822814941, "rewards/margins": 25.346975326538086, "rewards/rejected": -29.75386619567871, "step": 5840 }, { "epoch": 0.8279082932352109, "grad_norm": 0.06387061840828995, "learning_rate": 1.9122503538292183e-08, "logits/chosen": -2.5401546955108643, "logits/rejected": -3.2195029258728027, "logps/chosen": -493.57666015625, "logps/rejected": -2975.26025390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.527033805847168, "rewards/margins": 24.77916145324707, "rewards/rejected": -29.306194305419922, "step": 5850 }, { "epoch": 0.829323521086895, "grad_norm": 0.27910419441281564, "learning_rate": 1.8965246107878596e-08, "logits/chosen": -2.546062469482422, "logits/rejected": -3.147341728210449, "logps/chosen": -516.1578369140625, "logps/rejected": -2894.943115234375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.769204139709473, "rewards/margins": 23.743566513061523, "rewards/rejected": -28.512767791748047, "step": 5860 }, { "epoch": 0.8307387489385791, "grad_norm": 36.15879388597224, "learning_rate": 1.880798867746501e-08, "logits/chosen": -2.543337821960449, "logits/rejected": -3.232966661453247, "logps/chosen": -497.57098388671875, "logps/rejected": -3018.37353515625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.581273555755615, "rewards/margins": 25.20567512512207, "rewards/rejected": -29.78694725036621, "step": 5870 }, { "epoch": 0.8321539767902633, "grad_norm": 0.6139776688006455, "learning_rate": 1.8650731247051423e-08, "logits/chosen": -2.6187503337860107, "logits/rejected": -3.146149158477783, "logps/chosen": -495.1102600097656, "logps/rejected": -2925.56298828125, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -4.560490608215332, "rewards/margins": 24.259859085083008, "rewards/rejected": -28.820348739624023, "step": 5880 }, { "epoch": 0.8335692046419474, "grad_norm": 0.14272905144900305, "learning_rate": 1.8493473816637836e-08, "logits/chosen": -2.4712393283843994, "logits/rejected": -3.1455776691436768, "logps/chosen": -418.3623046875, "logps/rejected": -2893.91455078125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.7830443382263184, "rewards/margins": 24.72901725769043, "rewards/rejected": -28.51206398010254, "step": 5890 }, { "epoch": 0.8349844324936315, "grad_norm": 0.1598948596818688, "learning_rate": 1.833621638622425e-08, "logits/chosen": -2.5415754318237305, "logits/rejected": -3.2143478393554688, "logps/chosen": -425.067626953125, "logps/rejected": -3033.609375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.8663992881774902, "rewards/margins": 26.045730590820312, "rewards/rejected": -29.912128448486328, "step": 5900 }, { "epoch": 0.8363996603453155, "grad_norm": 4.0414181661148145, "learning_rate": 1.8178958955810662e-08, "logits/chosen": -2.562729835510254, "logits/rejected": -3.1571967601776123, "logps/chosen": -461.3857421875, "logps/rejected": -2875.40478515625, "loss": 0.0305, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.225476264953613, "rewards/margins": 24.095348358154297, "rewards/rejected": -28.32082748413086, "step": 5910 }, { "epoch": 0.8378148881969997, "grad_norm": 0.3407487198917977, "learning_rate": 1.8021701525397072e-08, "logits/chosen": -2.5938796997070312, "logits/rejected": -3.2140774726867676, "logps/chosen": -442.18878173828125, "logps/rejected": -2852.28369140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.0479326248168945, "rewards/margins": 24.057491302490234, "rewards/rejected": -28.105422973632812, "step": 5920 }, { "epoch": 0.8392301160486838, "grad_norm": 357.71880802731334, "learning_rate": 1.786444409498349e-08, "logits/chosen": -2.554145574569702, "logits/rejected": -3.176191568374634, "logps/chosen": -407.5418395996094, "logps/rejected": -2861.902587890625, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -3.6974310874938965, "rewards/margins": 24.47948455810547, "rewards/rejected": -28.17691421508789, "step": 5930 }, { "epoch": 0.8406453439003679, "grad_norm": 26.792785290481866, "learning_rate": 1.77071866645699e-08, "logits/chosen": -2.5877339839935303, "logits/rejected": -3.2773265838623047, "logps/chosen": -527.5278930664062, "logps/rejected": -3115.45947265625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.872957229614258, "rewards/margins": 25.8360652923584, "rewards/rejected": -30.709014892578125, "step": 5940 }, { "epoch": 0.8420605717520521, "grad_norm": 0.014117135815549865, "learning_rate": 1.7549929234156315e-08, "logits/chosen": -2.529301166534424, "logits/rejected": -3.1223063468933105, "logps/chosen": -469.23101806640625, "logps/rejected": -3080.9052734375, "loss": 0.0212, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.287397384643555, "rewards/margins": 26.068511962890625, "rewards/rejected": -30.355905532836914, "step": 5950 }, { "epoch": 0.8434757996037362, "grad_norm": 8.327693166612677, "learning_rate": 1.7392671803742725e-08, "logits/chosen": -2.552298069000244, "logits/rejected": -3.2232093811035156, "logps/chosen": -505.46685791015625, "logps/rejected": -3019.358642578125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -4.64530086517334, "rewards/margins": 25.118581771850586, "rewards/rejected": -29.76388168334961, "step": 5960 }, { "epoch": 0.8448910274554203, "grad_norm": 0.3870037110905339, "learning_rate": 1.7235414373329138e-08, "logits/chosen": -2.5576796531677246, "logits/rejected": -3.1462979316711426, "logps/chosen": -455.8260192871094, "logps/rejected": -3019.5439453125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.16148042678833, "rewards/margins": 25.567852020263672, "rewards/rejected": -29.729333877563477, "step": 5970 }, { "epoch": 0.8463062553071045, "grad_norm": 0.9687519779122667, "learning_rate": 1.7078156942915554e-08, "logits/chosen": -2.5481646060943604, "logits/rejected": -3.177790403366089, "logps/chosen": -490.2967224121094, "logps/rejected": -2855.4677734375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.513144493103027, "rewards/margins": 23.626338958740234, "rewards/rejected": -28.139484405517578, "step": 5980 }, { "epoch": 0.8477214831587886, "grad_norm": 0.026800101297577168, "learning_rate": 1.6920899512501964e-08, "logits/chosen": -2.561830759048462, "logits/rejected": -3.2354812622070312, "logps/chosen": -452.0931091308594, "logps/rejected": -2867.513671875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.134773254394531, "rewards/margins": 24.110172271728516, "rewards/rejected": -28.244943618774414, "step": 5990 }, { "epoch": 0.8491367110104727, "grad_norm": 130.17340730945583, "learning_rate": 1.6763642082088377e-08, "logits/chosen": -2.579188108444214, "logits/rejected": -3.1085338592529297, "logps/chosen": -456.41693115234375, "logps/rejected": -2751.689453125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.155807971954346, "rewards/margins": 22.9361515045166, "rewards/rejected": -27.091960906982422, "step": 6000 }, { "epoch": 0.8505519388621569, "grad_norm": 0.21715048528025363, "learning_rate": 1.660638465167479e-08, "logits/chosen": -2.4512369632720947, "logits/rejected": -3.090526580810547, "logps/chosen": -376.82745361328125, "logps/rejected": -2540.60791015625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.3875198364257812, "rewards/margins": 21.61964988708496, "rewards/rejected": -25.007169723510742, "step": 6010 }, { "epoch": 0.8519671667138409, "grad_norm": 7.581081601146398, "learning_rate": 1.6449127221261203e-08, "logits/chosen": -2.537994623184204, "logits/rejected": -3.091541051864624, "logps/chosen": -437.0892639160156, "logps/rejected": -2879.85009765625, "loss": 0.047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.993961811065674, "rewards/margins": 24.363445281982422, "rewards/rejected": -28.357402801513672, "step": 6020 }, { "epoch": 0.853382394565525, "grad_norm": 1.2686632300101859, "learning_rate": 1.6291869790847617e-08, "logits/chosen": -2.6003050804138184, "logits/rejected": -3.2037270069122314, "logps/chosen": -428.6337890625, "logps/rejected": -2843.62744140625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.9041848182678223, "rewards/margins": 24.113994598388672, "rewards/rejected": -28.018178939819336, "step": 6030 }, { "epoch": 0.8547976224172091, "grad_norm": 0.005560538630182003, "learning_rate": 1.613461236043403e-08, "logits/chosen": -2.4688217639923096, "logits/rejected": -3.1291909217834473, "logps/chosen": -249.0596923828125, "logps/rejected": -2613.9462890625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.126141309738159, "rewards/margins": 23.608631134033203, "rewards/rejected": -25.734771728515625, "step": 6040 }, { "epoch": 0.8562128502688933, "grad_norm": 9.683491870760822, "learning_rate": 1.5977354930020443e-08, "logits/chosen": -2.4871606826782227, "logits/rejected": -3.16336727142334, "logps/chosen": -357.671630859375, "logps/rejected": -2962.277587890625, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -3.1902835369110107, "rewards/margins": 26.022533416748047, "rewards/rejected": -29.212814331054688, "step": 6050 }, { "epoch": 0.8576280781205774, "grad_norm": 0.5371144201580696, "learning_rate": 1.5820097499606856e-08, "logits/chosen": -2.566019058227539, "logits/rejected": -3.1128604412078857, "logps/chosen": -295.2489013671875, "logps/rejected": -2455.61474609375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.5693411827087402, "rewards/margins": 21.585834503173828, "rewards/rejected": -24.15517807006836, "step": 6060 }, { "epoch": 0.8590433059722615, "grad_norm": 0.010918467380623657, "learning_rate": 1.566284006919327e-08, "logits/chosen": -2.511425495147705, "logits/rejected": -3.144568920135498, "logps/chosen": -404.60394287109375, "logps/rejected": -2893.50439453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.6426479816436768, "rewards/margins": 24.874204635620117, "rewards/rejected": -28.5168514251709, "step": 6070 }, { "epoch": 0.8604585338239457, "grad_norm": 0.09472787827663022, "learning_rate": 1.5505582638779682e-08, "logits/chosen": -2.5290656089782715, "logits/rejected": -3.116499662399292, "logps/chosen": -446.09765625, "logps/rejected": -2696.24169921875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.057980537414551, "rewards/margins": 22.48325538635254, "rewards/rejected": -26.541234970092773, "step": 6080 }, { "epoch": 0.8618737616756298, "grad_norm": 0.18831620984799188, "learning_rate": 1.5348325208366095e-08, "logits/chosen": -2.566530704498291, "logits/rejected": -3.1360857486724854, "logps/chosen": -504.75445556640625, "logps/rejected": -2945.71435546875, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.6389899253845215, "rewards/margins": 24.41191291809082, "rewards/rejected": -29.050899505615234, "step": 6090 }, { "epoch": 0.8632889895273139, "grad_norm": 0.19461280033808695, "learning_rate": 1.519106777795251e-08, "logits/chosen": -2.4989514350891113, "logits/rejected": -3.1932692527770996, "logps/chosen": -393.9683532714844, "logps/rejected": -2787.820556640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.5397872924804688, "rewards/margins": 23.891063690185547, "rewards/rejected": -27.430850982666016, "step": 6100 }, { "epoch": 0.864704217378998, "grad_norm": 0.8791474495634848, "learning_rate": 1.503381034753892e-08, "logits/chosen": -2.5811991691589355, "logits/rejected": -3.1620841026306152, "logps/chosen": -522.1402587890625, "logps/rejected": -2959.65283203125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -4.839706897735596, "rewards/margins": 24.356103897094727, "rewards/rejected": -29.195810317993164, "step": 6110 }, { "epoch": 0.8661194452306822, "grad_norm": 185.48556059638238, "learning_rate": 1.4876552917125335e-08, "logits/chosen": -2.4725141525268555, "logits/rejected": -3.248051404953003, "logps/chosen": -406.7466125488281, "logps/rejected": -2855.53271484375, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -3.6826934814453125, "rewards/margins": 24.460472106933594, "rewards/rejected": -28.143163681030273, "step": 6120 }, { "epoch": 0.8675346730823663, "grad_norm": 0.02104696926390046, "learning_rate": 1.4719295486711746e-08, "logits/chosen": -2.5342578887939453, "logits/rejected": -3.2363505363464355, "logps/chosen": -404.277099609375, "logps/rejected": -2934.286865234375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.646024703979492, "rewards/margins": 25.26700782775879, "rewards/rejected": -28.91303062438965, "step": 6130 }, { "epoch": 0.8689499009340503, "grad_norm": 0.006516872372889263, "learning_rate": 1.456203805629816e-08, "logits/chosen": -2.5548386573791504, "logits/rejected": -3.2190029621124268, "logps/chosen": -413.60943603515625, "logps/rejected": -3094.02978515625, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -3.757631778717041, "rewards/margins": 26.766132354736328, "rewards/rejected": -30.523761749267578, "step": 6140 }, { "epoch": 0.8703651287857345, "grad_norm": 0.8275799225866035, "learning_rate": 1.4404780625884571e-08, "logits/chosen": -2.4959053993225098, "logits/rejected": -3.1957485675811768, "logps/chosen": -362.0610046386719, "logps/rejected": -2879.400390625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -3.2415497303009033, "rewards/margins": 25.173686981201172, "rewards/rejected": -28.415237426757812, "step": 6150 }, { "epoch": 0.8717803566374186, "grad_norm": 0.05165003700098383, "learning_rate": 1.4247523195470984e-08, "logits/chosen": -2.5362563133239746, "logits/rejected": -3.131014347076416, "logps/chosen": -449.9742126464844, "logps/rejected": -2902.367919921875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -4.088015556335449, "rewards/margins": 24.45737075805664, "rewards/rejected": -28.54538345336914, "step": 6160 }, { "epoch": 0.8731955844891027, "grad_norm": 0.14709778244992003, "learning_rate": 1.4090265765057399e-08, "logits/chosen": -2.5371062755584717, "logits/rejected": -3.2161293029785156, "logps/chosen": -325.2726745605469, "logps/rejected": -2570.487548828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.8717567920684814, "rewards/margins": 22.434070587158203, "rewards/rejected": -25.305828094482422, "step": 6170 }, { "epoch": 0.8746108123407869, "grad_norm": 0.00046597210794844786, "learning_rate": 1.3933008334643812e-08, "logits/chosen": -2.5618388652801514, "logits/rejected": -3.2253546714782715, "logps/chosen": -416.46881103515625, "logps/rejected": -2806.509765625, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -3.775066375732422, "rewards/margins": 23.875621795654297, "rewards/rejected": -27.65069007873535, "step": 6180 }, { "epoch": 0.876026040192471, "grad_norm": 3.887710773138902, "learning_rate": 1.3775750904230224e-08, "logits/chosen": -2.553612232208252, "logits/rejected": -3.1159911155700684, "logps/chosen": -518.810791015625, "logps/rejected": -3033.154296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.78640079498291, "rewards/margins": 25.099712371826172, "rewards/rejected": -29.8861141204834, "step": 6190 }, { "epoch": 0.8774412680441551, "grad_norm": 0.054923880203803246, "learning_rate": 1.3618493473816637e-08, "logits/chosen": -2.58113694190979, "logits/rejected": -3.215674877166748, "logps/chosen": -449.8055725097656, "logps/rejected": -2690.087890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.105997562408447, "rewards/margins": 22.411460876464844, "rewards/rejected": -26.5174560546875, "step": 6200 }, { "epoch": 0.8788564958958393, "grad_norm": 0.0987099349769097, "learning_rate": 1.3461236043403052e-08, "logits/chosen": -2.577101945877075, "logits/rejected": -3.200169086456299, "logps/chosen": -442.88800048828125, "logps/rejected": -3146.964111328125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -4.047898292541504, "rewards/margins": 26.983585357666016, "rewards/rejected": -31.031482696533203, "step": 6210 }, { "epoch": 0.8802717237475234, "grad_norm": 0.7024420382554746, "learning_rate": 1.3303978612989463e-08, "logits/chosen": -2.582029342651367, "logits/rejected": -3.255147933959961, "logps/chosen": -501.8687438964844, "logps/rejected": -2864.296630859375, "loss": 0.0459, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.585448741912842, "rewards/margins": 23.61685562133789, "rewards/rejected": -28.202306747436523, "step": 6220 }, { "epoch": 0.8816869515992075, "grad_norm": 1.3020795133145366, "learning_rate": 1.3146721182575876e-08, "logits/chosen": -2.530090808868408, "logits/rejected": -3.256685256958008, "logps/chosen": -415.53875732421875, "logps/rejected": -2663.60009765625, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.782421827316284, "rewards/margins": 22.45937728881836, "rewards/rejected": -26.24180030822754, "step": 6230 }, { "epoch": 0.8831021794508916, "grad_norm": 0.07081575983355229, "learning_rate": 1.298946375216229e-08, "logits/chosen": -2.6060731410980225, "logits/rejected": -3.2080464363098145, "logps/chosen": -393.4821472167969, "logps/rejected": -2741.852294921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.556368350982666, "rewards/margins": 23.447826385498047, "rewards/rejected": -27.004192352294922, "step": 6240 }, { "epoch": 0.8845174073025757, "grad_norm": 5.281897635678403, "learning_rate": 1.28322063217487e-08, "logits/chosen": -2.526094436645508, "logits/rejected": -3.1373915672302246, "logps/chosen": -429.30438232421875, "logps/rejected": -2806.70068359375, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -3.901991605758667, "rewards/margins": 23.75434684753418, "rewards/rejected": -27.65633773803711, "step": 6250 }, { "epoch": 0.8859326351542598, "grad_norm": 292.03420945587675, "learning_rate": 1.2674948891335116e-08, "logits/chosen": -2.5555849075317383, "logits/rejected": -3.2073168754577637, "logps/chosen": -353.7906188964844, "logps/rejected": -2823.84130859375, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -3.1611454486846924, "rewards/margins": 24.662837982177734, "rewards/rejected": -27.8239803314209, "step": 6260 }, { "epoch": 0.8873478630059439, "grad_norm": 1.2225292880068073, "learning_rate": 1.2517691460921529e-08, "logits/chosen": -2.5660133361816406, "logits/rejected": -3.2094902992248535, "logps/chosen": -453.3834533691406, "logps/rejected": -2797.363525390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.118340492248535, "rewards/margins": 23.42843246459961, "rewards/rejected": -27.546772003173828, "step": 6270 }, { "epoch": 0.8887630908576281, "grad_norm": 0.05328345477609401, "learning_rate": 1.236043403050794e-08, "logits/chosen": -2.6342549324035645, "logits/rejected": -3.251710891723633, "logps/chosen": -443.024658203125, "logps/rejected": -2835.54833984375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.010542869567871, "rewards/margins": 23.920629501342773, "rewards/rejected": -27.93117332458496, "step": 6280 }, { "epoch": 0.8901783187093122, "grad_norm": 0.10432304907267172, "learning_rate": 1.2203176600094355e-08, "logits/chosen": -2.5605766773223877, "logits/rejected": -3.240870952606201, "logps/chosen": -433.1863708496094, "logps/rejected": -3097.61474609375, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.93723726272583, "rewards/margins": 26.613048553466797, "rewards/rejected": -30.550283432006836, "step": 6290 }, { "epoch": 0.8915935465609963, "grad_norm": 2.805239136740369, "learning_rate": 1.2045919169680767e-08, "logits/chosen": -2.636470079421997, "logits/rejected": -3.256408214569092, "logps/chosen": -501.0801696777344, "logps/rejected": -3029.068359375, "loss": 0.0095, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.613499641418457, "rewards/margins": 25.241046905517578, "rewards/rejected": -29.85454750061035, "step": 6300 }, { "epoch": 0.8930087744126805, "grad_norm": 0.06190579535941882, "learning_rate": 1.188866173926718e-08, "logits/chosen": -2.4899632930755615, "logits/rejected": -3.2022507190704346, "logps/chosen": -502.60101318359375, "logps/rejected": -3009.101318359375, "loss": 0.0147, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.612534999847412, "rewards/margins": 25.022991180419922, "rewards/rejected": -29.63552474975586, "step": 6310 }, { "epoch": 0.8944240022643646, "grad_norm": 329.2004429490255, "learning_rate": 1.1731404308853593e-08, "logits/chosen": -2.5157878398895264, "logits/rejected": -3.2090916633605957, "logps/chosen": -475.99310302734375, "logps/rejected": -2774.2109375, "loss": 0.0425, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.360434532165527, "rewards/margins": 22.96625518798828, "rewards/rejected": -27.326690673828125, "step": 6320 }, { "epoch": 0.8958392301160487, "grad_norm": 1.6426116223623302, "learning_rate": 1.1574146878440006e-08, "logits/chosen": -2.457486867904663, "logits/rejected": -3.1887881755828857, "logps/chosen": -341.50628662109375, "logps/rejected": -2626.24267578125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.0008745193481445, "rewards/margins": 22.843460083007812, "rewards/rejected": -25.84433364868164, "step": 6330 }, { "epoch": 0.8972544579677328, "grad_norm": 5.923004163505092, "learning_rate": 1.1416889448026419e-08, "logits/chosen": -2.4186036586761475, "logits/rejected": -3.08160138130188, "logps/chosen": -387.20819091796875, "logps/rejected": -2595.337646484375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.491797924041748, "rewards/margins": 22.07093620300293, "rewards/rejected": -25.562734603881836, "step": 6340 }, { "epoch": 0.898669685819417, "grad_norm": 0.010001960204992189, "learning_rate": 1.1259632017612832e-08, "logits/chosen": -2.541804552078247, "logits/rejected": -3.2163634300231934, "logps/chosen": -366.0329284667969, "logps/rejected": -2700.55908203125, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2713420391082764, "rewards/margins": 23.32735824584961, "rewards/rejected": -26.59869956970215, "step": 6350 }, { "epoch": 0.900084913671101, "grad_norm": 0.04297213677665855, "learning_rate": 1.1102374587199245e-08, "logits/chosen": -2.508523464202881, "logits/rejected": -3.153554677963257, "logps/chosen": -415.4168395996094, "logps/rejected": -2661.183349609375, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -3.7572968006134033, "rewards/margins": 22.46708869934082, "rewards/rejected": -26.22438621520996, "step": 6360 }, { "epoch": 0.9015001415227851, "grad_norm": 0.18107919302604203, "learning_rate": 1.0945117156785657e-08, "logits/chosen": -2.490666151046753, "logits/rejected": -3.0826382637023926, "logps/chosen": -331.5885009765625, "logps/rejected": -2606.04150390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.938490390777588, "rewards/margins": 22.714929580688477, "rewards/rejected": -25.653417587280273, "step": 6370 }, { "epoch": 0.9029153693744693, "grad_norm": 0.0922572836829624, "learning_rate": 1.078785972637207e-08, "logits/chosen": -2.5126190185546875, "logits/rejected": -3.1915974617004395, "logps/chosen": -388.4457092285156, "logps/rejected": -2837.22314453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4912095069885254, "rewards/margins": 24.45429229736328, "rewards/rejected": -27.94550132751465, "step": 6380 }, { "epoch": 0.9043305972261534, "grad_norm": 0.2452337816591314, "learning_rate": 1.0630602295958483e-08, "logits/chosen": -2.5313632488250732, "logits/rejected": -3.131080389022827, "logps/chosen": -373.8753356933594, "logps/rejected": -2725.629150390625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.344817638397217, "rewards/margins": 23.501384735107422, "rewards/rejected": -26.846202850341797, "step": 6390 }, { "epoch": 0.9057458250778375, "grad_norm": 0.5065366077725473, "learning_rate": 1.0473344865544896e-08, "logits/chosen": -2.440309762954712, "logits/rejected": -3.077716588973999, "logps/chosen": -393.05035400390625, "logps/rejected": -2816.71875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.53899884223938, "rewards/margins": 24.215715408325195, "rewards/rejected": -27.754718780517578, "step": 6400 }, { "epoch": 0.9071610529295216, "grad_norm": 3.2370280637435487, "learning_rate": 1.031608743513131e-08, "logits/chosen": -2.5545272827148438, "logits/rejected": -3.1959388256073, "logps/chosen": -417.9851989746094, "logps/rejected": -2950.1552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.790144443511963, "rewards/margins": 25.26341438293457, "rewards/rejected": -29.053558349609375, "step": 6410 }, { "epoch": 0.9085762807812058, "grad_norm": 3.3428874993145077, "learning_rate": 1.0158830004717723e-08, "logits/chosen": -2.5231223106384277, "logits/rejected": -3.1422176361083984, "logps/chosen": -517.7268676757812, "logps/rejected": -2997.398681640625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.785609245300293, "rewards/margins": 24.756935119628906, "rewards/rejected": -29.542545318603516, "step": 6420 }, { "epoch": 0.9099915086328899, "grad_norm": 2.326263223443667, "learning_rate": 1.0001572574304136e-08, "logits/chosen": -2.5041370391845703, "logits/rejected": -3.213315963745117, "logps/chosen": -361.72027587890625, "logps/rejected": -2584.052978515625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.2484562397003174, "rewards/margins": 22.21963119506836, "rewards/rejected": -25.46808624267578, "step": 6430 }, { "epoch": 0.911406736484574, "grad_norm": 0.031184185033551727, "learning_rate": 9.844315143890549e-09, "logits/chosen": -2.505342960357666, "logits/rejected": -3.126692295074463, "logps/chosen": -498.55218505859375, "logps/rejected": -3000.236083984375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.589402675628662, "rewards/margins": 24.991695404052734, "rewards/rejected": -29.581096649169922, "step": 6440 }, { "epoch": 0.9128219643362582, "grad_norm": 0.07194044896721327, "learning_rate": 9.68705771347696e-09, "logits/chosen": -2.5120351314544678, "logits/rejected": -3.155113697052002, "logps/chosen": -443.19500732421875, "logps/rejected": -2908.272705078125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.034646034240723, "rewards/margins": 24.607282638549805, "rewards/rejected": -28.64192771911621, "step": 6450 }, { "epoch": 0.9142371921879423, "grad_norm": 0.015943325020434572, "learning_rate": 9.529800283063375e-09, "logits/chosen": -2.5007221698760986, "logits/rejected": -3.189314365386963, "logps/chosen": -409.7893981933594, "logps/rejected": -2726.744140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.698823928833008, "rewards/margins": 23.156803131103516, "rewards/rejected": -26.855627059936523, "step": 6460 }, { "epoch": 0.9156524200396263, "grad_norm": 0.3753664601272291, "learning_rate": 9.372542852649787e-09, "logits/chosen": -2.4669125080108643, "logits/rejected": -3.155294895172119, "logps/chosen": -350.2514343261719, "logps/rejected": -2806.182861328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.132167339324951, "rewards/margins": 24.517221450805664, "rewards/rejected": -27.649389266967773, "step": 6470 }, { "epoch": 0.9170676478913105, "grad_norm": 0.19569237562349948, "learning_rate": 9.2152854222362e-09, "logits/chosen": -2.485018253326416, "logits/rejected": -3.112067222595215, "logps/chosen": -342.31134033203125, "logps/rejected": -2562.9189453125, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -3.0415549278259277, "rewards/margins": 22.183246612548828, "rewards/rejected": -25.224802017211914, "step": 6480 }, { "epoch": 0.9184828757429946, "grad_norm": 0.12179302217021704, "learning_rate": 9.058027991822613e-09, "logits/chosen": -2.5989136695861816, "logits/rejected": -3.227771043777466, "logps/chosen": -430.302490234375, "logps/rejected": -3001.483154296875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -3.917614698410034, "rewards/margins": 25.63525390625, "rewards/rejected": -29.552867889404297, "step": 6490 }, { "epoch": 0.9198981035946787, "grad_norm": 1.4381881090921127, "learning_rate": 8.900770561409026e-09, "logits/chosen": -2.5356051921844482, "logits/rejected": -3.287515640258789, "logps/chosen": -427.92852783203125, "logps/rejected": -2638.7177734375, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -3.8924460411071777, "rewards/margins": 22.10297203063965, "rewards/rejected": -25.99542236328125, "step": 6500 }, { "epoch": 0.9213133314463628, "grad_norm": 0.24254961037562675, "learning_rate": 8.74351313099544e-09, "logits/chosen": -2.5304338932037354, "logits/rejected": -3.2248470783233643, "logps/chosen": -429.7798767089844, "logps/rejected": -2707.469970703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.9057343006134033, "rewards/margins": 22.771482467651367, "rewards/rejected": -26.67721939086914, "step": 6510 }, { "epoch": 0.922728559298047, "grad_norm": 10.069973974813468, "learning_rate": 8.58625570058185e-09, "logits/chosen": -2.5849969387054443, "logits/rejected": -3.2126522064208984, "logps/chosen": -438.419189453125, "logps/rejected": -3070.768310546875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.004711627960205, "rewards/margins": 26.28034019470215, "rewards/rejected": -30.285049438476562, "step": 6520 }, { "epoch": 0.9241437871497311, "grad_norm": 2.364120320448904, "learning_rate": 8.428998270168266e-09, "logits/chosen": -2.5308117866516113, "logits/rejected": -3.1276068687438965, "logps/chosen": -368.597900390625, "logps/rejected": -2630.19140625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.3030776977539062, "rewards/margins": 22.58272361755371, "rewards/rejected": -25.885799407958984, "step": 6530 }, { "epoch": 0.9255590150014152, "grad_norm": 9.692461937823506, "learning_rate": 8.271740839754677e-09, "logits/chosen": -2.4981741905212402, "logits/rejected": -3.0879766941070557, "logps/chosen": -471.1439514160156, "logps/rejected": -2777.46240234375, "loss": 0.0095, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.302935600280762, "rewards/margins": 23.051820755004883, "rewards/rejected": -27.35475730895996, "step": 6540 }, { "epoch": 0.9269742428530994, "grad_norm": 0.11859409705212479, "learning_rate": 8.114483409341092e-09, "logits/chosen": -2.550344944000244, "logits/rejected": -3.2037181854248047, "logps/chosen": -324.40673828125, "logps/rejected": -2989.517822265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.8632924556732178, "rewards/margins": 26.609600067138672, "rewards/rejected": -29.4728946685791, "step": 6550 }, { "epoch": 0.9283894707047835, "grad_norm": 0.15436333367326202, "learning_rate": 7.957225978927503e-09, "logits/chosen": -2.52455997467041, "logits/rejected": -3.1924126148223877, "logps/chosen": -478.3021545410156, "logps/rejected": -3014.308837890625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.394221305847168, "rewards/margins": 25.294414520263672, "rewards/rejected": -29.68863868713379, "step": 6560 }, { "epoch": 0.9298046985564676, "grad_norm": 0.05601802835400842, "learning_rate": 7.799968548513918e-09, "logits/chosen": -2.5811691284179688, "logits/rejected": -3.2521209716796875, "logps/chosen": -385.7977600097656, "logps/rejected": -2523.89599609375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.459597110748291, "rewards/margins": 21.38286781311035, "rewards/rejected": -24.842464447021484, "step": 6570 }, { "epoch": 0.9312199264081518, "grad_norm": 1.1449473438539999, "learning_rate": 7.64271111810033e-09, "logits/chosen": -2.4976139068603516, "logits/rejected": -3.15960955619812, "logps/chosen": -415.5726013183594, "logps/rejected": -2786.330078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7871131896972656, "rewards/margins": 23.658687591552734, "rewards/rejected": -27.44580078125, "step": 6580 }, { "epoch": 0.9326351542598358, "grad_norm": 2.2471703032136308, "learning_rate": 7.485453687686743e-09, "logits/chosen": -2.5343689918518066, "logits/rejected": -3.19372296333313, "logps/chosen": -467.86724853515625, "logps/rejected": -2810.1396484375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.259548187255859, "rewards/margins": 23.412616729736328, "rewards/rejected": -27.672164916992188, "step": 6590 }, { "epoch": 0.9340503821115199, "grad_norm": 20.726415448061296, "learning_rate": 7.328196257273156e-09, "logits/chosen": -2.6028857231140137, "logits/rejected": -3.2126851081848145, "logps/chosen": -451.20672607421875, "logps/rejected": -2855.85595703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.115120887756348, "rewards/margins": 24.01508140563965, "rewards/rejected": -28.130207061767578, "step": 6600 }, { "epoch": 0.935465609963204, "grad_norm": 0.0024719046040340246, "learning_rate": 7.170938826859568e-09, "logits/chosen": -2.6418912410736084, "logits/rejected": -3.226320743560791, "logps/chosen": -437.61944580078125, "logps/rejected": -2724.34765625, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -3.9595673084259033, "rewards/margins": 22.84752655029297, "rewards/rejected": -26.80709457397461, "step": 6610 }, { "epoch": 0.9368808378148882, "grad_norm": 0.07249745020157336, "learning_rate": 7.013681396445982e-09, "logits/chosen": -2.6213529109954834, "logits/rejected": -3.143972396850586, "logps/chosen": -474.4747619628906, "logps/rejected": -2898.43603515625, "loss": 0.0163, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.352960586547852, "rewards/margins": 24.202037811279297, "rewards/rejected": -28.55499839782715, "step": 6620 }, { "epoch": 0.9382960656665723, "grad_norm": 1.6858040987564331, "learning_rate": 6.8564239660323946e-09, "logits/chosen": -2.5217795372009277, "logits/rejected": -3.1645846366882324, "logps/chosen": -356.1394958496094, "logps/rejected": -2560.604736328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.174966812133789, "rewards/margins": 22.031288146972656, "rewards/rejected": -25.206254959106445, "step": 6630 }, { "epoch": 0.9397112935182564, "grad_norm": 0.42759059500952096, "learning_rate": 6.6991665356188086e-09, "logits/chosen": -2.5588769912719727, "logits/rejected": -3.1910831928253174, "logps/chosen": -461.03485107421875, "logps/rejected": -2795.68603515625, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.231947898864746, "rewards/margins": 23.338594436645508, "rewards/rejected": -27.570541381835938, "step": 6640 }, { "epoch": 0.9411265213699406, "grad_norm": 0.1115190975887913, "learning_rate": 6.541909105205221e-09, "logits/chosen": -2.4988367557525635, "logits/rejected": -3.125074625015259, "logps/chosen": -437.36590576171875, "logps/rejected": -2532.310791015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.011125564575195, "rewards/margins": 20.94327163696289, "rewards/rejected": -24.954397201538086, "step": 6650 }, { "epoch": 0.9425417492216247, "grad_norm": 152.8851360462959, "learning_rate": 6.384651674791634e-09, "logits/chosen": -2.5039424896240234, "logits/rejected": -3.185145854949951, "logps/chosen": -363.4470520019531, "logps/rejected": -2898.4560546875, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": -3.262129545211792, "rewards/margins": 25.294475555419922, "rewards/rejected": -28.556604385375977, "step": 6660 }, { "epoch": 0.9439569770733088, "grad_norm": 13.808310513078263, "learning_rate": 6.227394244378046e-09, "logits/chosen": -2.5821967124938965, "logits/rejected": -3.2668204307556152, "logps/chosen": -397.5326232910156, "logps/rejected": -3054.84228515625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.5857625007629395, "rewards/margins": 26.55099105834961, "rewards/rejected": -30.136754989624023, "step": 6670 }, { "epoch": 0.945372204924993, "grad_norm": 6.708244323843358, "learning_rate": 6.0701368139644595e-09, "logits/chosen": -2.5159854888916016, "logits/rejected": -3.144465208053589, "logps/chosen": -355.472412109375, "logps/rejected": -2676.97314453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.172433614730835, "rewards/margins": 23.195859909057617, "rewards/rejected": -26.3682918548584, "step": 6680 }, { "epoch": 0.9467874327766771, "grad_norm": 77.57978569307019, "learning_rate": 5.912879383550873e-09, "logits/chosen": -2.5680649280548096, "logits/rejected": -3.1938440799713135, "logps/chosen": -470.2861328125, "logps/rejected": -2936.075927734375, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -4.30758810043335, "rewards/margins": 24.636417388916016, "rewards/rejected": -28.944005966186523, "step": 6690 }, { "epoch": 0.9482026606283611, "grad_norm": 37.47740644396665, "learning_rate": 5.755621953137285e-09, "logits/chosen": -2.532409191131592, "logits/rejected": -3.103505849838257, "logps/chosen": -392.10223388671875, "logps/rejected": -2644.917236328125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.5219218730926514, "rewards/margins": 22.5264892578125, "rewards/rejected": -26.048410415649414, "step": 6700 }, { "epoch": 0.9496178884800452, "grad_norm": 20.247759039208294, "learning_rate": 5.598364522723698e-09, "logits/chosen": -2.6303787231445312, "logits/rejected": -3.1892428398132324, "logps/chosen": -469.10089111328125, "logps/rejected": -2915.725341796875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.29639196395874, "rewards/margins": 24.411623001098633, "rewards/rejected": -28.708017349243164, "step": 6710 }, { "epoch": 0.9510331163317294, "grad_norm": 0.7882196000397126, "learning_rate": 5.441107092310111e-09, "logits/chosen": -2.5218119621276855, "logits/rejected": -3.210817813873291, "logps/chosen": -444.222412109375, "logps/rejected": -2750.052734375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.0369791984558105, "rewards/margins": 23.056543350219727, "rewards/rejected": -27.093521118164062, "step": 6720 }, { "epoch": 0.9524483441834135, "grad_norm": 0.25579047904779756, "learning_rate": 5.283849661896524e-09, "logits/chosen": -2.6067352294921875, "logits/rejected": -3.18866229057312, "logps/chosen": -477.4564514160156, "logps/rejected": -2672.0419921875, "loss": 0.0124, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.38925838470459, "rewards/margins": 21.936967849731445, "rewards/rejected": -26.32622718811035, "step": 6730 }, { "epoch": 0.9538635720350976, "grad_norm": 48.684201326719425, "learning_rate": 5.1265922314829375e-09, "logits/chosen": -2.633354663848877, "logits/rejected": -3.2276453971862793, "logps/chosen": -566.0477905273438, "logps/rejected": -3109.52392578125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.269009590148926, "rewards/margins": 25.38370704650879, "rewards/rejected": -30.6527156829834, "step": 6740 }, { "epoch": 0.9552787998867818, "grad_norm": 0.2713102265913373, "learning_rate": 4.969334801069351e-09, "logits/chosen": -2.5992467403411865, "logits/rejected": -3.3306610584259033, "logps/chosen": -434.9539489746094, "logps/rejected": -2831.95263671875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.959697723388672, "rewards/margins": 23.928010940551758, "rewards/rejected": -27.887706756591797, "step": 6750 }, { "epoch": 0.9566940277384659, "grad_norm": 0.204647745366033, "learning_rate": 4.812077370655764e-09, "logits/chosen": -2.5274746417999268, "logits/rejected": -3.11221981048584, "logps/chosen": -414.9981994628906, "logps/rejected": -2920.882080078125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.7723147869110107, "rewards/margins": 25.016611099243164, "rewards/rejected": -28.788925170898438, "step": 6760 }, { "epoch": 0.95810925559015, "grad_norm": 0.015456421683272324, "learning_rate": 4.654819940242176e-09, "logits/chosen": -2.5432753562927246, "logits/rejected": -3.169085741043091, "logps/chosen": -431.6983947753906, "logps/rejected": -3043.0478515625, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": -3.9238624572753906, "rewards/margins": 26.098464965820312, "rewards/rejected": -30.022327423095703, "step": 6770 }, { "epoch": 0.9595244834418342, "grad_norm": 10.758254133673354, "learning_rate": 4.497562509828589e-09, "logits/chosen": -2.630561113357544, "logits/rejected": -3.285642623901367, "logps/chosen": -400.30035400390625, "logps/rejected": -2893.667724609375, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": -3.615029811859131, "rewards/margins": 24.902395248413086, "rewards/rejected": -28.517425537109375, "step": 6780 }, { "epoch": 0.9609397112935183, "grad_norm": 0.08856167479007233, "learning_rate": 4.340305079415002e-09, "logits/chosen": -2.607855796813965, "logits/rejected": -3.2052688598632812, "logps/chosen": -474.60748291015625, "logps/rejected": -2769.653564453125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.359852313995361, "rewards/margins": 22.92572593688965, "rewards/rejected": -27.285579681396484, "step": 6790 }, { "epoch": 0.9623549391452024, "grad_norm": 0.32353373552773057, "learning_rate": 4.183047649001415e-09, "logits/chosen": -2.615349769592285, "logits/rejected": -3.289827823638916, "logps/chosen": -369.09967041015625, "logps/rejected": -3131.83056640625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.3179118633270264, "rewards/margins": 27.554779052734375, "rewards/rejected": -30.872690200805664, "step": 6800 }, { "epoch": 0.9637701669968864, "grad_norm": 0.16660597996876098, "learning_rate": 4.025790218587828e-09, "logits/chosen": -2.5554356575012207, "logits/rejected": -3.18927264213562, "logps/chosen": -340.3284912109375, "logps/rejected": -2792.04345703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0212130546569824, "rewards/margins": 24.48760414123535, "rewards/rejected": -27.50881576538086, "step": 6810 }, { "epoch": 0.9651853948485706, "grad_norm": 0.6069772136313947, "learning_rate": 3.868532788174241e-09, "logits/chosen": -2.676387310028076, "logits/rejected": -3.36785888671875, "logps/chosen": -394.1666259765625, "logps/rejected": -3143.48681640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.5456929206848145, "rewards/margins": 27.44460678100586, "rewards/rejected": -30.99030113220215, "step": 6820 }, { "epoch": 0.9666006227002547, "grad_norm": 0.008495301983693835, "learning_rate": 3.711275357760654e-09, "logits/chosen": -2.6255509853363037, "logits/rejected": -3.296365261077881, "logps/chosen": -407.9010009765625, "logps/rejected": -2904.65576171875, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6842446327209473, "rewards/margins": 24.966773986816406, "rewards/rejected": -28.651020050048828, "step": 6830 }, { "epoch": 0.9680158505519388, "grad_norm": 0.24232514008120493, "learning_rate": 3.5540179273470674e-09, "logits/chosen": -2.578047752380371, "logits/rejected": -3.3089327812194824, "logps/chosen": -429.427734375, "logps/rejected": -2987.8662109375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.914933681488037, "rewards/margins": 25.525209426879883, "rewards/rejected": -29.440139770507812, "step": 6840 }, { "epoch": 0.969431078403623, "grad_norm": 0.2252178711889845, "learning_rate": 3.3967604969334797e-09, "logits/chosen": -2.6080422401428223, "logits/rejected": -3.26128888130188, "logps/chosen": -366.7235412597656, "logps/rejected": -2599.47705078125, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -3.2944443225860596, "rewards/margins": 22.307729721069336, "rewards/rejected": -25.6021728515625, "step": 6850 }, { "epoch": 0.9708463062553071, "grad_norm": 8.33780348865249, "learning_rate": 3.239503066519893e-09, "logits/chosen": -2.5319879055023193, "logits/rejected": -3.2218499183654785, "logps/chosen": -487.508056640625, "logps/rejected": -2844.683349609375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.498915195465088, "rewards/margins": 23.573257446289062, "rewards/rejected": -28.072174072265625, "step": 6860 }, { "epoch": 0.9722615341069912, "grad_norm": 0.5986387225385357, "learning_rate": 3.082245636106306e-09, "logits/chosen": -2.6759352684020996, "logits/rejected": -3.3080246448516846, "logps/chosen": -464.6507873535156, "logps/rejected": -2940.341064453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.268141746520996, "rewards/margins": 24.698143005371094, "rewards/rejected": -28.966283798217773, "step": 6870 }, { "epoch": 0.9736767619586754, "grad_norm": 0.8658772230340441, "learning_rate": 2.9249882056927187e-09, "logits/chosen": -2.6269848346710205, "logits/rejected": -3.2511463165283203, "logps/chosen": -427.97662353515625, "logps/rejected": -2950.122314453125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -3.8919200897216797, "rewards/margins": 25.186004638671875, "rewards/rejected": -29.077926635742188, "step": 6880 }, { "epoch": 0.9750919898103595, "grad_norm": 2.3267641991541925, "learning_rate": 2.767730775279132e-09, "logits/chosen": -2.560084342956543, "logits/rejected": -3.2120819091796875, "logps/chosen": -361.8909912109375, "logps/rejected": -2924.292236328125, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -3.238138198852539, "rewards/margins": 25.590801239013672, "rewards/rejected": -28.82893943786621, "step": 6890 }, { "epoch": 0.9765072176620436, "grad_norm": 0.07284640321402411, "learning_rate": 2.6104733448655446e-09, "logits/chosen": -2.662822723388672, "logits/rejected": -3.233111619949341, "logps/chosen": -457.23602294921875, "logps/rejected": -2949.24560546875, "loss": 0.0102, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.188632011413574, "rewards/margins": 24.92011260986328, "rewards/rejected": -29.10874366760254, "step": 6900 }, { "epoch": 0.9779224455137278, "grad_norm": 0.07607109435390584, "learning_rate": 2.4532159144519577e-09, "logits/chosen": -2.5750246047973633, "logits/rejected": -3.1777100563049316, "logps/chosen": -473.46038818359375, "logps/rejected": -2839.692626953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.353021144866943, "rewards/margins": 23.609880447387695, "rewards/rejected": -27.962902069091797, "step": 6910 }, { "epoch": 0.9793376733654118, "grad_norm": 0.5616488613632639, "learning_rate": 2.295958484038371e-09, "logits/chosen": -2.552330732345581, "logits/rejected": -3.1991357803344727, "logps/chosen": -387.00018310546875, "logps/rejected": -2746.093994140625, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -3.483829975128174, "rewards/margins": 23.556926727294922, "rewards/rejected": -27.040752410888672, "step": 6920 }, { "epoch": 0.9807529012170959, "grad_norm": 0.6935558256087516, "learning_rate": 2.138701053624784e-09, "logits/chosen": -2.5656027793884277, "logits/rejected": -3.263857364654541, "logps/chosen": -543.7437744140625, "logps/rejected": -3120.47802734375, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.044488906860352, "rewards/margins": 25.7255859375, "rewards/rejected": -30.770071029663086, "step": 6930 }, { "epoch": 0.98216812906878, "grad_norm": 0.3191481793189476, "learning_rate": 1.9814436232111963e-09, "logits/chosen": -2.572880268096924, "logits/rejected": -3.2847418785095215, "logps/chosen": -425.7274475097656, "logps/rejected": -2991.2265625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.8721587657928467, "rewards/margins": 25.637470245361328, "rewards/rejected": -29.509632110595703, "step": 6940 }, { "epoch": 0.9835833569204642, "grad_norm": 0.01848391623900847, "learning_rate": 1.8241861927976097e-09, "logits/chosen": -2.6058895587921143, "logits/rejected": -3.209880828857422, "logps/chosen": -431.3121032714844, "logps/rejected": -3292.25244140625, "loss": 0.0331, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9357120990753174, "rewards/margins": 28.519906997680664, "rewards/rejected": -32.45561981201172, "step": 6950 }, { "epoch": 0.9849985847721483, "grad_norm": 0.00482814230599608, "learning_rate": 1.6669287623840226e-09, "logits/chosen": -2.6013059616088867, "logits/rejected": -3.290935516357422, "logps/chosen": -517.8700561523438, "logps/rejected": -2767.95068359375, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.778696537017822, "rewards/margins": 22.503767013549805, "rewards/rejected": -27.282461166381836, "step": 6960 }, { "epoch": 0.9864138126238324, "grad_norm": 0.015070648476139715, "learning_rate": 1.5096713319704356e-09, "logits/chosen": -2.6199653148651123, "logits/rejected": -3.202892303466797, "logps/chosen": -422.21923828125, "logps/rejected": -2929.44677734375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -3.845832347869873, "rewards/margins": 25.03008460998535, "rewards/rejected": -28.87591552734375, "step": 6970 }, { "epoch": 0.9878290404755166, "grad_norm": 23.45646433919941, "learning_rate": 1.3524139015568485e-09, "logits/chosen": -2.6882026195526123, "logits/rejected": -3.296604871749878, "logps/chosen": -515.1275024414062, "logps/rejected": -2917.86181640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.751646518707275, "rewards/margins": 24.01171875, "rewards/rejected": -28.76336669921875, "step": 6980 }, { "epoch": 0.9892442683272007, "grad_norm": 0.28449278768788255, "learning_rate": 1.1951564711432614e-09, "logits/chosen": -2.686305522918701, "logits/rejected": -3.312514066696167, "logps/chosen": -437.9139709472656, "logps/rejected": -2770.609375, "loss": 0.0052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9988903999328613, "rewards/margins": 23.328754425048828, "rewards/rejected": -27.327646255493164, "step": 6990 }, { "epoch": 0.9906594961788848, "grad_norm": 0.9581546135552391, "learning_rate": 1.0378990407296744e-09, "logits/chosen": -2.545407772064209, "logits/rejected": -3.255033016204834, "logps/chosen": -474.46185302734375, "logps/rejected": -3005.776123046875, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -4.347042083740234, "rewards/margins": 25.289426803588867, "rewards/rejected": -29.636465072631836, "step": 7000 }, { "epoch": 0.992074724030569, "grad_norm": 2.255292860309816, "learning_rate": 8.806416103160874e-10, "logits/chosen": -2.583117961883545, "logits/rejected": -3.223315477371216, "logps/chosen": -479.12469482421875, "logps/rejected": -2842.49365234375, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.401499271392822, "rewards/margins": 23.590076446533203, "rewards/rejected": -27.9915771484375, "step": 7010 }, { "epoch": 0.9934899518822531, "grad_norm": 0.988266325751395, "learning_rate": 7.233841799025004e-10, "logits/chosen": -2.6259942054748535, "logits/rejected": -3.271639585494995, "logps/chosen": -449.78759765625, "logps/rejected": -2989.758544921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.084028244018555, "rewards/margins": 25.402605056762695, "rewards/rejected": -29.48663330078125, "step": 7020 }, { "epoch": 0.9949051797339372, "grad_norm": 0.011316144134450097, "learning_rate": 5.661267494889133e-10, "logits/chosen": -2.605973958969116, "logits/rejected": -3.267831325531006, "logps/chosen": -390.6419372558594, "logps/rejected": -2864.28173828125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.5355136394500732, "rewards/margins": 24.666160583496094, "rewards/rejected": -28.201675415039062, "step": 7030 }, { "epoch": 0.9963204075856212, "grad_norm": 0.004679974764731131, "learning_rate": 4.088693190753263e-10, "logits/chosen": -2.629878520965576, "logits/rejected": -3.3015456199645996, "logps/chosen": -532.28271484375, "logps/rejected": -3235.576171875, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.923863887786865, "rewards/margins": 26.976211547851562, "rewards/rejected": -31.900075912475586, "step": 7040 }, { "epoch": 0.9977356354373054, "grad_norm": 0.06554000535859102, "learning_rate": 2.5161188866173924e-10, "logits/chosen": -2.733333110809326, "logits/rejected": -3.479790210723877, "logps/chosen": -472.36376953125, "logps/rejected": -3161.77880859375, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -4.323271751403809, "rewards/margins": 26.833690643310547, "rewards/rejected": -31.156963348388672, "step": 7050 }, { "epoch": 0.9991508632889895, "grad_norm": 7.6011407504093125, "learning_rate": 9.435445824815222e-11, "logits/chosen": -2.554652452468872, "logits/rejected": -3.118103504180908, "logps/chosen": -395.5929870605469, "logps/rejected": -2849.43212890625, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.573125123977661, "rewards/margins": 24.513832092285156, "rewards/rejected": -28.086956024169922, "step": 7060 }, { "epoch": 1.0, "eval_logits/chosen": -2.3996024131774902, "eval_logits/rejected": -3.2167699337005615, "eval_logps/chosen": -208.22796630859375, "eval_logps/rejected": -1466.67333984375, "eval_loss": 0.0005253218114376068, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.7119580507278442, "eval_rewards/margins": 12.642820358276367, "eval_rewards/rejected": -14.354778289794922, "eval_runtime": 5.2151, "eval_samples_per_second": 19.175, "eval_steps_per_second": 0.767, "step": 7066 }, { "epoch": 1.0, "step": 7066, "total_flos": 0.0, "train_loss": 0.10174358356885328, "train_runtime": 28233.3099, "train_samples_per_second": 8.008, "train_steps_per_second": 0.25 } ], "logging_steps": 10, "max_steps": 7066, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }