{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998424948810837, "eval_steps": 100, "global_step": 3174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.322265625, "learning_rate": 3.1446540880503143e-09, "logits/chosen": -1.3876760005950928, "logits/rejected": -1.4584133625030518, "logps/chosen": -148.11717224121094, "logps/rejected": -197.28189086914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.474609375, "learning_rate": 3.1446540880503146e-08, "logits/chosen": -1.2969070672988892, "logits/rejected": -1.0069364309310913, "logps/chosen": -190.5032196044922, "logps/rejected": -182.00355529785156, "loss": 0.6932, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.0009341875556856394, "rewards/margins": 0.0010361968306824565, "rewards/margins_max": 0.0031799401622265577, "rewards/margins_min": -0.0011075465008616447, "rewards/margins_std": 0.003031711094081402, "rewards/rejected": -0.00010200924589298666, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.443359375, "learning_rate": 6.289308176100629e-08, "logits/chosen": -1.3659212589263916, "logits/rejected": -1.052756667137146, "logps/chosen": -225.5138397216797, "logps/rejected": -200.11280822753906, "loss": 0.693, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00012772370246239007, "rewards/margins": -0.00018917841953225434, "rewards/margins_max": 0.0013527333503589034, "rewards/margins_min": -0.0017310904804617167, "rewards/margins_std": 0.0021805930882692337, "rewards/rejected": 6.145476072560996e-05, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.43359375, "learning_rate": 9.433962264150943e-08, "logits/chosen": -1.2631334066390991, "logits/rejected": -0.9830008745193481, "logps/chosen": -180.3957061767578, "logps/rejected": -184.52642822265625, "loss": 0.6929, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0003226413391530514, "rewards/margins": 0.0009109593229368329, "rewards/margins_max": 0.002635856857523322, "rewards/margins_min": -0.0008139380952343345, "rewards/margins_std": 0.0024393731728196144, "rewards/rejected": -0.0005883178091607988, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.33984375, "learning_rate": 1.2578616352201258e-07, "logits/chosen": -1.4588565826416016, "logits/rejected": -1.1574698686599731, "logps/chosen": -225.439697265625, "logps/rejected": -276.75872802734375, "loss": 0.6935, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0006082096369937062, "rewards/margins": 0.00012304118718020618, "rewards/margins_max": 0.0016710966592654586, "rewards/margins_min": -0.001425014459528029, "rewards/margins_std": 0.002189281163737178, "rewards/rejected": 0.00048516839160583913, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.515625, "learning_rate": 1.5723270440251572e-07, "logits/chosen": -1.3671318292617798, "logits/rejected": -0.8632100820541382, "logps/chosen": -331.68609619140625, "logps/rejected": -205.90982055664062, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00018301008094567806, "rewards/margins": 0.0002522182185202837, "rewards/margins_max": 0.002131999935954809, "rewards/margins_min": -0.0016275634989142418, "rewards/margins_std": 0.002658412791788578, "rewards/rejected": -0.0004352282849140465, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.453125, "learning_rate": 1.8867924528301886e-07, "logits/chosen": -1.1950219869613647, "logits/rejected": -1.0118684768676758, "logps/chosen": -203.71957397460938, "logps/rejected": -264.69964599609375, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0003052177489735186, "rewards/margins": 0.00022837640426587313, "rewards/margins_max": 0.0015303840627893806, "rewards/margins_min": -0.0010736312251538038, "rewards/margins_std": 0.001841316930949688, "rewards/rejected": 7.684133015573025e-05, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.4453125, "learning_rate": 2.20125786163522e-07, "logits/chosen": -1.4155724048614502, "logits/rejected": -1.0938544273376465, "logps/chosen": -218.9034423828125, "logps/rejected": -225.016845703125, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": 0.00026987362070940435, "rewards/margins": 0.0008336820756085217, "rewards/margins_max": 0.002929271664470434, "rewards/margins_min": -0.0012619076296687126, "rewards/margins_std": 0.0029636111576110125, "rewards/rejected": -0.0005638084840029478, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.51953125, "learning_rate": 2.5157232704402517e-07, "logits/chosen": -1.2726242542266846, "logits/rejected": -0.9936110377311707, "logps/chosen": -285.13433837890625, "logps/rejected": -266.5086364746094, "loss": 0.6926, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0007069502025842667, "rewards/margins": 0.0009692258317954838, "rewards/margins_max": 0.0028219607193022966, "rewards/margins_min": -0.0008835086482577026, "rewards/margins_std": 0.002620162209495902, "rewards/rejected": -0.00026227571652270854, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.59375, "learning_rate": 2.830188679245283e-07, "logits/chosen": -1.4592866897583008, "logits/rejected": -1.1695111989974976, "logps/chosen": -212.254638671875, "logps/rejected": -219.2605743408203, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.0002520198468118906, "rewards/margins": 0.001166980480775237, "rewards/margins_max": 0.0038443871308118105, "rewards/margins_min": -0.001510425703600049, "rewards/margins_std": 0.0037864241749048233, "rewards/rejected": -0.0009149607503786683, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.337890625, "learning_rate": 3.1446540880503144e-07, "logits/chosen": -1.405822992324829, "logits/rejected": -0.9022544622421265, "logps/chosen": -257.4349670410156, "logps/rejected": -205.3658447265625, "loss": 0.6924, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0013813646510243416, "rewards/margins": 0.0018943824106827378, "rewards/margins_max": 0.0051645501516759396, "rewards/margins_min": -0.0013757857959717512, "rewards/margins_std": 0.004624716006219387, "rewards/rejected": -0.0005130176432430744, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.388671875, "learning_rate": 3.4591194968553456e-07, "logits/chosen": -1.250548005104065, "logits/rejected": -0.9772456884384155, "logps/chosen": -230.58889770507812, "logps/rejected": -190.00180053710938, "loss": 0.6915, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0019372021779417992, "rewards/margins": 0.003521789563819766, "rewards/margins_max": 0.0056365011259913445, "rewards/margins_min": 0.001407077768817544, "rewards/margins_std": 0.002990653971210122, "rewards/rejected": -0.0015845870366320014, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.455078125, "learning_rate": 3.773584905660377e-07, "logits/chosen": -1.4136645793914795, "logits/rejected": -1.0485485792160034, "logps/chosen": -195.27610778808594, "logps/rejected": -186.0947723388672, "loss": 0.6921, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0026215934194624424, "rewards/margins": 0.0021645757369697094, "rewards/margins_max": 0.0044735753908753395, "rewards/margins_min": -0.00014442438259720802, "rewards/margins_std": 0.0032654188107699156, "rewards/rejected": 0.00045701785711571574, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.57421875, "learning_rate": 4.088050314465409e-07, "logits/chosen": -1.1770480871200562, "logits/rejected": -0.9447425603866577, "logps/chosen": -219.49655151367188, "logps/rejected": -248.679443359375, "loss": 0.692, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.00221074465662241, "rewards/margins": 0.0024410211481153965, "rewards/margins_max": 0.005052028689533472, "rewards/margins_min": -0.00016998658247757703, "rewards/margins_std": 0.0036925221793353558, "rewards/rejected": -0.00023027621500659734, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.41796875, "learning_rate": 4.40251572327044e-07, "logits/chosen": -1.1848294734954834, "logits/rejected": -0.9620411992073059, "logps/chosen": -267.6338195800781, "logps/rejected": -216.468505859375, "loss": 0.6916, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0020743615459650755, "rewards/margins": 0.002421380952000618, "rewards/margins_max": 0.004907554481178522, "rewards/margins_min": -6.479259172920138e-05, "rewards/margins_std": 0.0035159799735993147, "rewards/rejected": -0.0003470192023087293, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.3125, "learning_rate": 4.7169811320754717e-07, "logits/chosen": -1.3419865369796753, "logits/rejected": -0.8775084614753723, "logps/chosen": -304.682861328125, "logps/rejected": -234.85049438476562, "loss": 0.6905, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.004905511625111103, "rewards/margins": 0.006082520820200443, "rewards/margins_max": 0.010440012440085411, "rewards/margins_min": 0.0017250289674848318, "rewards/margins_std": 0.00616242503747344, "rewards/rejected": -0.0011770094279199839, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.349609375, "learning_rate": 5.031446540880503e-07, "logits/chosen": -1.362574815750122, "logits/rejected": -1.171775221824646, "logps/chosen": -168.5498504638672, "logps/rejected": -229.1747589111328, "loss": 0.6907, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0031735021620988846, "rewards/margins": 0.004087474662810564, "rewards/margins_max": 0.007105571683496237, "rewards/margins_min": 0.001069377874955535, "rewards/margins_std": 0.00426823366433382, "rewards/rejected": -0.000913972791749984, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.326171875, "learning_rate": 5.345911949685534e-07, "logits/chosen": -1.237182378768921, "logits/rejected": -0.9322627782821655, "logps/chosen": -220.6244354248047, "logps/rejected": -198.8372802734375, "loss": 0.6904, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0044016363099217415, "rewards/margins": 0.006395402364432812, "rewards/margins_max": 0.008824339136481285, "rewards/margins_min": 0.003966464661061764, "rewards/margins_std": 0.0034350368659943342, "rewards/rejected": -0.001993766753003001, "step": 170 }, { "epoch": 0.06, "grad_norm": 0.5625, "learning_rate": 5.660377358490566e-07, "logits/chosen": -1.3780791759490967, "logits/rejected": -1.0467432737350464, "logps/chosen": -213.366455078125, "logps/rejected": -216.13211059570312, "loss": 0.6898, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.005849289242178202, "rewards/margins": 0.007411675062030554, "rewards/margins_max": 0.012342330068349838, "rewards/margins_min": 0.0024810179602354765, "rewards/margins_std": 0.006973001174628735, "rewards/rejected": -0.0015623854706063867, "step": 180 }, { "epoch": 0.06, "grad_norm": 0.43359375, "learning_rate": 5.974842767295597e-07, "logits/chosen": -1.2821629047393799, "logits/rejected": -1.0755701065063477, "logps/chosen": -196.32594299316406, "logps/rejected": -216.0081787109375, "loss": 0.6899, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.005713514052331448, "rewards/margins": 0.0055726682767271996, "rewards/margins_max": 0.009180756285786629, "rewards/margins_min": 0.001964580500498414, "rewards/margins_std": 0.005102606490254402, "rewards/rejected": 0.00014084615395404398, "step": 190 }, { "epoch": 0.06, "grad_norm": 0.43359375, "learning_rate": 6.289308176100629e-07, "logits/chosen": -1.3140381574630737, "logits/rejected": -1.105791687965393, "logps/chosen": -218.8059539794922, "logps/rejected": -207.1342010498047, "loss": 0.6897, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.003872637404128909, "rewards/margins": 0.0057051111944019794, "rewards/margins_max": 0.009732077829539776, "rewards/margins_min": 0.0016781443264335394, "rewards/margins_std": 0.005694991443306208, "rewards/rejected": -0.0018324736738577485, "step": 200 }, { "epoch": 0.07, "grad_norm": 0.38671875, "learning_rate": 6.60377358490566e-07, "logits/chosen": -1.430407166481018, "logits/rejected": -1.0918631553649902, "logps/chosen": -237.2591552734375, "logps/rejected": -253.4399871826172, "loss": 0.6891, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0073573291301727295, "rewards/margins": 0.007854488678276539, "rewards/margins_max": 0.012713620439171791, "rewards/margins_min": 0.0029953576158732176, "rewards/margins_std": 0.006871848367154598, "rewards/rejected": -0.0004971598973497748, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.333984375, "learning_rate": 6.918238993710691e-07, "logits/chosen": -1.3923743963241577, "logits/rejected": -1.1259468793869019, "logps/chosen": -275.0027770996094, "logps/rejected": -198.38681030273438, "loss": 0.6889, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.007775217294692993, "rewards/margins": 0.009523289278149605, "rewards/margins_max": 0.01376691646873951, "rewards/margins_min": 0.005279661156237125, "rewards/margins_std": 0.0060013956390321255, "rewards/rejected": -0.0017480704700574279, "step": 220 }, { "epoch": 0.07, "grad_norm": 0.4765625, "learning_rate": 7.232704402515722e-07, "logits/chosen": -1.4120699167251587, "logits/rejected": -1.116763710975647, "logps/chosen": -253.22354125976562, "logps/rejected": -201.98855590820312, "loss": 0.6879, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.00820042286068201, "rewards/margins": 0.010577328503131866, "rewards/margins_max": 0.01556847058236599, "rewards/margins_min": 0.005586187355220318, "rewards/margins_std": 0.00705854082480073, "rewards/rejected": -0.0023769072722643614, "step": 230 }, { "epoch": 0.08, "grad_norm": 0.484375, "learning_rate": 7.547169811320754e-07, "logits/chosen": -1.3950035572052002, "logits/rejected": -1.2659811973571777, "logps/chosen": -176.2644805908203, "logps/rejected": -260.3876953125, "loss": 0.6873, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.00841821264475584, "rewards/margins": 0.011863857507705688, "rewards/margins_max": 0.01701103337109089, "rewards/margins_min": 0.006716682109981775, "rewards/margins_std": 0.007279204670339823, "rewards/rejected": -0.0034456446301192045, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.392578125, "learning_rate": 7.861635220125787e-07, "logits/chosen": -1.4266269207000732, "logits/rejected": -1.1792528629302979, "logps/chosen": -264.8574523925781, "logps/rejected": -217.40841674804688, "loss": 0.6877, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.009459974244236946, "rewards/margins": 0.011690459214150906, "rewards/margins_max": 0.01826130598783493, "rewards/margins_min": 0.005119613837450743, "rewards/margins_std": 0.00929258018732071, "rewards/rejected": -0.002230485901236534, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.40625, "learning_rate": 8.176100628930818e-07, "logits/chosen": -1.2674744129180908, "logits/rejected": -0.7844586968421936, "logps/chosen": -282.67913818359375, "logps/rejected": -258.6161193847656, "loss": 0.6864, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.011977099813520908, "rewards/margins": 0.013891148380935192, "rewards/margins_max": 0.020919669419527054, "rewards/margins_min": 0.00686262920498848, "rewards/margins_std": 0.009939828887581825, "rewards/rejected": -0.0019140491494908929, "step": 260 }, { "epoch": 0.09, "grad_norm": 0.419921875, "learning_rate": 8.490566037735849e-07, "logits/chosen": -1.3822617530822754, "logits/rejected": -0.8383499383926392, "logps/chosen": -259.55377197265625, "logps/rejected": -242.09774780273438, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.013840844854712486, "rewards/margins": 0.020429277792572975, "rewards/margins_max": 0.029015129432082176, "rewards/margins_min": 0.011843429878354073, "rewards/margins_std": 0.012142224237322807, "rewards/rejected": -0.0065884338691830635, "step": 270 }, { "epoch": 0.09, "grad_norm": 0.37890625, "learning_rate": 8.80503144654088e-07, "logits/chosen": -1.5422178506851196, "logits/rejected": -1.203775405883789, "logps/chosen": -194.75625610351562, "logps/rejected": -190.38040161132812, "loss": 0.6865, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01227253396064043, "rewards/margins": 0.013298863545060158, "rewards/margins_max": 0.018432527780532837, "rewards/margins_min": 0.008165198378264904, "rewards/margins_std": 0.007260097656399012, "rewards/rejected": -0.0010263302829116583, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.30859375, "learning_rate": 9.119496855345912e-07, "logits/chosen": -1.457471489906311, "logits/rejected": -1.0255846977233887, "logps/chosen": -241.8590087890625, "logps/rejected": -208.22671508789062, "loss": 0.6839, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014999980106949806, "rewards/margins": 0.020009437575936317, "rewards/margins_max": 0.029007185250520706, "rewards/margins_min": 0.011011689901351929, "rewards/margins_std": 0.012724736705422401, "rewards/rejected": -0.005009456072002649, "step": 290 }, { "epoch": 0.09, "grad_norm": 0.369140625, "learning_rate": 9.433962264150943e-07, "logits/chosen": -1.6220054626464844, "logits/rejected": -1.1889787912368774, "logps/chosen": -226.6202850341797, "logps/rejected": -211.44381713867188, "loss": 0.6829, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.019335268065333366, "rewards/margins": 0.021433234214782715, "rewards/margins_max": 0.03181144595146179, "rewards/margins_min": 0.011055031791329384, "rewards/margins_std": 0.014677000232040882, "rewards/rejected": -0.0020979673136025667, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.61328125, "learning_rate": 9.748427672955975e-07, "logits/chosen": -1.3852521181106567, "logits/rejected": -1.1346018314361572, "logps/chosen": -199.05203247070312, "logps/rejected": -216.6929473876953, "loss": 0.6839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.017914317548274994, "rewards/margins": 0.018279287964105606, "rewards/margins_max": 0.02675766311585903, "rewards/margins_min": 0.00980091467499733, "rewards/margins_std": 0.011990231461822987, "rewards/rejected": -0.0003649710270110518, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.400390625, "learning_rate": 9.99998790006147e-07, "logits/chosen": -1.3931553363800049, "logits/rejected": -1.0446799993515015, "logps/chosen": -249.444580078125, "logps/rejected": -237.3548583984375, "loss": 0.6836, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.015047195367515087, "rewards/margins": 0.01817159913480282, "rewards/margins_max": 0.028817584738135338, "rewards/margins_min": 0.007525615394115448, "rewards/margins_std": 0.015055695548653603, "rewards/rejected": -0.0031244028359651566, "step": 320 }, { "epoch": 0.1, "grad_norm": 0.4140625, "learning_rate": 9.999564408362052e-07, "logits/chosen": -1.484261155128479, "logits/rejected": -0.9981343150138855, "logps/chosen": -238.7038116455078, "logps/rejected": -257.0067443847656, "loss": 0.6801, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024171117693185806, "rewards/margins": 0.03251297399401665, "rewards/margins_max": 0.04922250285744667, "rewards/margins_min": 0.015803448855876923, "rewards/margins_std": 0.023630838841199875, "rewards/rejected": -0.008341856300830841, "step": 330 }, { "epoch": 0.11, "grad_norm": 0.4453125, "learning_rate": 9.998535978298279e-07, "logits/chosen": -1.316965103149414, "logits/rejected": -0.97691810131073, "logps/chosen": -180.54393005371094, "logps/rejected": -181.31243896484375, "loss": 0.6808, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.019655724987387657, "rewards/margins": 0.022490406408905983, "rewards/margins_max": 0.028128573670983315, "rewards/margins_min": 0.01685223914682865, "rewards/margins_std": 0.007973574101924896, "rewards/rejected": -0.0028346790932118893, "step": 340 }, { "epoch": 0.11, "grad_norm": 0.48828125, "learning_rate": 9.996902734308345e-07, "logits/chosen": -1.3453712463378906, "logits/rejected": -0.9177592992782593, "logps/chosen": -262.0094299316406, "logps/rejected": -242.67770385742188, "loss": 0.6785, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.021431343629956245, "rewards/margins": 0.030059820041060448, "rewards/margins_max": 0.04707712680101395, "rewards/margins_min": 0.01304252166301012, "rewards/margins_std": 0.024066099897027016, "rewards/rejected": -0.0086284838616848, "step": 350 }, { "epoch": 0.11, "grad_norm": 0.4921875, "learning_rate": 9.994664874011861e-07, "logits/chosen": -1.6302440166473389, "logits/rejected": -1.2027785778045654, "logps/chosen": -275.63800048828125, "logps/rejected": -223.16134643554688, "loss": 0.6786, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023763367906212807, "rewards/margins": 0.034584060311317444, "rewards/margins_max": 0.05510062724351883, "rewards/margins_min": 0.014067496173083782, "rewards/margins_std": 0.02901480160653591, "rewards/rejected": -0.010820695199072361, "step": 360 }, { "epoch": 0.12, "grad_norm": 0.3984375, "learning_rate": 9.991822668185925e-07, "logits/chosen": -1.4096801280975342, "logits/rejected": -1.0252230167388916, "logps/chosen": -229.8339080810547, "logps/rejected": -172.35845947265625, "loss": 0.6745, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026459768414497375, "rewards/margins": 0.039466358721256256, "rewards/margins_max": 0.05379994958639145, "rewards/margins_min": 0.025132764130830765, "rewards/margins_std": 0.020270761102437973, "rewards/rejected": -0.013006587512791157, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.359375, "learning_rate": 9.988376460732366e-07, "logits/chosen": -1.3159959316253662, "logits/rejected": -1.0778305530548096, "logps/chosen": -227.1737518310547, "logps/rejected": -181.856201171875, "loss": 0.6776, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02110281027853489, "rewards/margins": 0.03095998801290989, "rewards/margins_max": 0.04912665858864784, "rewards/margins_min": 0.01279332023113966, "rewards/margins_std": 0.025691548362374306, "rewards/rejected": -0.009857181459665298, "step": 380 }, { "epoch": 0.12, "grad_norm": 0.38671875, "learning_rate": 9.98432666863613e-07, "logits/chosen": -1.3005465269088745, "logits/rejected": -0.8841564059257507, "logps/chosen": -265.8480529785156, "logps/rejected": -214.14306640625, "loss": 0.6753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023480424657464027, "rewards/margins": 0.03407277166843414, "rewards/margins_max": 0.05221151188015938, "rewards/margins_min": 0.015934035181999207, "rewards/margins_std": 0.025652050971984863, "rewards/rejected": -0.01059234980493784, "step": 390 }, { "epoch": 0.13, "grad_norm": 0.390625, "learning_rate": 9.979673781914829e-07, "logits/chosen": -1.3220188617706299, "logits/rejected": -0.9904147982597351, "logps/chosen": -208.8026580810547, "logps/rejected": -199.86724853515625, "loss": 0.6786, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023009879514575005, "rewards/margins": 0.027196567505598068, "rewards/margins_max": 0.04160440340638161, "rewards/margins_min": 0.012788738124072552, "rewards/margins_std": 0.020375750958919525, "rewards/rejected": -0.004186691250652075, "step": 400 }, { "epoch": 0.13, "grad_norm": 0.55078125, "learning_rate": 9.974418363559443e-07, "logits/chosen": -1.4779157638549805, "logits/rejected": -1.2148815393447876, "logps/chosen": -191.52413940429688, "logps/rejected": -176.9311981201172, "loss": 0.6768, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.024117572233080864, "rewards/margins": 0.029250899329781532, "rewards/margins_max": 0.04297717660665512, "rewards/margins_min": 0.015524620190262794, "rewards/margins_std": 0.019411887973546982, "rewards/rejected": -0.005133326631039381, "step": 410 }, { "epoch": 0.13, "grad_norm": 0.39453125, "learning_rate": 9.968561049466213e-07, "logits/chosen": -1.3753879070281982, "logits/rejected": -0.9945527911186218, "logps/chosen": -254.3845672607422, "logps/rejected": -244.81082153320312, "loss": 0.6703, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04066943749785423, "rewards/margins": 0.043636471033096313, "rewards/margins_max": 0.06465307623147964, "rewards/margins_min": 0.02261987514793873, "rewards/margins_std": 0.02972196415066719, "rewards/rejected": -0.0029670395888388157, "step": 420 }, { "epoch": 0.14, "grad_norm": 0.435546875, "learning_rate": 9.96210254835968e-07, "logits/chosen": -1.3385958671569824, "logits/rejected": -1.0368238687515259, "logps/chosen": -257.49481201171875, "logps/rejected": -251.6103515625, "loss": 0.6703, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.038941044360399246, "rewards/margins": 0.0429171659052372, "rewards/margins_max": 0.07056744396686554, "rewards/margins_min": 0.015266889706254005, "rewards/margins_std": 0.039103396236896515, "rewards/rejected": -0.003976122476160526, "step": 430 }, { "epoch": 0.14, "grad_norm": 0.466796875, "learning_rate": 9.95504364170694e-07, "logits/chosen": -1.523626685142517, "logits/rejected": -0.9943062663078308, "logps/chosen": -205.248291015625, "logps/rejected": -203.47445678710938, "loss": 0.6712, "rewards/accuracies": 0.875, "rewards/chosen": 0.04165233299136162, "rewards/margins": 0.04524458199739456, "rewards/margins_max": 0.07039403915405273, "rewards/margins_min": 0.020095128566026688, "rewards/margins_std": 0.03556669503450394, "rewards/rejected": -0.0035922485403716564, "step": 440 }, { "epoch": 0.14, "grad_norm": 0.5, "learning_rate": 9.947385183623096e-07, "logits/chosen": -1.3122961521148682, "logits/rejected": -1.1558181047439575, "logps/chosen": -210.66714477539062, "logps/rejected": -231.5789031982422, "loss": 0.6726, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03287133574485779, "rewards/margins": 0.042133878916502, "rewards/margins_max": 0.06311113387346268, "rewards/margins_min": 0.02115662209689617, "rewards/margins_std": 0.029666315764188766, "rewards/rejected": -0.009262535721063614, "step": 450 }, { "epoch": 0.14, "grad_norm": 0.36328125, "learning_rate": 9.9391281007679e-07, "logits/chosen": -1.267585039138794, "logits/rejected": -0.9590380787849426, "logps/chosen": -170.42410278320312, "logps/rejected": -212.1375274658203, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 0.03331318497657776, "rewards/margins": 0.0470331646502018, "rewards/margins_max": 0.0685221329331398, "rewards/margins_min": 0.0255441851913929, "rewards/margins_std": 0.030389999970793724, "rewards/rejected": -0.013719978742301464, "step": 460 }, { "epoch": 0.15, "grad_norm": 0.4140625, "learning_rate": 9.930273392233624e-07, "logits/chosen": -1.3601120710372925, "logits/rejected": -1.0370827913284302, "logps/chosen": -211.78121948242188, "logps/rejected": -261.12493896484375, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 0.04147142544388771, "rewards/margins": 0.05635608360171318, "rewards/margins_max": 0.07943321764469147, "rewards/margins_min": 0.03327895328402519, "rewards/margins_std": 0.032635994255542755, "rewards/rejected": -0.01488465815782547, "step": 470 }, { "epoch": 0.15, "grad_norm": 0.53125, "learning_rate": 9.920822129424189e-07, "logits/chosen": -1.3620996475219727, "logits/rejected": -1.0208871364593506, "logps/chosen": -181.54205322265625, "logps/rejected": -218.82913208007812, "loss": 0.6703, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03728248178958893, "rewards/margins": 0.047586847096681595, "rewards/margins_max": 0.06849299371242523, "rewards/margins_min": 0.026680713519454002, "rewards/margins_std": 0.02956574596464634, "rewards/rejected": -0.010304367169737816, "step": 480 }, { "epoch": 0.15, "grad_norm": 0.4375, "learning_rate": 9.910775455925517e-07, "logits/chosen": -1.4793182611465454, "logits/rejected": -1.2021210193634033, "logps/chosen": -174.04185485839844, "logps/rejected": -171.1067352294922, "loss": 0.6724, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02975636161863804, "rewards/margins": 0.03999444097280502, "rewards/margins_max": 0.05971541255712509, "rewards/margins_min": 0.020273465663194656, "rewards/margins_std": 0.02788967452943325, "rewards/rejected": -0.01023807656019926, "step": 490 }, { "epoch": 0.16, "grad_norm": 0.39453125, "learning_rate": 9.90013458736716e-07, "logits/chosen": -1.6029140949249268, "logits/rejected": -1.194563865661621, "logps/chosen": -213.6199493408203, "logps/rejected": -206.2983856201172, "loss": 0.6673, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0341024175286293, "rewards/margins": 0.05521542578935623, "rewards/margins_max": 0.07687483727931976, "rewards/margins_min": 0.0335560217499733, "rewards/margins_std": 0.03063102997839451, "rewards/rejected": -0.02111300453543663, "step": 500 }, { "epoch": 0.16, "grad_norm": 0.373046875, "learning_rate": 9.888900811275203e-07, "logits/chosen": -1.3525406122207642, "logits/rejected": -1.0656477212905884, "logps/chosen": -206.7255401611328, "logps/rejected": -196.1541290283203, "loss": 0.6665, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.030226921662688255, "rewards/margins": 0.05328403785824776, "rewards/margins_max": 0.06973598897457123, "rewards/margins_min": 0.036832086741924286, "rewards/margins_std": 0.02326657809317112, "rewards/rejected": -0.0230571199208498, "step": 510 }, { "epoch": 0.16, "grad_norm": 0.36328125, "learning_rate": 9.877075486916496e-07, "logits/chosen": -1.3600490093231201, "logits/rejected": -1.037398099899292, "logps/chosen": -177.47750854492188, "logps/rejected": -182.17776489257812, "loss": 0.6732, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03540753945708275, "rewards/margins": 0.04670108109712601, "rewards/margins_max": 0.07210978865623474, "rewards/margins_min": 0.02129237726330757, "rewards/margins_std": 0.03593333810567856, "rewards/rejected": -0.011293541640043259, "step": 520 }, { "epoch": 0.17, "grad_norm": 0.4765625, "learning_rate": 9.864660045134162e-07, "logits/chosen": -1.3215343952178955, "logits/rejected": -1.0994096994400024, "logps/chosen": -210.7837371826172, "logps/rejected": -205.7008056640625, "loss": 0.6662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03428806737065315, "rewards/margins": 0.06019355729222298, "rewards/margins_max": 0.08791188150644302, "rewards/margins_min": 0.03247522935271263, "rewards/margins_std": 0.03919963538646698, "rewards/rejected": -0.025905489921569824, "step": 530 }, { "epoch": 0.17, "grad_norm": 0.45703125, "learning_rate": 9.851655988174489e-07, "logits/chosen": -1.47915780544281, "logits/rejected": -0.9851093292236328, "logps/chosen": -194.7159881591797, "logps/rejected": -213.84048461914062, "loss": 0.6662, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.041106559336185455, "rewards/margins": 0.054338376969099045, "rewards/margins_max": 0.0830872431397438, "rewards/margins_min": 0.025589507073163986, "rewards/margins_std": 0.04065703600645065, "rewards/rejected": -0.013231811113655567, "step": 540 }, { "epoch": 0.17, "grad_norm": 0.421875, "learning_rate": 9.83806488950514e-07, "logits/chosen": -1.4582946300506592, "logits/rejected": -1.2372512817382812, "logps/chosen": -231.83242797851562, "logps/rejected": -204.83743286132812, "loss": 0.6648, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03986522555351257, "rewards/margins": 0.05911695212125778, "rewards/margins_max": 0.09305725246667862, "rewards/margins_min": 0.02517666481435299, "rewards/margins_std": 0.0479988157749176, "rewards/rejected": -0.019251730293035507, "step": 550 }, { "epoch": 0.18, "grad_norm": 0.34765625, "learning_rate": 9.82388839362478e-07, "logits/chosen": -1.4161055088043213, "logits/rejected": -1.1000274419784546, "logps/chosen": -198.75758361816406, "logps/rejected": -205.18826293945312, "loss": 0.6659, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04866677150130272, "rewards/margins": 0.0624736063182354, "rewards/margins_max": 0.09315863996744156, "rewards/margins_min": 0.03178856521844864, "rewards/margins_std": 0.04339519888162613, "rewards/rejected": -0.013806832954287529, "step": 560 }, { "epoch": 0.18, "grad_norm": 0.462890625, "learning_rate": 9.809128215864096e-07, "logits/chosen": -1.2735754251480103, "logits/rejected": -0.9051140546798706, "logps/chosen": -270.4387512207031, "logps/rejected": -241.47079467773438, "loss": 0.6665, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03088981844484806, "rewards/margins": 0.05677186697721481, "rewards/margins_max": 0.08117306232452393, "rewards/margins_min": 0.0323706679046154, "rewards/margins_std": 0.03450850397348404, "rewards/rejected": -0.025882050395011902, "step": 570 }, { "epoch": 0.18, "grad_norm": 0.4453125, "learning_rate": 9.79378614217823e-07, "logits/chosen": -1.354640245437622, "logits/rejected": -1.0751674175262451, "logps/chosen": -227.79849243164062, "logps/rejected": -248.08642578125, "loss": 0.6527, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04743616655468941, "rewards/margins": 0.0788937360048294, "rewards/margins_max": 0.11354710906744003, "rewards/margins_min": 0.04424036294221878, "rewards/margins_std": 0.04900727421045303, "rewards/rejected": -0.0314575657248497, "step": 580 }, { "epoch": 0.19, "grad_norm": 0.4140625, "learning_rate": 9.777864028930705e-07, "logits/chosen": -1.322510004043579, "logits/rejected": -1.0033257007598877, "logps/chosen": -206.4609375, "logps/rejected": -225.65158081054688, "loss": 0.6621, "rewards/accuracies": 1.0, "rewards/chosen": 0.03883618488907814, "rewards/margins": 0.06851398199796677, "rewards/margins_max": 0.08924350887537003, "rewards/margins_min": 0.04778445512056351, "rewards/margins_std": 0.029315978288650513, "rewards/rejected": -0.029677793383598328, "step": 590 }, { "epoch": 0.19, "grad_norm": 0.515625, "learning_rate": 9.76136380266878e-07, "logits/chosen": -1.431841492652893, "logits/rejected": -1.2090071439743042, "logps/chosen": -291.49847412109375, "logps/rejected": -261.60888671875, "loss": 0.6575, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.043944261968135834, "rewards/margins": 0.07684491574764252, "rewards/margins_max": 0.10341174900531769, "rewards/margins_min": 0.05027808994054794, "rewards/margins_std": 0.03757117688655853, "rewards/rejected": -0.03290066123008728, "step": 600 }, { "epoch": 0.19, "grad_norm": 0.423828125, "learning_rate": 9.744287459890369e-07, "logits/chosen": -1.2307569980621338, "logits/rejected": -0.8211034536361694, "logps/chosen": -216.7576141357422, "logps/rejected": -194.070556640625, "loss": 0.6609, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.039227746427059174, "rewards/margins": 0.07264076173305511, "rewards/margins_max": 0.10165262222290039, "rewards/margins_min": 0.043628908693790436, "rewards/margins_std": 0.04102896526455879, "rewards/rejected": -0.03341301903128624, "step": 610 }, { "epoch": 0.2, "grad_norm": 0.5, "learning_rate": 9.726637066802446e-07, "logits/chosen": -1.3241937160491943, "logits/rejected": -0.8831518292427063, "logps/chosen": -244.5774383544922, "logps/rejected": -273.13427734375, "loss": 0.6549, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04124955087900162, "rewards/margins": 0.07716774940490723, "rewards/margins_max": 0.12114681303501129, "rewards/margins_min": 0.03318869322538376, "rewards/margins_std": 0.062195777893066406, "rewards/rejected": -0.03591819852590561, "step": 620 }, { "epoch": 0.2, "grad_norm": 0.32421875, "learning_rate": 9.708414759071057e-07, "logits/chosen": -1.4460439682006836, "logits/rejected": -1.0423699617385864, "logps/chosen": -252.99484252929688, "logps/rejected": -239.76608276367188, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": 0.03034689649939537, "rewards/margins": 0.06652723252773285, "rewards/margins_max": 0.0981488898396492, "rewards/margins_min": 0.03490559384226799, "rewards/margins_std": 0.044719766825437546, "rewards/rejected": -0.03618033975362778, "step": 630 }, { "epoch": 0.2, "grad_norm": 0.4140625, "learning_rate": 9.689622741562891e-07, "logits/chosen": -1.4432752132415771, "logits/rejected": -1.0086311101913452, "logps/chosen": -242.03866577148438, "logps/rejected": -226.80911254882812, "loss": 0.6593, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05991531163454056, "rewards/margins": 0.08118681609630585, "rewards/margins_max": 0.11632559448480606, "rewards/margins_min": 0.046048033982515335, "rewards/margins_std": 0.049693748354911804, "rewards/rejected": -0.021271510049700737, "step": 640 }, { "epoch": 0.2, "grad_norm": 0.455078125, "learning_rate": 9.670263288078503e-07, "logits/chosen": -1.4905498027801514, "logits/rejected": -0.9651784896850586, "logps/chosen": -337.162353515625, "logps/rejected": -227.63101196289062, "loss": 0.6499, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04678003862500191, "rewards/margins": 0.09588982164859772, "rewards/margins_max": 0.12638990581035614, "rewards/margins_min": 0.0653897300362587, "rewards/margins_std": 0.043133631348609924, "rewards/rejected": -0.04910977929830551, "step": 650 }, { "epoch": 0.21, "grad_norm": 0.337890625, "learning_rate": 9.650338741077188e-07, "logits/chosen": -1.2945762872695923, "logits/rejected": -1.0813788175582886, "logps/chosen": -229.5716552734375, "logps/rejected": -247.0192108154297, "loss": 0.6584, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.04455048590898514, "rewards/margins": 0.06444776803255081, "rewards/margins_max": 0.10383981466293335, "rewards/margins_min": 0.025055717676877975, "rewards/margins_std": 0.05570877343416214, "rewards/rejected": -0.019897282123565674, "step": 660 }, { "epoch": 0.21, "grad_norm": 0.484375, "learning_rate": 9.629851511393555e-07, "logits/chosen": -1.4708611965179443, "logits/rejected": -0.9937572479248047, "logps/chosen": -286.068603515625, "logps/rejected": -254.18637084960938, "loss": 0.6544, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04879416525363922, "rewards/margins": 0.07758750021457672, "rewards/margins_max": 0.10908614099025726, "rewards/margins_min": 0.04608884081244469, "rewards/margins_std": 0.044545818120241165, "rewards/rejected": -0.0287933312356472, "step": 670 }, { "epoch": 0.21, "grad_norm": 0.5625, "learning_rate": 9.608804077945797e-07, "logits/chosen": -1.4359239339828491, "logits/rejected": -1.0322556495666504, "logps/chosen": -269.5269470214844, "logps/rejected": -274.97137451171875, "loss": 0.6522, "rewards/accuracies": 0.875, "rewards/chosen": 0.051246218383312225, "rewards/margins": 0.07667367160320282, "rewards/margins_max": 0.10428421199321747, "rewards/margins_min": 0.049063123762607574, "rewards/margins_std": 0.03904721140861511, "rewards/rejected": -0.025427449494600296, "step": 680 }, { "epoch": 0.22, "grad_norm": 0.296875, "learning_rate": 9.58719898743578e-07, "logits/chosen": -1.348769187927246, "logits/rejected": -0.9472673535346985, "logps/chosen": -255.1873016357422, "logps/rejected": -228.0542755126953, "loss": 0.6554, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04409017041325569, "rewards/margins": 0.08384691178798676, "rewards/margins_max": 0.11428749561309814, "rewards/margins_min": 0.05340634658932686, "rewards/margins_std": 0.04304947704076767, "rewards/rejected": -0.03975675255060196, "step": 690 }, { "epoch": 0.22, "grad_norm": 0.5546875, "learning_rate": 9.565038854040865e-07, "logits/chosen": -1.3250253200531006, "logits/rejected": -0.9937537908554077, "logps/chosen": -203.81344604492188, "logps/rejected": -221.4913787841797, "loss": 0.6536, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.028742622584104538, "rewards/margins": 0.07320950925350189, "rewards/margins_max": 0.11118922382593155, "rewards/margins_min": 0.03522980213165283, "rewards/margins_std": 0.0537114143371582, "rewards/rejected": -0.04446689039468765, "step": 700 }, { "epoch": 0.22, "grad_norm": 0.341796875, "learning_rate": 9.542326359097617e-07, "logits/chosen": -1.405347228050232, "logits/rejected": -0.9768081903457642, "logps/chosen": -331.59649658203125, "logps/rejected": -235.75888061523438, "loss": 0.6429, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.046662431210279465, "rewards/margins": 0.10430089384317398, "rewards/margins_max": 0.14537875354290009, "rewards/margins_min": 0.06322301924228668, "rewards/margins_std": 0.058092884719371796, "rewards/rejected": -0.05763845518231392, "step": 710 }, { "epoch": 0.23, "grad_norm": 0.443359375, "learning_rate": 9.51906425077736e-07, "logits/chosen": -1.232671856880188, "logits/rejected": -0.8695996999740601, "logps/chosen": -260.71917724609375, "logps/rejected": -237.81472778320312, "loss": 0.6456, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06164161115884781, "rewards/margins": 0.11577005684375763, "rewards/margins_max": 0.18085698783397675, "rewards/margins_min": 0.05068312957882881, "rewards/margins_std": 0.092046819627285, "rewards/rejected": -0.054128456860780716, "step": 720 }, { "epoch": 0.23, "grad_norm": 0.42578125, "learning_rate": 9.495255343753657e-07, "logits/chosen": -1.4524829387664795, "logits/rejected": -1.0947620868682861, "logps/chosen": -204.1517333984375, "logps/rejected": -222.0264129638672, "loss": 0.6525, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04527204856276512, "rewards/margins": 0.09181281924247742, "rewards/margins_max": 0.13302180171012878, "rewards/margins_min": 0.050603847950696945, "rewards/margins_std": 0.058278292417526245, "rewards/rejected": -0.046540774405002594, "step": 730 }, { "epoch": 0.23, "grad_norm": 0.455078125, "learning_rate": 9.470902518861731e-07, "logits/chosen": -1.3769677877426147, "logits/rejected": -1.1935275793075562, "logps/chosen": -191.8888397216797, "logps/rejected": -201.0779571533203, "loss": 0.6528, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03776038438081741, "rewards/margins": 0.08057109266519547, "rewards/margins_max": 0.12366624921560287, "rewards/margins_min": 0.03747590631246567, "rewards/margins_std": 0.0609457865357399, "rewards/rejected": -0.042810700833797455, "step": 740 }, { "epoch": 0.24, "grad_norm": 0.455078125, "learning_rate": 9.446008722749905e-07, "logits/chosen": -1.3329544067382812, "logits/rejected": -1.0925482511520386, "logps/chosen": -228.0487060546875, "logps/rejected": -202.3518524169922, "loss": 0.6537, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03692751005291939, "rewards/margins": 0.08129950612783432, "rewards/margins_max": 0.12913137674331665, "rewards/margins_min": 0.03346762806177139, "rewards/margins_std": 0.06764448434114456, "rewards/rejected": -0.04437199607491493, "step": 750 }, { "epoch": 0.24, "grad_norm": 0.392578125, "learning_rate": 9.420576967523048e-07, "logits/chosen": -1.1731605529785156, "logits/rejected": -0.8517470359802246, "logps/chosen": -237.1516571044922, "logps/rejected": -215.74270629882812, "loss": 0.6573, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.034002404659986496, "rewards/margins": 0.08593714237213135, "rewards/margins_max": 0.12129978835582733, "rewards/margins_min": 0.05057450011372566, "rewards/margins_std": 0.05001033470034599, "rewards/rejected": -0.05193474888801575, "step": 760 }, { "epoch": 0.24, "grad_norm": 0.423828125, "learning_rate": 9.394610330378124e-07, "logits/chosen": -1.4420990943908691, "logits/rejected": -1.063472032546997, "logps/chosen": -228.5890655517578, "logps/rejected": -231.9294891357422, "loss": 0.6502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.049244366586208344, "rewards/margins": 0.08763855695724487, "rewards/margins_max": 0.13933846354484558, "rewards/margins_min": 0.03593864664435387, "rewards/margins_std": 0.07311470806598663, "rewards/rejected": -0.03839418664574623, "step": 770 }, { "epoch": 0.25, "grad_norm": 0.45703125, "learning_rate": 9.368111953231847e-07, "logits/chosen": -1.4083888530731201, "logits/rejected": -0.9682759046554565, "logps/chosen": -223.3924560546875, "logps/rejected": -241.82778930664062, "loss": 0.6447, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06056595593690872, "rewards/margins": 0.12897726893424988, "rewards/margins_max": 0.17253781855106354, "rewards/margins_min": 0.08541673421859741, "rewards/margins_std": 0.06160390377044678, "rewards/rejected": -0.06841133534908295, "step": 780 }, { "epoch": 0.25, "grad_norm": 0.56640625, "learning_rate": 9.341085042340531e-07, "logits/chosen": -1.4172804355621338, "logits/rejected": -1.1236498355865479, "logps/chosen": -171.3277130126953, "logps/rejected": -193.0868682861328, "loss": 0.6512, "rewards/accuracies": 1.0, "rewards/chosen": 0.03927486762404442, "rewards/margins": 0.08602278679609299, "rewards/margins_max": 0.12318630516529083, "rewards/margins_min": 0.04885926470160484, "rewards/margins_std": 0.05255715921521187, "rewards/rejected": -0.04674791917204857, "step": 790 }, { "epoch": 0.25, "grad_norm": 0.451171875, "learning_rate": 9.313532867912124e-07, "logits/chosen": -1.413651466369629, "logits/rejected": -0.906880259513855, "logps/chosen": -246.77468872070312, "logps/rejected": -223.56912231445312, "loss": 0.6408, "rewards/accuracies": 1.0, "rewards/chosen": 0.055256299674510956, "rewards/margins": 0.12263475358486176, "rewards/margins_max": 0.17601759731769562, "rewards/margins_min": 0.06925191730260849, "rewards/margins_std": 0.07549472898244858, "rewards/rejected": -0.0673784539103508, "step": 800 }, { "epoch": 0.26, "grad_norm": 0.36328125, "learning_rate": 9.285458763710523e-07, "logits/chosen": -1.436037302017212, "logits/rejected": -1.0014102458953857, "logps/chosen": -231.8644561767578, "logps/rejected": -184.65553283691406, "loss": 0.6483, "rewards/accuracies": 1.0, "rewards/chosen": 0.06529637426137924, "rewards/margins": 0.0870235487818718, "rewards/margins_max": 0.12472760677337646, "rewards/margins_min": 0.04931949824094772, "rewards/margins_std": 0.05332158878445625, "rewards/rejected": -0.021727172657847404, "step": 810 }, { "epoch": 0.26, "grad_norm": 0.5, "learning_rate": 9.256866126652199e-07, "logits/chosen": -1.33461594581604, "logits/rejected": -0.9908684492111206, "logps/chosen": -201.02322387695312, "logps/rejected": -202.8111572265625, "loss": 0.6461, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.031793173402547836, "rewards/margins": 0.08713125437498093, "rewards/margins_max": 0.12756696343421936, "rewards/margins_min": 0.04669555649161339, "rewards/margins_std": 0.05718470364809036, "rewards/rejected": -0.05533808469772339, "step": 820 }, { "epoch": 0.26, "grad_norm": 0.4453125, "learning_rate": 9.227758416395169e-07, "logits/chosen": -1.4254963397979736, "logits/rejected": -1.0587215423583984, "logps/chosen": -221.55386352539062, "logps/rejected": -192.1359405517578, "loss": 0.6463, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05291753262281418, "rewards/margins": 0.09469744563102722, "rewards/margins_max": 0.14799687266349792, "rewards/margins_min": 0.04139800742268562, "rewards/margins_std": 0.07537679374217987, "rewards/rejected": -0.04177991300821304, "step": 830 }, { "epoch": 0.26, "grad_norm": 0.451171875, "learning_rate": 9.198139154920388e-07, "logits/chosen": -1.4358642101287842, "logits/rejected": -1.1785621643066406, "logps/chosen": -278.039306640625, "logps/rejected": -247.2188720703125, "loss": 0.6388, "rewards/accuracies": 1.0, "rewards/chosen": 0.06873653084039688, "rewards/margins": 0.11557067930698395, "rewards/margins_max": 0.16702882945537567, "rewards/margins_min": 0.06411250680685043, "rewards/margins_std": 0.07277283817529678, "rewards/rejected": -0.04683414846658707, "step": 840 }, { "epoch": 0.27, "grad_norm": 0.36328125, "learning_rate": 9.168011926105597e-07, "logits/chosen": -1.5257022380828857, "logits/rejected": -1.1070044040679932, "logps/chosen": -201.38771057128906, "logps/rejected": -222.58096313476562, "loss": 0.6444, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05585918575525284, "rewards/margins": 0.10483954101800919, "rewards/margins_max": 0.1490497589111328, "rewards/margins_min": 0.060629308223724365, "rewards/margins_std": 0.06252270191907883, "rewards/rejected": -0.04898035526275635, "step": 850 }, { "epoch": 0.27, "grad_norm": 0.50390625, "learning_rate": 9.137380375291677e-07, "logits/chosen": -1.4331508874893188, "logits/rejected": -1.0888936519622803, "logps/chosen": -208.6669921875, "logps/rejected": -228.1973419189453, "loss": 0.6464, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.053120195865631104, "rewards/margins": 0.1023564338684082, "rewards/margins_max": 0.15425564348697662, "rewards/margins_min": 0.05045723915100098, "rewards/margins_std": 0.07339654862880707, "rewards/rejected": -0.0492362454533577, "step": 860 }, { "epoch": 0.27, "grad_norm": 0.55078125, "learning_rate": 9.106248208841568e-07, "logits/chosen": -1.3399418592453003, "logits/rejected": -1.1214873790740967, "logps/chosen": -204.1221466064453, "logps/rejected": -213.8696746826172, "loss": 0.6472, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03532610088586807, "rewards/margins": 0.10521572828292847, "rewards/margins_max": 0.14728297293186188, "rewards/margins_min": 0.06314849108457565, "rewards/margins_std": 0.05949206277728081, "rewards/rejected": -0.06988963484764099, "step": 870 }, { "epoch": 0.28, "grad_norm": 0.376953125, "learning_rate": 9.07461919369181e-07, "logits/chosen": -1.4420771598815918, "logits/rejected": -1.1662547588348389, "logps/chosen": -201.39486694335938, "logps/rejected": -187.26492309570312, "loss": 0.6419, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.051577143371105194, "rewards/margins": 0.10872938483953476, "rewards/margins_max": 0.15409475564956665, "rewards/margins_min": 0.06336402893066406, "rewards/margins_std": 0.06415630877017975, "rewards/rejected": -0.05715225264430046, "step": 880 }, { "epoch": 0.28, "grad_norm": 0.380859375, "learning_rate": 9.042497156896746e-07, "logits/chosen": -1.311943769454956, "logits/rejected": -1.2416942119598389, "logps/chosen": -165.70957946777344, "logps/rejected": -237.78829956054688, "loss": 0.6482, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.036909494549036026, "rewards/margins": 0.07109662890434265, "rewards/margins_max": 0.10757263749837875, "rewards/margins_min": 0.03462062403559685, "rewards/margins_std": 0.05158485844731331, "rewards/rejected": -0.034187134355306625, "step": 890 }, { "epoch": 0.28, "grad_norm": 0.4296875, "learning_rate": 9.009885985165464e-07, "logits/chosen": -1.3028219938278198, "logits/rejected": -1.0853294134140015, "logps/chosen": -199.5616912841797, "logps/rejected": -219.22830200195312, "loss": 0.6427, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05228592827916145, "rewards/margins": 0.08949766308069229, "rewards/margins_max": 0.13855160772800446, "rewards/margins_min": 0.04044371843338013, "rewards/margins_std": 0.0693727508187294, "rewards/rejected": -0.03721173480153084, "step": 900 }, { "epoch": 0.29, "grad_norm": 0.38671875, "learning_rate": 8.976789624391497e-07, "logits/chosen": -1.3990576267242432, "logits/rejected": -1.134710669517517, "logps/chosen": -168.1616668701172, "logps/rejected": -225.7877197265625, "loss": 0.6429, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.051915861666202545, "rewards/margins": 0.10637024790048599, "rewards/margins_max": 0.1688658893108368, "rewards/margins_min": 0.043874599039554596, "rewards/margins_std": 0.08838219195604324, "rewards/rejected": -0.054454393684864044, "step": 910 }, { "epoch": 0.29, "grad_norm": 0.494140625, "learning_rate": 8.94321207917539e-07, "logits/chosen": -1.4574586153030396, "logits/rejected": -1.2704302072525024, "logps/chosen": -248.19015502929688, "logps/rejected": -254.5595245361328, "loss": 0.6465, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.051451824605464935, "rewards/margins": 0.08263147622346878, "rewards/margins_max": 0.12576524913311005, "rewards/margins_min": 0.03949769213795662, "rewards/margins_std": 0.06100037693977356, "rewards/rejected": -0.03117964044213295, "step": 920 }, { "epoch": 0.29, "grad_norm": 1.3515625, "learning_rate": 8.909157412340149e-07, "logits/chosen": -1.4800761938095093, "logits/rejected": -1.0025885105133057, "logps/chosen": -189.4911651611328, "logps/rejected": -236.02151489257812, "loss": 0.6394, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0598456934094429, "rewards/margins": 0.11991927772760391, "rewards/margins_max": 0.1739063560962677, "rewards/margins_min": 0.06593217700719833, "rewards/margins_std": 0.07634927332401276, "rewards/rejected": -0.060073576867580414, "step": 930 }, { "epoch": 0.3, "grad_norm": 0.51953125, "learning_rate": 8.874629744439637e-07, "logits/chosen": -1.3226550817489624, "logits/rejected": -1.000619888305664, "logps/chosen": -201.4481964111328, "logps/rejected": -239.4997100830078, "loss": 0.6387, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.051911354064941406, "rewards/margins": 0.14231614768505096, "rewards/margins_max": 0.22301022708415985, "rewards/margins_min": 0.061622101813554764, "rewards/margins_std": 0.11411863565444946, "rewards/rejected": -0.09040482342243195, "step": 940 }, { "epoch": 0.3, "grad_norm": 0.55078125, "learning_rate": 8.839633253260005e-07, "logits/chosen": -1.5083190202713013, "logits/rejected": -1.0948280096054077, "logps/chosen": -222.4230194091797, "logps/rejected": -244.3561248779297, "loss": 0.6331, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05216183513402939, "rewards/margins": 0.12567996978759766, "rewards/margins_max": 0.18674659729003906, "rewards/margins_min": 0.06461338698863983, "rewards/margins_std": 0.08636122196912766, "rewards/rejected": -0.07351814210414886, "step": 950 }, { "epoch": 0.3, "grad_norm": 0.462890625, "learning_rate": 8.804172173314184e-07, "logits/chosen": -1.5267778635025024, "logits/rejected": -1.0082361698150635, "logps/chosen": -195.0020294189453, "logps/rejected": -216.0598907470703, "loss": 0.6426, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04607005417346954, "rewards/margins": 0.10273507982492447, "rewards/margins_max": 0.15412285923957825, "rewards/margins_min": 0.05134730786085129, "rewards/margins_std": 0.07267327606678009, "rewards/rejected": -0.05666501447558403, "step": 960 }, { "epoch": 0.31, "grad_norm": 0.49609375, "learning_rate": 8.768250795329517e-07, "logits/chosen": -1.4357895851135254, "logits/rejected": -1.077530860900879, "logps/chosen": -193.4352264404297, "logps/rejected": -194.35830688476562, "loss": 0.6363, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06332554668188095, "rewards/margins": 0.12346392869949341, "rewards/margins_max": 0.18960857391357422, "rewards/margins_min": 0.0573192834854126, "rewards/margins_std": 0.09354265034198761, "rewards/rejected": -0.06013838201761246, "step": 970 }, { "epoch": 0.31, "grad_norm": 0.30078125, "learning_rate": 8.731873465728583e-07, "logits/chosen": -1.2073941230773926, "logits/rejected": -1.077758550643921, "logps/chosen": -210.8450164794922, "logps/rejected": -259.5902404785156, "loss": 0.6416, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.039309777319431305, "rewards/margins": 0.09090758115053177, "rewards/margins_max": 0.13156287372112274, "rewards/margins_min": 0.050252266228199005, "rewards/margins_std": 0.05749528482556343, "rewards/rejected": -0.05159779265522957, "step": 980 }, { "epoch": 0.31, "grad_norm": 0.55078125, "learning_rate": 8.695044586103295e-07, "logits/chosen": -1.404300332069397, "logits/rejected": -1.14784836769104, "logps/chosen": -208.1896209716797, "logps/rejected": -215.95730590820312, "loss": 0.6418, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0341053307056427, "rewards/margins": 0.09708726406097412, "rewards/margins_max": 0.14782105386257172, "rewards/margins_min": 0.04635345935821533, "rewards/margins_std": 0.07174843549728394, "rewards/rejected": -0.06298193335533142, "step": 990 }, { "epoch": 0.32, "grad_norm": 0.40234375, "learning_rate": 8.657768612682315e-07, "logits/chosen": -1.6107196807861328, "logits/rejected": -1.1812701225280762, "logps/chosen": -211.580810546875, "logps/rejected": -204.4039764404297, "loss": 0.636, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04763215780258179, "rewards/margins": 0.11304394900798798, "rewards/margins_max": 0.16250091791152954, "rewards/margins_min": 0.06358698755502701, "rewards/margins_std": 0.06994272023439407, "rewards/rejected": -0.06541179120540619, "step": 1000 }, { "epoch": 0.32, "grad_norm": 0.453125, "learning_rate": 8.62005005579185e-07, "logits/chosen": -1.4514292478561401, "logits/rejected": -1.1223483085632324, "logps/chosen": -213.6953125, "logps/rejected": -233.45425415039062, "loss": 0.6357, "rewards/accuracies": 1.0, "rewards/chosen": 0.055231235921382904, "rewards/margins": 0.11501912772655487, "rewards/margins_max": 0.16858163475990295, "rewards/margins_min": 0.06145660951733589, "rewards/margins_std": 0.07574883848428726, "rewards/rejected": -0.05978789180517197, "step": 1010 }, { "epoch": 0.32, "grad_norm": 0.48046875, "learning_rate": 8.581893479309924e-07, "logits/chosen": -1.3202977180480957, "logits/rejected": -0.9375591278076172, "logps/chosen": -244.69229125976562, "logps/rejected": -231.681640625, "loss": 0.6353, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04148910194635391, "rewards/margins": 0.11631004512310028, "rewards/margins_max": 0.1614200919866562, "rewards/margins_min": 0.07120002061128616, "rewards/margins_std": 0.06379522383213043, "rewards/rejected": -0.07482095062732697, "step": 1020 }, { "epoch": 0.32, "grad_norm": 0.34375, "learning_rate": 8.543303500114141e-07, "logits/chosen": -1.4135452508926392, "logits/rejected": -1.2024281024932861, "logps/chosen": -185.69488525390625, "logps/rejected": -222.9813995361328, "loss": 0.6506, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.036480437964200974, "rewards/margins": 0.0974053293466568, "rewards/margins_max": 0.14427343010902405, "rewards/margins_min": 0.050537217408418655, "rewards/margins_std": 0.0662815272808075, "rewards/rejected": -0.06092488765716553, "step": 1030 }, { "epoch": 0.33, "grad_norm": 0.5390625, "learning_rate": 8.504284787523066e-07, "logits/chosen": -1.4746744632720947, "logits/rejected": -1.0196959972381592, "logps/chosen": -249.49453735351562, "logps/rejected": -290.43304443359375, "loss": 0.6352, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04987958446145058, "rewards/margins": 0.13131950795650482, "rewards/margins_max": 0.1925027072429657, "rewards/margins_min": 0.07013632357120514, "rewards/margins_std": 0.08652608841657639, "rewards/rejected": -0.08143992722034454, "step": 1040 }, { "epoch": 0.33, "grad_norm": 0.416015625, "learning_rate": 8.464842062731234e-07, "logits/chosen": -1.3476040363311768, "logits/rejected": -1.1598175764083862, "logps/chosen": -179.7411651611328, "logps/rejected": -220.35177612304688, "loss": 0.6394, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03946131095290184, "rewards/margins": 0.12150558084249496, "rewards/margins_max": 0.17859359085559845, "rewards/margins_min": 0.06441758573055267, "rewards/margins_std": 0.08073462545871735, "rewards/rejected": -0.08204427361488342, "step": 1050 }, { "epoch": 0.33, "grad_norm": 0.37890625, "learning_rate": 8.424980098237902e-07, "logits/chosen": -1.4661829471588135, "logits/rejected": -1.1622331142425537, "logps/chosen": -197.33993530273438, "logps/rejected": -245.0823211669922, "loss": 0.64, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04446860030293465, "rewards/margins": 0.1190033107995987, "rewards/margins_max": 0.17709189653396606, "rewards/margins_min": 0.06091470643877983, "rewards/margins_std": 0.0821496844291687, "rewards/rejected": -0.07453471422195435, "step": 1060 }, { "epoch": 0.34, "grad_norm": 0.345703125, "learning_rate": 8.384703717269583e-07, "logits/chosen": -1.4809454679489136, "logits/rejected": -1.0145283937454224, "logps/chosen": -208.355712890625, "logps/rejected": -197.4699249267578, "loss": 0.6417, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04945468157529831, "rewards/margins": 0.12217775732278824, "rewards/margins_max": 0.17732298374176025, "rewards/margins_min": 0.06703253090381622, "rewards/margins_std": 0.07798713445663452, "rewards/rejected": -0.07272306829690933, "step": 1070 }, { "epoch": 0.34, "grad_norm": 0.53125, "learning_rate": 8.344017793196442e-07, "logits/chosen": -1.3273048400878906, "logits/rejected": -1.0948415994644165, "logps/chosen": -199.98806762695312, "logps/rejected": -194.45005798339844, "loss": 0.6375, "rewards/accuracies": 1.0, "rewards/chosen": 0.04742967337369919, "rewards/margins": 0.13943785429000854, "rewards/margins_max": 0.18036355078220367, "rewards/margins_min": 0.0985121950507164, "rewards/margins_std": 0.057877641171216965, "rewards/rejected": -0.09200819581747055, "step": 1080 }, { "epoch": 0.34, "grad_norm": 0.435546875, "learning_rate": 8.302927248942626e-07, "logits/chosen": -1.347947359085083, "logits/rejected": -0.8450009226799011, "logps/chosen": -309.2386474609375, "logps/rejected": -231.5939483642578, "loss": 0.6228, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06135648488998413, "rewards/margins": 0.17102745175361633, "rewards/margins_max": 0.2658650279045105, "rewards/margins_min": 0.07618991285562515, "rewards/margins_std": 0.13412055373191833, "rewards/rejected": -0.1096709817647934, "step": 1090 }, { "epoch": 0.35, "grad_norm": 0.40234375, "learning_rate": 8.261437056390606e-07, "logits/chosen": -1.509397268295288, "logits/rejected": -1.0450458526611328, "logps/chosen": -227.46694946289062, "logps/rejected": -230.9382781982422, "loss": 0.6316, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06662856787443161, "rewards/margins": 0.14657410979270935, "rewards/margins_max": 0.20933778584003448, "rewards/margins_min": 0.0838104858994484, "rewards/margins_std": 0.08876121044158936, "rewards/rejected": -0.07994556427001953, "step": 1100 }, { "epoch": 0.35, "grad_norm": 0.345703125, "learning_rate": 8.219552235779578e-07, "logits/chosen": -1.3927332162857056, "logits/rejected": -0.9824946522712708, "logps/chosen": -254.88949584960938, "logps/rejected": -268.16265869140625, "loss": 0.6306, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06077701598405838, "rewards/margins": 0.13684354722499847, "rewards/margins_max": 0.18205778300762177, "rewards/margins_min": 0.09162933379411697, "rewards/margins_std": 0.0639425665140152, "rewards/rejected": -0.07606653869152069, "step": 1110 }, { "epoch": 0.35, "grad_norm": 0.380859375, "learning_rate": 8.177277855098032e-07, "logits/chosen": -1.4236234426498413, "logits/rejected": -1.0470139980316162, "logps/chosen": -255.99038696289062, "logps/rejected": -229.17282104492188, "loss": 0.6393, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.044217340648174286, "rewards/margins": 0.12237548828125, "rewards/margins_max": 0.1730232536792755, "rewards/margins_min": 0.0717277079820633, "rewards/margins_std": 0.07162677496671677, "rewards/rejected": -0.07815815508365631, "step": 1120 }, { "epoch": 0.36, "grad_norm": 0.375, "learning_rate": 8.134619029470533e-07, "logits/chosen": -1.3011561632156372, "logits/rejected": -0.9734943509101868, "logps/chosen": -252.0817413330078, "logps/rejected": -222.7303466796875, "loss": 0.6262, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04272838681936264, "rewards/margins": 0.1569289267063141, "rewards/margins_max": 0.21832945942878723, "rewards/margins_min": 0.09552840888500214, "rewards/margins_std": 0.08683343231678009, "rewards/rejected": -0.11420054733753204, "step": 1130 }, { "epoch": 0.36, "grad_norm": 0.451171875, "learning_rate": 8.09158092053879e-07, "logits/chosen": -1.3196508884429932, "logits/rejected": -0.9734094738960266, "logps/chosen": -220.8783416748047, "logps/rejected": -209.29800415039062, "loss": 0.6299, "rewards/accuracies": 1.0, "rewards/chosen": 0.04809530824422836, "rewards/margins": 0.12365426868200302, "rewards/margins_max": 0.19381490349769592, "rewards/margins_min": 0.05349363014101982, "rewards/margins_std": 0.09922213107347488, "rewards/rejected": -0.07555896788835526, "step": 1140 }, { "epoch": 0.36, "grad_norm": 0.5078125, "learning_rate": 8.04816873583712e-07, "logits/chosen": -1.3901035785675049, "logits/rejected": -1.029416799545288, "logps/chosen": -232.9812469482422, "logps/rejected": -244.72299194335938, "loss": 0.631, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05810508131980896, "rewards/margins": 0.15548478066921234, "rewards/margins_max": 0.22354455292224884, "rewards/margins_min": 0.08742500096559525, "rewards/margins_std": 0.09625106304883957, "rewards/rejected": -0.09737969934940338, "step": 1150 }, { "epoch": 0.37, "grad_norm": 0.796875, "learning_rate": 8.004387728162343e-07, "logits/chosen": -1.3547106981277466, "logits/rejected": -0.9800487756729126, "logps/chosen": -220.08432006835938, "logps/rejected": -366.79425048828125, "loss": 0.6261, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05310923978686333, "rewards/margins": 0.16483795642852783, "rewards/margins_max": 0.24075451493263245, "rewards/margins_min": 0.0889214277267456, "rewards/margins_std": 0.10736221075057983, "rewards/rejected": -0.1117287278175354, "step": 1160 }, { "epoch": 0.37, "grad_norm": 0.3515625, "learning_rate": 7.96024319493819e-07, "logits/chosen": -1.4385137557983398, "logits/rejected": -1.0657742023468018, "logps/chosen": -221.01040649414062, "logps/rejected": -200.88296508789062, "loss": 0.6401, "rewards/accuracies": 1.0, "rewards/chosen": 0.036314692348241806, "rewards/margins": 0.1054370254278183, "rewards/margins_max": 0.14237162470817566, "rewards/margins_min": 0.06850244104862213, "rewards/margins_std": 0.05223340913653374, "rewards/rejected": -0.06912233680486679, "step": 1170 }, { "epoch": 0.37, "grad_norm": 0.44921875, "learning_rate": 7.915740477574347e-07, "logits/chosen": -1.5004260540008545, "logits/rejected": -1.0796833038330078, "logps/chosen": -198.33592224121094, "logps/rejected": -186.93319702148438, "loss": 0.6383, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06108757108449936, "rewards/margins": 0.11955182254314423, "rewards/margins_max": 0.1842404305934906, "rewards/margins_min": 0.05486319214105606, "rewards/margins_std": 0.09148352593183517, "rewards/rejected": -0.05846424773335457, "step": 1180 }, { "epoch": 0.37, "grad_norm": 0.359375, "learning_rate": 7.870884960820129e-07, "logits/chosen": -1.3537534475326538, "logits/rejected": -1.0633940696716309, "logps/chosen": -178.4303436279297, "logps/rejected": -234.31103515625, "loss": 0.6258, "rewards/accuracies": 1.0, "rewards/chosen": 0.06088218092918396, "rewards/margins": 0.12319563329219818, "rewards/margins_max": 0.18492364883422852, "rewards/margins_min": 0.061467599123716354, "rewards/margins_std": 0.08729662001132965, "rewards/rejected": -0.062313444912433624, "step": 1190 }, { "epoch": 0.38, "grad_norm": 0.291015625, "learning_rate": 7.825682072112959e-07, "logits/chosen": -1.464988112449646, "logits/rejected": -0.8578370809555054, "logps/chosen": -237.8139190673828, "logps/rejected": -234.4029998779297, "loss": 0.6403, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04808584973216057, "rewards/margins": 0.13337358832359314, "rewards/margins_max": 0.18245692551136017, "rewards/margins_min": 0.0842902883887291, "rewards/margins_std": 0.06941428035497665, "rewards/rejected": -0.08528774976730347, "step": 1200 }, { "epoch": 0.38, "grad_norm": 0.384765625, "learning_rate": 7.780137280921635e-07, "logits/chosen": -1.4263708591461182, "logits/rejected": -1.171579122543335, "logps/chosen": -246.2294464111328, "logps/rejected": -251.85107421875, "loss": 0.6281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.035905174911022186, "rewards/margins": 0.14339666068553925, "rewards/margins_max": 0.2217930257320404, "rewards/margins_min": 0.06500030308961868, "rewards/margins_std": 0.11086919158697128, "rewards/rejected": -0.10749147832393646, "step": 1210 }, { "epoch": 0.38, "grad_norm": 0.365234375, "learning_rate": 7.734256098084551e-07, "logits/chosen": -1.5084072351455688, "logits/rejected": -1.2369990348815918, "logps/chosen": -172.4056854248047, "logps/rejected": -221.0992431640625, "loss": 0.6373, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04651130363345146, "rewards/margins": 0.12719354033470154, "rewards/margins_max": 0.17494919896125793, "rewards/margins_min": 0.07943786680698395, "rewards/margins_std": 0.06753672659397125, "rewards/rejected": -0.08068223297595978, "step": 1220 }, { "epoch": 0.39, "grad_norm": 0.3828125, "learning_rate": 7.688044075142887e-07, "logits/chosen": -1.452924370765686, "logits/rejected": -0.9414194822311401, "logps/chosen": -194.08250427246094, "logps/rejected": -194.6898651123047, "loss": 0.6331, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.023848187178373337, "rewards/margins": 0.1178591400384903, "rewards/margins_max": 0.18268531560897827, "rewards/margins_min": 0.053032953292131424, "rewards/margins_std": 0.09167807549238205, "rewards/rejected": -0.09401094913482666, "step": 1230 }, { "epoch": 0.39, "grad_norm": 0.462890625, "learning_rate": 7.641506803668887e-07, "logits/chosen": -1.487422227859497, "logits/rejected": -1.252956509590149, "logps/chosen": -224.56399536132812, "logps/rejected": -222.06387329101562, "loss": 0.6238, "rewards/accuracies": 1.0, "rewards/chosen": 0.031578294932842255, "rewards/margins": 0.12413300573825836, "rewards/margins_max": 0.1808335930109024, "rewards/margins_min": 0.06743242591619492, "rewards/margins_std": 0.08018673211336136, "rewards/rejected": -0.09255470335483551, "step": 1240 }, { "epoch": 0.39, "grad_norm": 0.3984375, "learning_rate": 7.594649914589286e-07, "logits/chosen": -1.4718706607818604, "logits/rejected": -1.1124763488769531, "logps/chosen": -196.78677368164062, "logps/rejected": -192.97157287597656, "loss": 0.6286, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03744439780712128, "rewards/margins": 0.0985964983701706, "rewards/margins_max": 0.15701943635940552, "rewards/margins_min": 0.04017355293035507, "rewards/margins_std": 0.08262249827384949, "rewards/rejected": -0.06115208938717842, "step": 1250 }, { "epoch": 0.4, "grad_norm": 0.40234375, "learning_rate": 7.547479077503976e-07, "logits/chosen": -1.475367546081543, "logits/rejected": -1.0469552278518677, "logps/chosen": -244.2837371826172, "logps/rejected": -213.0482635498047, "loss": 0.6273, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04668278247117996, "rewards/margins": 0.10903759300708771, "rewards/margins_max": 0.15385808050632477, "rewards/margins_min": 0.06421708315610886, "rewards/margins_std": 0.0633857473731041, "rewards/rejected": -0.06235479563474655, "step": 1260 }, { "epoch": 0.4, "grad_norm": 0.47265625, "learning_rate": 7.5e-07, "logits/chosen": -1.4103657007217407, "logits/rejected": -0.9712256193161011, "logps/chosen": -338.1557922363281, "logps/rejected": -219.37918090820312, "loss": 0.6395, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04872440919280052, "rewards/margins": 0.11783953756093979, "rewards/margins_max": 0.171439990401268, "rewards/margins_min": 0.06423909962177277, "rewards/margins_std": 0.07580247521400452, "rewards/rejected": -0.06911513209342957, "step": 1270 }, { "epoch": 0.4, "grad_norm": 0.4375, "learning_rate": 7.452218426960939e-07, "logits/chosen": -1.3297299146652222, "logits/rejected": -1.0080119371414185, "logps/chosen": -203.5293731689453, "logps/rejected": -218.4529571533203, "loss": 0.6359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05152253434062004, "rewards/margins": 0.13503798842430115, "rewards/margins_max": 0.2129654586315155, "rewards/margins_min": 0.05711054056882858, "rewards/margins_std": 0.11020606756210327, "rewards/rejected": -0.083515465259552, "step": 1280 }, { "epoch": 0.41, "grad_norm": 0.431640625, "learning_rate": 7.404140139871796e-07, "logits/chosen": -1.4835853576660156, "logits/rejected": -1.0349833965301514, "logps/chosen": -226.5528564453125, "logps/rejected": -228.1876220703125, "loss": 0.6331, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04554816335439682, "rewards/margins": 0.12928791344165802, "rewards/margins_max": 0.17952772974967957, "rewards/margins_min": 0.07904806733131409, "rewards/margins_std": 0.07104986160993576, "rewards/rejected": -0.0837397426366806, "step": 1290 }, { "epoch": 0.41, "grad_norm": 0.6640625, "learning_rate": 7.355770956119443e-07, "logits/chosen": -1.307871699333191, "logits/rejected": -1.0118768215179443, "logps/chosen": -220.9732208251953, "logps/rejected": -188.6660919189453, "loss": 0.6339, "rewards/accuracies": 1.0, "rewards/chosen": 0.043763868510723114, "rewards/margins": 0.12307041883468628, "rewards/margins_max": 0.17446239292621613, "rewards/margins_min": 0.07167843729257584, "rewards/margins_std": 0.07267922908067703, "rewards/rejected": -0.07930653542280197, "step": 1300 }, { "epoch": 0.41, "grad_norm": 0.5234375, "learning_rate": 7.307116728288726e-07, "logits/chosen": -1.4419422149658203, "logits/rejected": -1.1987005472183228, "logps/chosen": -180.5616455078125, "logps/rejected": -223.50927734375, "loss": 0.6274, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.056863777339458466, "rewards/margins": 0.13441364467144012, "rewards/margins_max": 0.20376619696617126, "rewards/margins_min": 0.0650610700249672, "rewards/margins_std": 0.09807933866977692, "rewards/rejected": -0.07754985243082047, "step": 1310 }, { "epoch": 0.42, "grad_norm": 0.462890625, "learning_rate": 7.258183343454318e-07, "logits/chosen": -1.4006013870239258, "logits/rejected": -0.9503320455551147, "logps/chosen": -235.5669708251953, "logps/rejected": -239.7013702392578, "loss": 0.6273, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.047770898789167404, "rewards/margins": 0.14523038268089294, "rewards/margins_max": 0.217434361577034, "rewards/margins_min": 0.07302640378475189, "rewards/margins_std": 0.10211183875799179, "rewards/rejected": -0.09745948016643524, "step": 1320 }, { "epoch": 0.42, "grad_norm": 0.55078125, "learning_rate": 7.208976722468391e-07, "logits/chosen": -1.514934778213501, "logits/rejected": -1.2314091920852661, "logps/chosen": -173.3019256591797, "logps/rejected": -180.0929718017578, "loss": 0.6342, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.058720219880342484, "rewards/margins": 0.11582016944885254, "rewards/margins_max": 0.16832241415977478, "rewards/margins_min": 0.0633179321885109, "rewards/margins_std": 0.07424938678741455, "rewards/rejected": -0.057099949568510056, "step": 1330 }, { "epoch": 0.42, "grad_norm": 0.4296875, "learning_rate": 7.159502819244205e-07, "logits/chosen": -1.470738410949707, "logits/rejected": -1.1212527751922607, "logps/chosen": -232.70425415039062, "logps/rejected": -220.24850463867188, "loss": 0.635, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.042904727160930634, "rewards/margins": 0.1314111053943634, "rewards/margins_max": 0.20116429030895233, "rewards/margins_min": 0.061657924205064774, "rewards/margins_std": 0.0986458957195282, "rewards/rejected": -0.08850638568401337, "step": 1340 }, { "epoch": 0.43, "grad_norm": 0.41015625, "learning_rate": 7.109767620035688e-07, "logits/chosen": -1.3368486166000366, "logits/rejected": -1.093397855758667, "logps/chosen": -180.45089721679688, "logps/rejected": -187.77438354492188, "loss": 0.6422, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.041960351169109344, "rewards/margins": 0.09945043921470642, "rewards/margins_max": 0.1653245985507965, "rewards/margins_min": 0.03357627987861633, "rewards/margins_std": 0.09316011518239975, "rewards/rejected": -0.057490088045597076, "step": 1350 }, { "epoch": 0.43, "grad_norm": 0.359375, "learning_rate": 7.059777142713122e-07, "logits/chosen": -1.273991584777832, "logits/rejected": -1.0232566595077515, "logps/chosen": -249.97781372070312, "logps/rejected": -238.1562957763672, "loss": 0.6404, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.030306842178106308, "rewards/margins": 0.09815285354852676, "rewards/margins_max": 0.15446361899375916, "rewards/margins_min": 0.04184209182858467, "rewards/margins_std": 0.07963544130325317, "rewards/rejected": -0.06784601509571075, "step": 1360 }, { "epoch": 0.43, "grad_norm": 0.51953125, "learning_rate": 7.00953743603498e-07, "logits/chosen": -1.3965544700622559, "logits/rejected": -0.992017388343811, "logps/chosen": -297.09332275390625, "logps/rejected": -215.32479858398438, "loss": 0.6317, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04624886438250542, "rewards/margins": 0.11734838783740997, "rewards/margins_max": 0.16934475302696228, "rewards/margins_min": 0.06535204499959946, "rewards/margins_std": 0.07353393733501434, "rewards/rejected": -0.07109951227903366, "step": 1370 }, { "epoch": 0.43, "grad_norm": 0.376953125, "learning_rate": 6.959054578916042e-07, "logits/chosen": -1.3886009454727173, "logits/rejected": -1.0957224369049072, "logps/chosen": -183.44839477539062, "logps/rejected": -231.7744903564453, "loss": 0.6268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03902550786733627, "rewards/margins": 0.14433899521827698, "rewards/margins_max": 0.2244710922241211, "rewards/margins_min": 0.06420690566301346, "rewards/margins_std": 0.11332390457391739, "rewards/rejected": -0.10531347990036011, "step": 1380 }, { "epoch": 0.44, "grad_norm": 0.453125, "learning_rate": 6.908334679691863e-07, "logits/chosen": -1.3139946460723877, "logits/rejected": -1.1197882890701294, "logps/chosen": -209.2381134033203, "logps/rejected": -235.80380249023438, "loss": 0.6347, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.027201693505048752, "rewards/margins": 0.11521643400192261, "rewards/margins_max": 0.15470090508460999, "rewards/margins_min": 0.07573194801807404, "rewards/margins_std": 0.05583949014544487, "rewards/rejected": -0.08801472932100296, "step": 1390 }, { "epoch": 0.44, "grad_norm": 0.5625, "learning_rate": 6.857383875379661e-07, "logits/chosen": -1.2649810314178467, "logits/rejected": -0.9927312731742859, "logps/chosen": -219.4835968017578, "logps/rejected": -233.77133178710938, "loss": 0.6318, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0319049134850502, "rewards/margins": 0.11133754253387451, "rewards/margins_max": 0.15614716708660126, "rewards/margins_min": 0.06652794033288956, "rewards/margins_std": 0.06337036192417145, "rewards/rejected": -0.07943262904882431, "step": 1400 }, { "epoch": 0.44, "grad_norm": 0.609375, "learning_rate": 6.806208330935766e-07, "logits/chosen": -1.4187123775482178, "logits/rejected": -1.213805913925171, "logps/chosen": -275.88079833984375, "logps/rejected": -248.2821502685547, "loss": 0.6253, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04402131587266922, "rewards/margins": 0.12928104400634766, "rewards/margins_max": 0.21939381957054138, "rewards/margins_min": 0.03916824236512184, "rewards/margins_std": 0.1274387240409851, "rewards/rejected": -0.08525971323251724, "step": 1410 }, { "epoch": 0.45, "grad_norm": 0.53125, "learning_rate": 6.754814238509652e-07, "logits/chosen": -1.3234349489212036, "logits/rejected": -0.9899675250053406, "logps/chosen": -261.2273254394531, "logps/rejected": -228.1154327392578, "loss": 0.6347, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03984065353870392, "rewards/margins": 0.15004102885723114, "rewards/margins_max": 0.22247202694416046, "rewards/margins_min": 0.07761004567146301, "rewards/margins_std": 0.10243289172649384, "rewards/rejected": -0.11020038276910782, "step": 1420 }, { "epoch": 0.45, "grad_norm": 0.61328125, "learning_rate": 6.703207816694718e-07, "logits/chosen": -1.4122707843780518, "logits/rejected": -1.0776605606079102, "logps/chosen": -243.84994506835938, "logps/rejected": -214.42257690429688, "loss": 0.6313, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.028529832139611244, "rewards/margins": 0.09134817868471146, "rewards/margins_max": 0.12654173374176025, "rewards/margins_min": 0.05615462735295296, "rewards/margins_std": 0.049771200865507126, "rewards/rejected": -0.06281835585832596, "step": 1430 }, { "epoch": 0.45, "grad_norm": 0.48828125, "learning_rate": 6.651395309775836e-07, "logits/chosen": -1.481340765953064, "logits/rejected": -1.1970375776290894, "logps/chosen": -167.13339233398438, "logps/rejected": -208.0955810546875, "loss": 0.63, "rewards/accuracies": 1.0, "rewards/chosen": 0.05509837716817856, "rewards/margins": 0.1572759747505188, "rewards/margins_max": 0.2205711305141449, "rewards/margins_min": 0.09398078173398972, "rewards/margins_std": 0.0895129069685936, "rewards/rejected": -0.10217758268117905, "step": 1440 }, { "epoch": 0.46, "grad_norm": 0.45703125, "learning_rate": 6.599382986973807e-07, "logits/chosen": -1.4374146461486816, "logits/rejected": -0.964760422706604, "logps/chosen": -208.99783325195312, "logps/rejected": -257.08526611328125, "loss": 0.6281, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.07429121434688568, "rewards/margins": 0.14184515178203583, "rewards/margins_max": 0.22351637482643127, "rewards/margins_min": 0.06017392873764038, "rewards/margins_std": 0.1155005469918251, "rewards/rejected": -0.06755392998456955, "step": 1450 }, { "epoch": 0.46, "grad_norm": 0.41796875, "learning_rate": 6.547177141686798e-07, "logits/chosen": -1.3717443943023682, "logits/rejected": -0.8923895955085754, "logps/chosen": -218.4834442138672, "logps/rejected": -211.63876342773438, "loss": 0.6241, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05127422884106636, "rewards/margins": 0.14416508376598358, "rewards/margins_max": 0.21262860298156738, "rewards/margins_min": 0.07570157200098038, "rewards/margins_std": 0.09682202339172363, "rewards/rejected": -0.09289085119962692, "step": 1460 }, { "epoch": 0.46, "grad_norm": 0.6015625, "learning_rate": 6.494784090728851e-07, "logits/chosen": -1.381131887435913, "logits/rejected": -1.1379430294036865, "logps/chosen": -200.09182739257812, "logps/rejected": -255.30020141601562, "loss": 0.6205, "rewards/accuracies": 1.0, "rewards/chosen": 0.0368528813123703, "rewards/margins": 0.14170141518115997, "rewards/margins_max": 0.2006821632385254, "rewards/margins_min": 0.08272063732147217, "rewards/margins_std": 0.08341138064861298, "rewards/rejected": -0.10484850406646729, "step": 1470 }, { "epoch": 0.47, "grad_norm": 0.43359375, "learning_rate": 6.442210173565561e-07, "logits/chosen": -1.4332636594772339, "logits/rejected": -1.0423600673675537, "logps/chosen": -216.1157684326172, "logps/rejected": -278.70843505859375, "loss": 0.6277, "rewards/accuracies": 1.0, "rewards/chosen": 0.051350273191928864, "rewards/margins": 0.15817813575267792, "rewards/margins_max": 0.2108217179775238, "rewards/margins_min": 0.10553457587957382, "rewards/margins_std": 0.07444924116134644, "rewards/rejected": -0.10682785511016846, "step": 1480 }, { "epoch": 0.47, "grad_norm": 0.380859375, "learning_rate": 6.389461751547008e-07, "logits/chosen": -1.3558521270751953, "logits/rejected": -1.085033655166626, "logps/chosen": -225.6574249267578, "logps/rejected": -217.3332977294922, "loss": 0.6273, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03980935364961624, "rewards/margins": 0.11275990307331085, "rewards/margins_max": 0.16510936617851257, "rewards/margins_min": 0.06041043996810913, "rewards/margins_std": 0.07403331995010376, "rewards/rejected": -0.07295055687427521, "step": 1490 }, { "epoch": 0.47, "grad_norm": 0.4765625, "learning_rate": 6.33654520713805e-07, "logits/chosen": -1.426582932472229, "logits/rejected": -0.8861996531486511, "logps/chosen": -253.96957397460938, "logps/rejected": -198.83224487304688, "loss": 0.6299, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04899178445339203, "rewards/margins": 0.14212216436862946, "rewards/margins_max": 0.2016027718782425, "rewards/margins_min": 0.08264155685901642, "rewards/margins_std": 0.08411829173564911, "rewards/rejected": -0.09313038736581802, "step": 1500 }, { "epoch": 0.48, "grad_norm": 0.44140625, "learning_rate": 6.283466943146051e-07, "logits/chosen": -1.3297570943832397, "logits/rejected": -1.0105302333831787, "logps/chosen": -273.25579833984375, "logps/rejected": -214.6565399169922, "loss": 0.6199, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04776372015476227, "rewards/margins": 0.12771016359329224, "rewards/margins_max": 0.1960275024175644, "rewards/margins_min": 0.05939285084605217, "rewards/margins_std": 0.0966152772307396, "rewards/rejected": -0.07994645088911057, "step": 1510 }, { "epoch": 0.48, "grad_norm": 0.345703125, "learning_rate": 6.230233381946162e-07, "logits/chosen": -1.2974770069122314, "logits/rejected": -1.0966564416885376, "logps/chosen": -169.97372436523438, "logps/rejected": -194.61965942382812, "loss": 0.6351, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03883309289813042, "rewards/margins": 0.130173459649086, "rewards/margins_max": 0.20102062821388245, "rewards/margins_min": 0.05932629853487015, "rewards/margins_std": 0.10019302368164062, "rewards/rejected": -0.09134037047624588, "step": 1520 }, { "epoch": 0.48, "grad_norm": 0.4765625, "learning_rate": 6.176850964704212e-07, "logits/chosen": -1.3526862859725952, "logits/rejected": -0.9741379618644714, "logps/chosen": -177.74484252929688, "logps/rejected": -170.9863739013672, "loss": 0.6383, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03418565168976784, "rewards/margins": 0.13273611664772034, "rewards/margins_max": 0.19749236106872559, "rewards/margins_min": 0.06797986477613449, "rewards/margins_std": 0.09157915413379669, "rewards/rejected": -0.0985504761338234, "step": 1530 }, { "epoch": 0.49, "grad_norm": 0.62109375, "learning_rate": 6.12332615059735e-07, "logits/chosen": -1.5363820791244507, "logits/rejected": -1.1499285697937012, "logps/chosen": -249.35995483398438, "logps/rejected": -278.024169921875, "loss": 0.6262, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05033854395151138, "rewards/margins": 0.12232635915279388, "rewards/margins_max": 0.1788022816181183, "rewards/margins_min": 0.0658503919839859, "rewards/margins_std": 0.07986906915903091, "rewards/rejected": -0.07198779284954071, "step": 1540 }, { "epoch": 0.49, "grad_norm": 0.48046875, "learning_rate": 6.069665416032486e-07, "logits/chosen": -1.5334231853485107, "logits/rejected": -1.0874627828598022, "logps/chosen": -260.1651916503906, "logps/rejected": -216.29342651367188, "loss": 0.6185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04319532588124275, "rewards/margins": 0.11832845211029053, "rewards/margins_max": 0.17873892188072205, "rewards/margins_min": 0.05791795998811722, "rewards/margins_std": 0.08543331921100616, "rewards/rejected": -0.07513312250375748, "step": 1550 }, { "epoch": 0.49, "grad_norm": 0.462890625, "learning_rate": 6.015875253862671e-07, "logits/chosen": -1.4311401844024658, "logits/rejected": -1.0909273624420166, "logps/chosen": -206.8611297607422, "logps/rejected": -273.14886474609375, "loss": 0.6263, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.040190037339925766, "rewards/margins": 0.12395022064447403, "rewards/margins_max": 0.1803322434425354, "rewards/margins_min": 0.06756818294525146, "rewards/margins_std": 0.07973622530698776, "rewards/rejected": -0.08376017212867737, "step": 1560 }, { "epoch": 0.49, "grad_norm": 0.353515625, "learning_rate": 5.961962172601457e-07, "logits/chosen": -1.459913730621338, "logits/rejected": -1.0906808376312256, "logps/chosen": -199.64151000976562, "logps/rejected": -214.5613555908203, "loss": 0.6352, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05259844660758972, "rewards/margins": 0.11418597400188446, "rewards/margins_max": 0.16717670857906342, "rewards/margins_min": 0.061195243149995804, "rewards/margins_std": 0.07494021207094193, "rewards/rejected": -0.06158752366900444, "step": 1570 }, { "epoch": 0.5, "grad_norm": 0.62890625, "learning_rate": 5.907932695635389e-07, "logits/chosen": -1.2819427251815796, "logits/rejected": -1.1336212158203125, "logps/chosen": -206.28005981445312, "logps/rejected": -259.9476013183594, "loss": 0.6302, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.026775460690259933, "rewards/margins": 0.14914140105247498, "rewards/margins_max": 0.24029541015625, "rewards/margins_min": 0.057987384498119354, "rewards/margins_std": 0.12891125679016113, "rewards/rejected": -0.12236593663692474, "step": 1580 }, { "epoch": 0.5, "grad_norm": 0.515625, "learning_rate": 5.853793360434687e-07, "logits/chosen": -1.4024218320846558, "logits/rejected": -0.9892303347587585, "logps/chosen": -269.4578857421875, "logps/rejected": -227.0701141357422, "loss": 0.6304, "rewards/accuracies": 1.0, "rewards/chosen": 0.046691518276929855, "rewards/margins": 0.12285880744457245, "rewards/margins_max": 0.18432477116584778, "rewards/margins_min": 0.06139283627271652, "rewards/margins_std": 0.08692601323127747, "rewards/rejected": -0.0761672854423523, "step": 1590 }, { "epoch": 0.5, "grad_norm": 0.52734375, "learning_rate": 5.79955071776222e-07, "logits/chosen": -1.3737763166427612, "logits/rejected": -1.0089573860168457, "logps/chosen": -176.09317016601562, "logps/rejected": -192.53468322753906, "loss": 0.6285, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.042514294385910034, "rewards/margins": 0.17080818116664886, "rewards/margins_max": 0.24430468678474426, "rewards/margins_min": 0.09731169044971466, "rewards/margins_std": 0.1039397269487381, "rewards/rejected": -0.12829387187957764, "step": 1600 }, { "epoch": 0.51, "grad_norm": 0.412109375, "learning_rate": 5.745211330880872e-07, "logits/chosen": -1.3686656951904297, "logits/rejected": -1.1941941976547241, "logps/chosen": -168.5927734375, "logps/rejected": -232.0594024658203, "loss": 0.6176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04633691906929016, "rewards/margins": 0.15676456689834595, "rewards/margins_max": 0.2298036366701126, "rewards/margins_min": 0.08372551202774048, "rewards/margins_std": 0.10329282283782959, "rewards/rejected": -0.11042765527963638, "step": 1610 }, { "epoch": 0.51, "grad_norm": 0.408203125, "learning_rate": 5.690781774759412e-07, "logits/chosen": -1.4598623514175415, "logits/rejected": -1.1373037099838257, "logps/chosen": -224.392333984375, "logps/rejected": -206.09756469726562, "loss": 0.6346, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04373541846871376, "rewards/margins": 0.12310346215963364, "rewards/margins_max": 0.1717989593744278, "rewards/margins_min": 0.07440796494483948, "rewards/margins_std": 0.0688658356666565, "rewards/rejected": -0.07936803251504898, "step": 1620 }, { "epoch": 0.51, "grad_norm": 0.58203125, "learning_rate": 5.636268635276917e-07, "logits/chosen": -1.237717866897583, "logits/rejected": -1.0289928913116455, "logps/chosen": -194.26791381835938, "logps/rejected": -347.3554382324219, "loss": 0.6174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02769922837615013, "rewards/margins": 0.17334696650505066, "rewards/margins_max": 0.23788447678089142, "rewards/margins_min": 0.1088094711303711, "rewards/margins_std": 0.09126981347799301, "rewards/rejected": -0.14564773440361023, "step": 1630 }, { "epoch": 0.52, "grad_norm": 0.455078125, "learning_rate": 5.581678508425907e-07, "logits/chosen": -1.5093035697937012, "logits/rejected": -1.1646114587783813, "logps/chosen": -225.5840606689453, "logps/rejected": -273.4601135253906, "loss": 0.625, "rewards/accuracies": 1.0, "rewards/chosen": 0.044809166342020035, "rewards/margins": 0.16066548228263855, "rewards/margins_max": 0.22280371189117432, "rewards/margins_min": 0.09852725267410278, "rewards/margins_std": 0.08787672221660614, "rewards/rejected": -0.11585632711648941, "step": 1640 }, { "epoch": 0.52, "grad_norm": 0.443359375, "learning_rate": 5.527017999514238e-07, "logits/chosen": -1.434874176979065, "logits/rejected": -1.2691529989242554, "logps/chosen": -214.0901336669922, "logps/rejected": -304.82098388671875, "loss": 0.6337, "rewards/accuracies": 1.0, "rewards/chosen": 0.05037333816289902, "rewards/margins": 0.13265621662139893, "rewards/margins_max": 0.1799599826335907, "rewards/margins_min": 0.08535243570804596, "rewards/margins_std": 0.06689763814210892, "rewards/rejected": -0.0822828859090805, "step": 1650 }, { "epoch": 0.52, "grad_norm": 0.439453125, "learning_rate": 5.472293722365865e-07, "logits/chosen": -1.5142433643341064, "logits/rejected": -1.1502716541290283, "logps/chosen": -199.11727905273438, "logps/rejected": -236.5269317626953, "loss": 0.6188, "rewards/accuracies": 1.0, "rewards/chosen": 0.04829222336411476, "rewards/margins": 0.15703055262565613, "rewards/margins_max": 0.24611596763134003, "rewards/margins_min": 0.06794511526823044, "rewards/margins_std": 0.12598583102226257, "rewards/rejected": -0.10873832553625107, "step": 1660 }, { "epoch": 0.53, "grad_norm": 0.30859375, "learning_rate": 5.417512298520584e-07, "logits/chosen": -1.43448007106781, "logits/rejected": -1.0308877229690552, "logps/chosen": -233.7875518798828, "logps/rejected": -209.86734008789062, "loss": 0.6302, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04728182405233383, "rewards/margins": 0.1253185272216797, "rewards/margins_max": 0.18824602663516998, "rewards/margins_min": 0.062391042709350586, "rewards/margins_std": 0.08899290859699249, "rewards/rejected": -0.07803670316934586, "step": 1670 }, { "epoch": 0.53, "grad_norm": 0.65234375, "learning_rate": 5.362680356432846e-07, "logits/chosen": -1.6510565280914307, "logits/rejected": -1.247899055480957, "logps/chosen": -236.86453247070312, "logps/rejected": -262.64324951171875, "loss": 0.6231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0422038659453392, "rewards/margins": 0.1669834703207016, "rewards/margins_max": 0.2213924676179886, "rewards/margins_min": 0.11257448047399521, "rewards/margins_std": 0.07694593816995621, "rewards/rejected": -0.1247796043753624, "step": 1680 }, { "epoch": 0.53, "grad_norm": 0.5546875, "learning_rate": 5.307804530669715e-07, "logits/chosen": -1.3352489471435547, "logits/rejected": -1.0017801523208618, "logps/chosen": -203.45884704589844, "logps/rejected": -276.4349060058594, "loss": 0.6183, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04065268859267235, "rewards/margins": 0.1817699670791626, "rewards/margins_max": 0.28018659353256226, "rewards/margins_min": 0.08335334062576294, "rewards/margins_std": 0.13918212056159973, "rewards/rejected": -0.14111728966236115, "step": 1690 }, { "epoch": 0.54, "grad_norm": 0.416015625, "learning_rate": 5.2528914611081e-07, "logits/chosen": -1.5503590106964111, "logits/rejected": -1.1069653034210205, "logps/chosen": -192.2991943359375, "logps/rejected": -210.0384979248047, "loss": 0.6254, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05085741728544235, "rewards/margins": 0.17244575917720795, "rewards/margins_max": 0.2783913016319275, "rewards/margins_min": 0.06650026142597198, "rewards/margins_std": 0.14982958137989044, "rewards/rejected": -0.12158836424350739, "step": 1700 }, { "epoch": 0.54, "grad_norm": 0.515625, "learning_rate": 5.197947792131348e-07, "logits/chosen": -1.1360037326812744, "logits/rejected": -0.9705570936203003, "logps/chosen": -245.7555694580078, "logps/rejected": -288.6032409667969, "loss": 0.6258, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0327492281794548, "rewards/margins": 0.16705994307994843, "rewards/margins_max": 0.22337321937084198, "rewards/margins_min": 0.11074666678905487, "rewards/margins_std": 0.0796389952301979, "rewards/rejected": -0.13431070744991302, "step": 1710 }, { "epoch": 0.54, "grad_norm": 0.345703125, "learning_rate": 5.142980171825276e-07, "logits/chosen": -1.3152287006378174, "logits/rejected": -0.9166573286056519, "logps/chosen": -222.42495727539062, "logps/rejected": -229.1804962158203, "loss": 0.6292, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03760639578104019, "rewards/margins": 0.13950268924236298, "rewards/margins_max": 0.1909518986940384, "rewards/margins_min": 0.08805350959300995, "rewards/margins_std": 0.07276014238595963, "rewards/rejected": -0.10189630836248398, "step": 1720 }, { "epoch": 0.54, "grad_norm": 0.60546875, "learning_rate": 5.087995251173769e-07, "logits/chosen": -1.3346498012542725, "logits/rejected": -0.980597198009491, "logps/chosen": -201.7723388671875, "logps/rejected": -276.3261413574219, "loss": 0.6213, "rewards/accuracies": 1.0, "rewards/chosen": 0.025363069027662277, "rewards/margins": 0.17956769466400146, "rewards/margins_max": 0.2888151705265045, "rewards/margins_min": 0.07032018154859543, "rewards/margins_std": 0.15449929237365723, "rewards/rejected": -0.1542046070098877, "step": 1730 }, { "epoch": 0.55, "grad_norm": 0.490234375, "learning_rate": 5.032999683254027e-07, "logits/chosen": -1.4949066638946533, "logits/rejected": -1.2108285427093506, "logps/chosen": -225.4260711669922, "logps/rejected": -247.3677215576172, "loss": 0.6252, "rewards/accuracies": 1.0, "rewards/chosen": 0.027850795537233353, "rewards/margins": 0.1680610179901123, "rewards/margins_max": 0.2702367603778839, "rewards/margins_min": 0.06588525325059891, "rewards/margins_std": 0.14449834823608398, "rewards/rejected": -0.14021022617816925, "step": 1740 }, { "epoch": 0.55, "grad_norm": 0.4609375, "learning_rate": 4.97800012243155e-07, "logits/chosen": -1.400756597518921, "logits/rejected": -1.0894193649291992, "logps/chosen": -211.5016326904297, "logps/rejected": -224.2008056640625, "loss": 0.6291, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.043883297592401505, "rewards/margins": 0.13359126448631287, "rewards/margins_max": 0.20171597599983215, "rewards/margins_min": 0.06546656787395477, "rewards/margins_std": 0.09634287655353546, "rewards/rejected": -0.08970797061920166, "step": 1750 }, { "epoch": 0.55, "grad_norm": 0.39453125, "learning_rate": 4.923003223554966e-07, "logits/chosen": -1.346853494644165, "logits/rejected": -1.0727789402008057, "logps/chosen": -197.31280517578125, "logps/rejected": -219.85580444335938, "loss": 0.6402, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03675428405404091, "rewards/margins": 0.10769043117761612, "rewards/margins_max": 0.15701591968536377, "rewards/margins_min": 0.05836494639515877, "rewards/margins_std": 0.06975677609443665, "rewards/rejected": -0.0709361582994461, "step": 1760 }, { "epoch": 0.56, "grad_norm": 0.359375, "learning_rate": 4.868015641150819e-07, "logits/chosen": -1.489463448524475, "logits/rejected": -1.0774259567260742, "logps/chosen": -228.04257202148438, "logps/rejected": -254.78701782226562, "loss": 0.6211, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03930111974477768, "rewards/margins": 0.162466898560524, "rewards/margins_max": 0.2493637055158615, "rewards/margins_min": 0.07557009160518646, "rewards/margins_std": 0.12289062887430191, "rewards/rejected": -0.1231658011674881, "step": 1770 }, { "epoch": 0.56, "grad_norm": 0.427734375, "learning_rate": 4.813044028618372e-07, "logits/chosen": -1.3595679998397827, "logits/rejected": -1.0415996313095093, "logps/chosen": -217.2411651611328, "logps/rejected": -226.8505401611328, "loss": 0.6318, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03175649791955948, "rewards/margins": 0.13404306769371033, "rewards/margins_max": 0.1951790750026703, "rewards/margins_min": 0.07290709763765335, "rewards/margins_std": 0.08645935356616974, "rewards/rejected": -0.10228659212589264, "step": 1780 }, { "epoch": 0.56, "grad_norm": 0.443359375, "learning_rate": 4.7580950374245664e-07, "logits/chosen": -1.3718421459197998, "logits/rejected": -1.0821744203567505, "logps/chosen": -202.16091918945312, "logps/rejected": -192.60728454589844, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 0.041288070380687714, "rewards/margins": 0.14421400427818298, "rewards/margins_max": 0.2147628366947174, "rewards/margins_min": 0.07366515696048737, "rewards/margins_std": 0.09977111220359802, "rewards/rejected": -0.10292591899633408, "step": 1790 }, { "epoch": 0.57, "grad_norm": 0.44140625, "learning_rate": 4.703175316299196e-07, "logits/chosen": -1.4759793281555176, "logits/rejected": -0.9586470723152161, "logps/chosen": -306.47882080078125, "logps/rejected": -301.02825927734375, "loss": 0.6237, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.050575144588947296, "rewards/margins": 0.13904833793640137, "rewards/margins_max": 0.204876109957695, "rewards/margins_min": 0.07322058826684952, "rewards/margins_std": 0.09309452027082443, "rewards/rejected": -0.08847320824861526, "step": 1800 }, { "epoch": 0.57, "grad_norm": 0.421875, "learning_rate": 4.6482915104304373e-07, "logits/chosen": -1.3048204183578491, "logits/rejected": -0.8675423860549927, "logps/chosen": -255.3675079345703, "logps/rejected": -200.3520050048828, "loss": 0.6279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04115379601716995, "rewards/margins": 0.16566821932792664, "rewards/margins_max": 0.2521827518939972, "rewards/margins_min": 0.07915371656417847, "rewards/margins_std": 0.12234999984502792, "rewards/rejected": -0.12451444566249847, "step": 1810 }, { "epoch": 0.57, "grad_norm": 0.50390625, "learning_rate": 4.593450260660775e-07, "logits/chosen": -1.3136993646621704, "logits/rejected": -1.0857809782028198, "logps/chosen": -173.83749389648438, "logps/rejected": -204.83389282226562, "loss": 0.6326, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.031986989080905914, "rewards/margins": 0.11422686278820038, "rewards/margins_max": 0.1664976328611374, "rewards/margins_min": 0.061956118792295456, "rewards/margins_std": 0.07392201572656631, "rewards/rejected": -0.08223988860845566, "step": 1820 }, { "epoch": 0.58, "grad_norm": 0.30859375, "learning_rate": 4.5386582026834904e-07, "logits/chosen": -1.407066822052002, "logits/rejected": -1.1559934616088867, "logps/chosen": -194.22482299804688, "logps/rejected": -226.273193359375, "loss": 0.6325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.06259353458881378, "rewards/margins": 0.1343492567539215, "rewards/margins_max": 0.2052960842847824, "rewards/margins_min": 0.06340241432189941, "rewards/margins_std": 0.10033398866653442, "rewards/rejected": -0.07175572961568832, "step": 1830 }, { "epoch": 0.58, "grad_norm": 0.49609375, "learning_rate": 4.483921966239739e-07, "logits/chosen": -1.3300710916519165, "logits/rejected": -1.107683539390564, "logps/chosen": -202.5210723876953, "logps/rejected": -293.78265380859375, "loss": 0.6212, "rewards/accuracies": 1.0, "rewards/chosen": 0.0330859050154686, "rewards/margins": 0.18205811083316803, "rewards/margins_max": 0.25554248690605164, "rewards/margins_min": 0.10857371240854263, "rewards/margins_std": 0.10392262041568756, "rewards/rejected": -0.14897218346595764, "step": 1840 }, { "epoch": 0.58, "grad_norm": 0.55078125, "learning_rate": 4.429248174316375e-07, "logits/chosen": -1.50588059425354, "logits/rejected": -1.1238529682159424, "logps/chosen": -256.1008605957031, "logps/rejected": -309.0824279785156, "loss": 0.6336, "rewards/accuracies": 1.0, "rewards/chosen": 0.0622735433280468, "rewards/margins": 0.16536816954612732, "rewards/margins_max": 0.22724337875843048, "rewards/margins_min": 0.10349295288324356, "rewards/margins_std": 0.08750475943088531, "rewards/rejected": -0.10309461504220963, "step": 1850 }, { "epoch": 0.59, "grad_norm": 0.46484375, "learning_rate": 4.374643442344576e-07, "logits/chosen": -1.2853957414627075, "logits/rejected": -0.9773873090744019, "logps/chosen": -202.54776000976562, "logps/rejected": -197.9080352783203, "loss": 0.6202, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03965754434466362, "rewards/margins": 0.17071697115898132, "rewards/margins_max": 0.264207661151886, "rewards/margins_min": 0.07722628116607666, "rewards/margins_std": 0.13221579790115356, "rewards/rejected": -0.1310594230890274, "step": 1860 }, { "epoch": 0.59, "grad_norm": 0.333984375, "learning_rate": 4.3201143773993864e-07, "logits/chosen": -1.4333173036575317, "logits/rejected": -0.9873960614204407, "logps/chosen": -254.46841430664062, "logps/rejected": -246.49880981445312, "loss": 0.6263, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.041944604367017746, "rewards/margins": 0.1601918637752533, "rewards/margins_max": 0.23515033721923828, "rewards/margins_min": 0.08523334562778473, "rewards/margins_std": 0.10600732266902924, "rewards/rejected": -0.11824724823236465, "step": 1870 }, { "epoch": 0.59, "grad_norm": 0.373046875, "learning_rate": 4.2656675774002773e-07, "logits/chosen": -1.3904842138290405, "logits/rejected": -0.9373539686203003, "logps/chosen": -249.3865203857422, "logps/rejected": -224.6757354736328, "loss": 0.6203, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04652171581983566, "rewards/margins": 0.18809521198272705, "rewards/margins_max": 0.27226054668426514, "rewards/margins_min": 0.10392986238002777, "rewards/margins_std": 0.1190277710556984, "rewards/rejected": -0.1415734887123108, "step": 1880 }, { "epoch": 0.6, "grad_norm": 0.66796875, "learning_rate": 4.211309630312812e-07, "logits/chosen": -1.2958967685699463, "logits/rejected": -0.9569181203842163, "logps/chosen": -210.689453125, "logps/rejected": -248.34646606445312, "loss": 0.6234, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.046597760170698166, "rewards/margins": 0.1351958066225052, "rewards/margins_max": 0.1851089894771576, "rewards/margins_min": 0.08528260141611099, "rewards/margins_std": 0.07058792561292648, "rewards/rejected": -0.08859803527593613, "step": 1890 }, { "epoch": 0.6, "grad_norm": 0.5078125, "learning_rate": 4.1570471133515033e-07, "logits/chosen": -1.539520025253296, "logits/rejected": -1.1000487804412842, "logps/chosen": -274.437744140625, "logps/rejected": -278.19781494140625, "loss": 0.626, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.029712975025177002, "rewards/margins": 0.13053396344184875, "rewards/margins_max": 0.2025536596775055, "rewards/margins_min": 0.058514248579740524, "rewards/margins_std": 0.10185122489929199, "rewards/rejected": -0.10082097351551056, "step": 1900 }, { "epoch": 0.6, "grad_norm": 0.46484375, "learning_rate": 4.102886592183995e-07, "logits/chosen": -1.347434401512146, "logits/rejected": -0.91209876537323, "logps/chosen": -275.15252685546875, "logps/rejected": -234.12081909179688, "loss": 0.6245, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.043122995644807816, "rewards/margins": 0.12455437332391739, "rewards/margins_max": 0.19914036989212036, "rewards/margins_min": 0.049968358129262924, "rewards/margins_std": 0.10548055171966553, "rewards/rejected": -0.08143137395381927, "step": 1910 }, { "epoch": 0.6, "grad_norm": 0.40234375, "learning_rate": 4.048834620136618e-07, "logits/chosen": -1.3737701177597046, "logits/rejected": -1.062455415725708, "logps/chosen": -242.2274169921875, "logps/rejected": -230.4326629638672, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 0.04452799633145332, "rewards/margins": 0.16601449251174927, "rewards/margins_max": 0.25104230642318726, "rewards/margins_min": 0.08098666369915009, "rewards/margins_std": 0.1202474981546402, "rewards/rejected": -0.12148649990558624, "step": 1920 }, { "epoch": 0.61, "grad_norm": 0.427734375, "learning_rate": 3.9948977374014545e-07, "logits/chosen": -1.3358091115951538, "logits/rejected": -0.9706098437309265, "logps/chosen": -188.8526153564453, "logps/rejected": -214.7004852294922, "loss": 0.6333, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04406962916254997, "rewards/margins": 0.1181538924574852, "rewards/margins_max": 0.17844833433628082, "rewards/margins_min": 0.057859472930431366, "rewards/margins_std": 0.08526919782161713, "rewards/rejected": -0.07408426702022552, "step": 1930 }, { "epoch": 0.61, "grad_norm": 0.349609375, "learning_rate": 3.941082470244987e-07, "logits/chosen": -1.3635414838790894, "logits/rejected": -1.0531474351882935, "logps/chosen": -260.57159423828125, "logps/rejected": -218.0052032470703, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": 0.05570930987596512, "rewards/margins": 0.17012056708335876, "rewards/margins_max": 0.2493591010570526, "rewards/margins_min": 0.09088209271430969, "rewards/margins_std": 0.11206014454364777, "rewards/rejected": -0.11441127955913544, "step": 1940 }, { "epoch": 0.61, "grad_norm": 0.5859375, "learning_rate": 3.8873953302184283e-07, "logits/chosen": -1.468860387802124, "logits/rejected": -1.0624154806137085, "logps/chosen": -321.9212951660156, "logps/rejected": -267.7339782714844, "loss": 0.617, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0559014193713665, "rewards/margins": 0.16720013320446014, "rewards/margins_max": 0.22909954190254211, "rewards/margins_min": 0.10530078411102295, "rewards/margins_std": 0.08753892034292221, "rewards/rejected": -0.11129872500896454, "step": 1950 }, { "epoch": 0.62, "grad_norm": 0.41796875, "learning_rate": 3.8338428133698396e-07, "logits/chosen": -1.351555585861206, "logits/rejected": -1.0230903625488281, "logps/chosen": -206.9521026611328, "logps/rejected": -223.57754516601562, "loss": 0.6306, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03831537812948227, "rewards/margins": 0.11696416139602661, "rewards/margins_max": 0.1850701868534088, "rewards/margins_min": 0.04885811731219292, "rewards/margins_std": 0.09631648659706116, "rewards/rejected": -0.07864876836538315, "step": 1960 }, { "epoch": 0.62, "grad_norm": 0.392578125, "learning_rate": 3.780431399458114e-07, "logits/chosen": -1.3833266496658325, "logits/rejected": -0.972434401512146, "logps/chosen": -220.4893035888672, "logps/rejected": -226.40487670898438, "loss": 0.6258, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03797771409153938, "rewards/margins": 0.14182588458061218, "rewards/margins_max": 0.19501110911369324, "rewards/margins_min": 0.08864064514636993, "rewards/margins_std": 0.07521527260541916, "rewards/rejected": -0.1038481742143631, "step": 1970 }, { "epoch": 0.62, "grad_norm": 0.419921875, "learning_rate": 3.7271675511689473e-07, "logits/chosen": -1.570255160331726, "logits/rejected": -1.1389049291610718, "logps/chosen": -190.87924194335938, "logps/rejected": -196.2376251220703, "loss": 0.6235, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05351024121046066, "rewards/margins": 0.14232777059078217, "rewards/margins_max": 0.20520441234111786, "rewards/margins_min": 0.07945115119218826, "rewards/margins_std": 0.08892098814249039, "rewards/rejected": -0.0888175368309021, "step": 1980 }, { "epoch": 0.63, "grad_norm": 0.3671875, "learning_rate": 3.674057713332852e-07, "logits/chosen": -1.418208122253418, "logits/rejected": -1.0676791667938232, "logps/chosen": -222.4821014404297, "logps/rejected": -205.3859405517578, "loss": 0.6245, "rewards/accuracies": 1.0, "rewards/chosen": 0.04518379643559456, "rewards/margins": 0.15014731884002686, "rewards/margins_max": 0.23736615478992462, "rewards/margins_min": 0.06292847543954849, "rewards/margins_std": 0.12334605306386948, "rewards/rejected": -0.104963518679142, "step": 1990 }, { "epoch": 0.63, "grad_norm": 0.408203125, "learning_rate": 3.6211083121453566e-07, "logits/chosen": -1.5330979824066162, "logits/rejected": -1.023696780204773, "logps/chosen": -215.0279541015625, "logps/rejected": -207.7816162109375, "loss": 0.6251, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.048994213342666626, "rewards/margins": 0.15364879369735718, "rewards/margins_max": 0.23372311890125275, "rewards/margins_min": 0.07357443124055862, "rewards/margins_std": 0.11324223130941391, "rewards/rejected": -0.10465456545352936, "step": 2000 }, { "epoch": 0.63, "grad_norm": 0.376953125, "learning_rate": 3.568325754389437e-07, "logits/chosen": -1.4234205484390259, "logits/rejected": -0.9812124371528625, "logps/chosen": -233.41748046875, "logps/rejected": -208.1199493408203, "loss": 0.6291, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0440436489880085, "rewards/margins": 0.12118716537952423, "rewards/margins_max": 0.17412233352661133, "rewards/margins_min": 0.06825198978185654, "rewards/margins_std": 0.07486163079738617, "rewards/rejected": -0.07714351266622543, "step": 2010 }, { "epoch": 0.64, "grad_norm": 0.3828125, "learning_rate": 3.515716426660314e-07, "logits/chosen": -1.2047122716903687, "logits/rejected": -0.977331280708313, "logps/chosen": -228.931396484375, "logps/rejected": -291.92071533203125, "loss": 0.6243, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04894689470529556, "rewards/margins": 0.1800243854522705, "rewards/margins_max": 0.2859472930431366, "rewards/margins_min": 0.07410150021314621, "rewards/margins_std": 0.14979760348796844, "rewards/rejected": -0.13107749819755554, "step": 2020 }, { "epoch": 0.64, "grad_norm": 0.5, "learning_rate": 3.463286694592685e-07, "logits/chosen": -1.5284559726715088, "logits/rejected": -1.197145700454712, "logps/chosen": -280.45672607421875, "logps/rejected": -251.2618865966797, "loss": 0.6211, "rewards/accuracies": 1.0, "rewards/chosen": 0.049861326813697815, "rewards/margins": 0.14641502499580383, "rewards/margins_max": 0.20501860976219177, "rewards/margins_min": 0.0878114253282547, "rewards/margins_std": 0.08287801593542099, "rewards/rejected": -0.09655369818210602, "step": 2030 }, { "epoch": 0.64, "grad_norm": 0.375, "learning_rate": 3.4110429020904916e-07, "logits/chosen": -1.3765310049057007, "logits/rejected": -0.8682848811149597, "logps/chosen": -331.6241760253906, "logps/rejected": -291.6063232421875, "loss": 0.6177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.040959432721138, "rewards/margins": 0.19673797488212585, "rewards/margins_max": 0.30799782276153564, "rewards/margins_min": 0.08547808974981308, "rewards/margins_std": 0.1573452204465866, "rewards/rejected": -0.15577852725982666, "step": 2040 }, { "epoch": 0.65, "grad_norm": 0.427734375, "learning_rate": 3.358991370559323e-07, "logits/chosen": -1.4426238536834717, "logits/rejected": -1.0889427661895752, "logps/chosen": -212.64932250976562, "logps/rejected": -232.1751708984375, "loss": 0.617, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.061146706342697144, "rewards/margins": 0.1886657178401947, "rewards/margins_max": 0.285667359828949, "rewards/margins_min": 0.09166404604911804, "rewards/margins_std": 0.13718107342720032, "rewards/rejected": -0.12751901149749756, "step": 2050 }, { "epoch": 0.65, "grad_norm": 0.33203125, "learning_rate": 3.307138398141528e-07, "logits/chosen": -1.4597798585891724, "logits/rejected": -1.02079176902771, "logps/chosen": -205.18765258789062, "logps/rejected": -242.97683715820312, "loss": 0.6111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0566553995013237, "rewards/margins": 0.18628427386283875, "rewards/margins_max": 0.2999788224697113, "rewards/margins_min": 0.07258973270654678, "rewards/margins_std": 0.1607883721590042, "rewards/rejected": -0.12962886691093445, "step": 2060 }, { "epoch": 0.65, "grad_norm": 0.61328125, "learning_rate": 3.2554902589541664e-07, "logits/chosen": -1.4502575397491455, "logits/rejected": -1.0646450519561768, "logps/chosen": -172.86062622070312, "logps/rejected": -165.33616638183594, "loss": 0.6217, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04238683357834816, "rewards/margins": 0.11539296805858612, "rewards/margins_max": 0.17349234223365784, "rewards/margins_min": 0.05729362368583679, "rewards/margins_std": 0.08216488361358643, "rewards/rejected": -0.07300613820552826, "step": 2070 }, { "epoch": 0.66, "grad_norm": 0.40625, "learning_rate": 3.204053202329835e-07, "logits/chosen": -1.3668615818023682, "logits/rejected": -0.9603246450424194, "logps/chosen": -245.4371795654297, "logps/rejected": -241.2156524658203, "loss": 0.6261, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03958984464406967, "rewards/margins": 0.15117119252681732, "rewards/margins_max": 0.22184331715106964, "rewards/margins_min": 0.080499067902565, "rewards/margins_std": 0.09994547069072723, "rewards/rejected": -0.11158134043216705, "step": 2080 }, { "epoch": 0.66, "grad_norm": 0.4296875, "learning_rate": 3.1528334520605216e-07, "logits/chosen": -1.282301425933838, "logits/rejected": -1.1172327995300293, "logps/chosen": -212.07632446289062, "logps/rejected": -271.19989013671875, "loss": 0.6307, "rewards/accuracies": 1.0, "rewards/chosen": 0.03222063556313515, "rewards/margins": 0.157441645860672, "rewards/margins_max": 0.22982993721961975, "rewards/margins_min": 0.08505336940288544, "rewards/margins_std": 0.10237250477075577, "rewards/rejected": -0.12522102892398834, "step": 2090 }, { "epoch": 0.66, "grad_norm": 0.43359375, "learning_rate": 3.1018372056445305e-07, "logits/chosen": -1.3811043500900269, "logits/rejected": -0.9078986048698425, "logps/chosen": -239.8875274658203, "logps/rejected": -216.6652069091797, "loss": 0.6223, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.046834252774715424, "rewards/margins": 0.18769201636314392, "rewards/margins_max": 0.2837710678577423, "rewards/margins_min": 0.09161292761564255, "rewards/margins_std": 0.13587632775306702, "rewards/rejected": -0.1408577710390091, "step": 2100 }, { "epoch": 0.66, "grad_norm": 0.421875, "learning_rate": 3.0510706335366034e-07, "logits/chosen": -1.4970591068267822, "logits/rejected": -1.091909408569336, "logps/chosen": -195.78086853027344, "logps/rejected": -198.8799591064453, "loss": 0.6334, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.054044730961322784, "rewards/margins": 0.12333653122186661, "rewards/margins_max": 0.1747966706752777, "rewards/margins_min": 0.0718763917684555, "rewards/margins_std": 0.07277561724185944, "rewards/rejected": -0.06929179280996323, "step": 2110 }, { "epoch": 0.67, "grad_norm": 0.474609375, "learning_rate": 3.000539878401296e-07, "logits/chosen": -1.4365322589874268, "logits/rejected": -1.1631158590316772, "logps/chosen": -179.49465942382812, "logps/rejected": -198.1630401611328, "loss": 0.6298, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04496608301997185, "rewards/margins": 0.11149311065673828, "rewards/margins_max": 0.1663789451122284, "rewards/margins_min": 0.05660729482769966, "rewards/margins_std": 0.07762027531862259, "rewards/rejected": -0.06652702391147614, "step": 2120 }, { "epoch": 0.67, "grad_norm": 0.41796875, "learning_rate": 2.9502510543697323e-07, "logits/chosen": -1.4205095767974854, "logits/rejected": -1.1358397006988525, "logps/chosen": -190.26431274414062, "logps/rejected": -216.4228057861328, "loss": 0.6252, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0446227490901947, "rewards/margins": 0.13534995913505554, "rewards/margins_max": 0.2004757821559906, "rewards/margins_min": 0.07022411376237869, "rewards/margins_std": 0.09210184216499329, "rewards/rejected": -0.09072719514369965, "step": 2130 }, { "epoch": 0.67, "grad_norm": 0.482421875, "learning_rate": 2.900210246299808e-07, "logits/chosen": -1.3750842809677124, "logits/rejected": -0.9162171483039856, "logps/chosen": -235.06005859375, "logps/rejected": -295.83526611328125, "loss": 0.6131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06156709045171738, "rewards/margins": 0.19232510030269623, "rewards/margins_max": 0.29850488901138306, "rewards/margins_min": 0.0861453041434288, "rewards/margins_std": 0.15016090869903564, "rewards/rejected": -0.13075801730155945, "step": 2140 }, { "epoch": 0.68, "grad_norm": 0.404296875, "learning_rate": 2.8504235090399275e-07, "logits/chosen": -1.246057391166687, "logits/rejected": -1.2714288234710693, "logps/chosen": -128.15052795410156, "logps/rejected": -195.8909454345703, "loss": 0.6329, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03380041942000389, "rewards/margins": 0.14983999729156494, "rewards/margins_max": 0.23497620224952698, "rewards/margins_min": 0.06470384448766708, "rewards/margins_std": 0.12040072679519653, "rewards/rejected": -0.11603958904743195, "step": 2150 }, { "epoch": 0.68, "grad_norm": 0.4609375, "learning_rate": 2.800896866696382e-07, "logits/chosen": -1.375130534172058, "logits/rejected": -1.0531413555145264, "logps/chosen": -180.38128662109375, "logps/rejected": -198.62501525878906, "loss": 0.6281, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04046819731593132, "rewards/margins": 0.1361556351184845, "rewards/margins_max": 0.22056671977043152, "rewards/margins_min": 0.05174453184008598, "rewards/margins_std": 0.1193753108382225, "rewards/rejected": -0.09568743407726288, "step": 2160 }, { "epoch": 0.68, "grad_norm": 0.408203125, "learning_rate": 2.7516363119044437e-07, "logits/chosen": -1.3951034545898438, "logits/rejected": -1.1207072734832764, "logps/chosen": -194.72064208984375, "logps/rejected": -230.1772003173828, "loss": 0.6286, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04992605373263359, "rewards/margins": 0.12686730921268463, "rewards/margins_max": 0.18470284342765808, "rewards/margins_min": 0.06903177499771118, "rewards/margins_std": 0.08179178088903427, "rewards/rejected": -0.07694125175476074, "step": 2170 }, { "epoch": 0.69, "grad_norm": 0.50390625, "learning_rate": 2.702647805103262e-07, "logits/chosen": -1.4403280019760132, "logits/rejected": -1.1164398193359375, "logps/chosen": -186.88186645507812, "logps/rejected": -217.31051635742188, "loss": 0.6124, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.049945756793022156, "rewards/margins": 0.16797830164432526, "rewards/margins_max": 0.266406774520874, "rewards/margins_min": 0.0695497915148735, "rewards/margins_std": 0.1391989141702652, "rewards/rejected": -0.1180325299501419, "step": 2180 }, { "epoch": 0.69, "grad_norm": 0.51953125, "learning_rate": 2.6539372738146694e-07, "logits/chosen": -1.4543806314468384, "logits/rejected": -0.953220009803772, "logps/chosen": -255.16073608398438, "logps/rejected": -233.70040893554688, "loss": 0.6168, "rewards/accuracies": 1.0, "rewards/chosen": 0.03919634595513344, "rewards/margins": 0.20791907608509064, "rewards/margins_max": 0.3329697251319885, "rewards/margins_min": 0.08286843448877335, "rewards/margins_std": 0.17684832215309143, "rewards/rejected": -0.1687227189540863, "step": 2190 }, { "epoch": 0.69, "grad_norm": 0.6328125, "learning_rate": 2.605510611925955e-07, "logits/chosen": -1.2929773330688477, "logits/rejected": -0.9437387585639954, "logps/chosen": -202.4819793701172, "logps/rejected": -246.94662475585938, "loss": 0.6224, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04549327492713928, "rewards/margins": 0.17824196815490723, "rewards/margins_max": 0.2709406912326813, "rewards/margins_min": 0.08554325997829437, "rewards/margins_std": 0.131095752120018, "rewards/rejected": -0.13274869322776794, "step": 2200 }, { "epoch": 0.7, "grad_norm": 0.462890625, "learning_rate": 2.557373678976723e-07, "logits/chosen": -1.408676266670227, "logits/rejected": -0.9201229214668274, "logps/chosen": -237.0033721923828, "logps/rejected": -179.01229858398438, "loss": 0.6269, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.043176356703042984, "rewards/margins": 0.1315043866634369, "rewards/margins_max": 0.21515369415283203, "rewards/margins_min": 0.047855086624622345, "rewards/margins_std": 0.11829797923564911, "rewards/rejected": -0.0883280336856842, "step": 2210 }, { "epoch": 0.7, "grad_norm": 0.318359375, "learning_rate": 2.5095322994498846e-07, "logits/chosen": -1.5247070789337158, "logits/rejected": -1.1096593141555786, "logps/chosen": -250.4240264892578, "logps/rejected": -194.3154296875, "loss": 0.6146, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04496272653341293, "rewards/margins": 0.1702985316514969, "rewards/margins_max": 0.27228885889053345, "rewards/margins_min": 0.06830821931362152, "rewards/margins_std": 0.14423608779907227, "rewards/rejected": -0.12533581256866455, "step": 2220 }, { "epoch": 0.7, "grad_norm": 0.421875, "learning_rate": 2.4619922620669215e-07, "logits/chosen": -1.4068553447723389, "logits/rejected": -1.0456701517105103, "logps/chosen": -208.51882934570312, "logps/rejected": -200.5174102783203, "loss": 0.6294, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05011656880378723, "rewards/margins": 0.14396075904369354, "rewards/margins_max": 0.22980065643787384, "rewards/margins_min": 0.058120857924222946, "rewards/margins_std": 0.12139594554901123, "rewards/rejected": -0.09384419023990631, "step": 2230 }, { "epoch": 0.71, "grad_norm": 0.51171875, "learning_rate": 2.414759319087452e-07, "logits/chosen": -1.5388705730438232, "logits/rejected": -1.0942213535308838, "logps/chosen": -234.5261688232422, "logps/rejected": -196.3878631591797, "loss": 0.6207, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04927833378314972, "rewards/margins": 0.12129292637109756, "rewards/margins_max": 0.17791934311389923, "rewards/margins_min": 0.06466653198003769, "rewards/margins_std": 0.08008182793855667, "rewards/rejected": -0.07201460003852844, "step": 2240 }, { "epoch": 0.71, "grad_norm": 0.51171875, "learning_rate": 2.3678391856132202e-07, "logits/chosen": -1.5716922283172607, "logits/rejected": -1.0814467668533325, "logps/chosen": -236.632080078125, "logps/rejected": -212.31906127929688, "loss": 0.6342, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.044985391199588776, "rewards/margins": 0.15734454989433289, "rewards/margins_max": 0.2386656105518341, "rewards/margins_min": 0.07602350413799286, "rewards/margins_std": 0.11500532925128937, "rewards/rejected": -0.1123591810464859, "step": 2250 }, { "epoch": 0.71, "grad_norm": 0.478515625, "learning_rate": 2.321237538896579e-07, "logits/chosen": -1.3435510396957397, "logits/rejected": -1.0200581550598145, "logps/chosen": -340.9153747558594, "logps/rejected": -310.7401428222656, "loss": 0.6111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0401308573782444, "rewards/margins": 0.1969205141067505, "rewards/margins_max": 0.30011191964149475, "rewards/margins_min": 0.09372911602258682, "rewards/margins_std": 0.14593467116355896, "rewards/rejected": -0.1567896604537964, "step": 2260 }, { "epoch": 0.72, "grad_norm": 0.58984375, "learning_rate": 2.2749600176535533e-07, "logits/chosen": -1.4122085571289062, "logits/rejected": -0.8751947283744812, "logps/chosen": -255.9865264892578, "logps/rejected": -214.06289672851562, "loss": 0.6122, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04100383445620537, "rewards/margins": 0.16915416717529297, "rewards/margins_max": 0.25182080268859863, "rewards/margins_min": 0.08648748695850372, "rewards/margins_std": 0.11690831184387207, "rewards/rejected": -0.1281503140926361, "step": 2270 }, { "epoch": 0.72, "grad_norm": 0.43359375, "learning_rate": 2.2290122213815605e-07, "logits/chosen": -1.3835846185684204, "logits/rejected": -1.1778188943862915, "logps/chosen": -212.3909149169922, "logps/rejected": -286.0127868652344, "loss": 0.6164, "rewards/accuracies": 1.0, "rewards/chosen": 0.05380062013864517, "rewards/margins": 0.19799622893333435, "rewards/margins_max": 0.3002737760543823, "rewards/margins_min": 0.09571869671344757, "rewards/margins_std": 0.14464230835437775, "rewards/rejected": -0.14419563114643097, "step": 2280 }, { "epoch": 0.72, "grad_norm": 0.46484375, "learning_rate": 2.1833997096818895e-07, "logits/chosen": -1.2950363159179688, "logits/rejected": -1.0533965826034546, "logps/chosen": -207.44741821289062, "logps/rejected": -266.3078918457031, "loss": 0.6148, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04672679305076599, "rewards/margins": 0.18707481026649475, "rewards/margins_max": 0.26824522018432617, "rewards/margins_min": 0.10590440034866333, "rewards/margins_std": 0.11479228734970093, "rewards/rejected": -0.14034804701805115, "step": 2290 }, { "epoch": 0.72, "grad_norm": 0.64453125, "learning_rate": 2.1381280015869956e-07, "logits/chosen": -1.3823282718658447, "logits/rejected": -0.8315450549125671, "logps/chosen": -262.98089599609375, "logps/rejected": -228.6239013671875, "loss": 0.6403, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04306049272418022, "rewards/margins": 0.13284507393836975, "rewards/margins_max": 0.195632204413414, "rewards/margins_min": 0.07005792111158371, "rewards/margins_std": 0.08879442512989044, "rewards/rejected": -0.08978457748889923, "step": 2300 }, { "epoch": 0.73, "grad_norm": 0.494140625, "learning_rate": 2.0932025748927014e-07, "logits/chosen": -1.300336480140686, "logits/rejected": -1.0065299272537231, "logps/chosen": -266.71722412109375, "logps/rejected": -252.8298797607422, "loss": 0.6336, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.045079831033945084, "rewards/margins": 0.14786089956760406, "rewards/margins_max": 0.21760694682598114, "rewards/margins_min": 0.0781148225069046, "rewards/margins_std": 0.09863585978746414, "rewards/rejected": -0.10278107225894928, "step": 2310 }, { "epoch": 0.73, "grad_norm": 0.51953125, "learning_rate": 2.0486288654954027e-07, "logits/chosen": -1.4983582496643066, "logits/rejected": -1.009060025215149, "logps/chosen": -221.823486328125, "logps/rejected": -218.0142364501953, "loss": 0.6192, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06399594247341156, "rewards/margins": 0.17152294516563416, "rewards/margins_max": 0.24920019507408142, "rewards/margins_min": 0.0938456803560257, "rewards/margins_std": 0.10985223203897476, "rewards/rejected": -0.1075270026922226, "step": 2320 }, { "epoch": 0.73, "grad_norm": 0.392578125, "learning_rate": 2.0044122667343295e-07, "logits/chosen": -1.349254846572876, "logits/rejected": -0.9567793011665344, "logps/chosen": -220.45394897460938, "logps/rejected": -214.1141815185547, "loss": 0.6235, "rewards/accuracies": 1.0, "rewards/chosen": 0.055171359330415726, "rewards/margins": 0.16959527134895325, "rewards/margins_max": 0.24443653225898743, "rewards/margins_min": 0.09475398808717728, "rewards/margins_std": 0.10584155470132828, "rewards/rejected": -0.11442389339208603, "step": 2330 }, { "epoch": 0.74, "grad_norm": 0.53125, "learning_rate": 1.9605581287389633e-07, "logits/chosen": -1.3867130279541016, "logits/rejected": -1.2174745798110962, "logps/chosen": -196.43331909179688, "logps/rejected": -213.40365600585938, "loss": 0.6256, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03618159890174866, "rewards/margins": 0.15966863930225372, "rewards/margins_max": 0.2609175443649292, "rewards/margins_min": 0.05841972678899765, "rewards/margins_std": 0.14318758249282837, "rewards/rejected": -0.12348704040050507, "step": 2340 }, { "epoch": 0.74, "grad_norm": 0.52734375, "learning_rate": 1.9170717577816786e-07, "logits/chosen": -1.4080374240875244, "logits/rejected": -1.087418794631958, "logps/chosen": -227.89010620117188, "logps/rejected": -259.26116943359375, "loss": 0.6217, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03795627877116203, "rewards/margins": 0.15768983960151672, "rewards/margins_max": 0.24818861484527588, "rewards/margins_min": 0.06719101220369339, "rewards/margins_std": 0.1279846429824829, "rewards/rejected": -0.1197335347533226, "step": 2350 }, { "epoch": 0.74, "grad_norm": 0.50390625, "learning_rate": 1.873958415635698e-07, "logits/chosen": -1.5294129848480225, "logits/rejected": -1.2719072103500366, "logps/chosen": -238.9661102294922, "logps/rejected": -242.7552947998047, "loss": 0.6315, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04142700880765915, "rewards/margins": 0.14358402788639069, "rewards/margins_max": 0.21057486534118652, "rewards/margins_min": 0.07659320533275604, "rewards/margins_std": 0.09473933279514313, "rewards/rejected": -0.10215701907873154, "step": 2360 }, { "epoch": 0.75, "grad_norm": 0.65625, "learning_rate": 1.8312233189384192e-07, "logits/chosen": -1.48410964012146, "logits/rejected": -0.9692693948745728, "logps/chosen": -229.7482452392578, "logps/rejected": -221.9854278564453, "loss": 0.6061, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.059097446501255035, "rewards/margins": 0.17855104804039001, "rewards/margins_max": 0.27842646837234497, "rewards/margins_min": 0.07867564260959625, "rewards/margins_std": 0.14124515652656555, "rewards/rejected": -0.11945360898971558, "step": 2370 }, { "epoch": 0.75, "grad_norm": 0.412109375, "learning_rate": 1.7888716385602205e-07, "logits/chosen": -1.4760620594024658, "logits/rejected": -0.9696518182754517, "logps/chosen": -222.237548828125, "logps/rejected": -202.98983764648438, "loss": 0.6081, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04904181510210037, "rewards/margins": 0.19830942153930664, "rewards/margins_max": 0.2989969253540039, "rewards/margins_min": 0.09762193262577057, "rewards/margins_std": 0.14239361882209778, "rewards/rejected": -0.14926761388778687, "step": 2380 }, { "epoch": 0.75, "grad_norm": 0.462890625, "learning_rate": 1.7469084989787908e-07, "logits/chosen": -1.5105534791946411, "logits/rejected": -1.268873929977417, "logps/chosen": -218.3324432373047, "logps/rejected": -277.1694030761719, "loss": 0.6269, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05644305422902107, "rewards/margins": 0.1589832305908203, "rewards/margins_max": 0.2425994873046875, "rewards/margins_min": 0.07536697387695312, "rewards/margins_std": 0.11825122684240341, "rewards/rejected": -0.10254015773534775, "step": 2390 }, { "epoch": 0.76, "grad_norm": 0.5546875, "learning_rate": 1.705338977659071e-07, "logits/chosen": -1.3833867311477661, "logits/rejected": -1.0862524509429932, "logps/chosen": -222.6822967529297, "logps/rejected": -247.6257781982422, "loss": 0.6138, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04524652659893036, "rewards/margins": 0.17377988994121552, "rewards/margins_max": 0.2417949140071869, "rewards/margins_min": 0.10576488077640533, "rewards/margins_std": 0.09618774056434631, "rewards/rejected": -0.12853336334228516, "step": 2400 }, { "epoch": 0.76, "grad_norm": 0.462890625, "learning_rate": 1.664168104438901e-07, "logits/chosen": -1.3712468147277832, "logits/rejected": -1.1276142597198486, "logps/chosen": -232.34817504882812, "logps/rejected": -245.1379852294922, "loss": 0.616, "rewards/accuracies": 1.0, "rewards/chosen": 0.04633709043264389, "rewards/margins": 0.16087999939918518, "rewards/margins_max": 0.24791808426380157, "rewards/margins_min": 0.07384191453456879, "rewards/margins_std": 0.123090460896492, "rewards/rejected": -0.11454291641712189, "step": 2410 }, { "epoch": 0.76, "grad_norm": 0.443359375, "learning_rate": 1.6234008609204104e-07, "logits/chosen": -1.4258971214294434, "logits/rejected": -1.0054690837860107, "logps/chosen": -274.0898132324219, "logps/rejected": -227.77304077148438, "loss": 0.6347, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04632297903299332, "rewards/margins": 0.1191258653998375, "rewards/margins_max": 0.17983102798461914, "rewards/margins_min": 0.058420680463314056, "rewards/margins_std": 0.0858500748872757, "rewards/rejected": -0.07280287891626358, "step": 2420 }, { "epoch": 0.77, "grad_norm": 0.412109375, "learning_rate": 1.5830421798672565e-07, "logits/chosen": -1.5638043880462646, "logits/rejected": -1.2185170650482178, "logps/chosen": -212.51797485351562, "logps/rejected": -266.51495361328125, "loss": 0.6243, "rewards/accuracies": 1.0, "rewards/chosen": 0.06803154945373535, "rewards/margins": 0.15410315990447998, "rewards/margins_max": 0.22409594058990479, "rewards/margins_min": 0.08411036431789398, "rewards/margins_std": 0.09898475557565689, "rewards/rejected": -0.08607159554958344, "step": 2430 }, { "epoch": 0.77, "grad_norm": 0.5703125, "learning_rate": 1.5430969446077675e-07, "logits/chosen": -1.206554651260376, "logits/rejected": -0.9525250196456909, "logps/chosen": -220.4082489013672, "logps/rejected": -265.2112731933594, "loss": 0.6248, "rewards/accuracies": 1.0, "rewards/chosen": 0.025963077321648598, "rewards/margins": 0.16358208656311035, "rewards/margins_max": 0.23860347270965576, "rewards/margins_min": 0.08856071531772614, "rewards/margins_std": 0.10609626770019531, "rewards/rejected": -0.1376190185546875, "step": 2440 }, { "epoch": 0.77, "grad_norm": 0.486328125, "learning_rate": 1.5035699884440695e-07, "logits/chosen": -1.298842191696167, "logits/rejected": -0.9138208627700806, "logps/chosen": -207.61990356445312, "logps/rejected": -222.8286590576172, "loss": 0.6145, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.034749679267406464, "rewards/margins": 0.13267698884010315, "rewards/margins_max": 0.20068855583667755, "rewards/margins_min": 0.06466543674468994, "rewards/margins_std": 0.09618286788463593, "rewards/rejected": -0.09792731702327728, "step": 2450 }, { "epoch": 0.77, "grad_norm": 0.5234375, "learning_rate": 1.4644660940672627e-07, "logits/chosen": -1.326765775680542, "logits/rejected": -1.1471151113510132, "logps/chosen": -204.8334197998047, "logps/rejected": -205.65042114257812, "loss": 0.6283, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018175512552261353, "rewards/margins": 0.12012593448162079, "rewards/margins_max": 0.1839393973350525, "rewards/margins_min": 0.05631248280405998, "rewards/margins_std": 0.09024585783481598, "rewards/rejected": -0.10195042937994003, "step": 2460 }, { "epoch": 0.78, "grad_norm": 0.451171875, "learning_rate": 1.4257899929787292e-07, "logits/chosen": -1.4262887239456177, "logits/rejected": -1.038913607597351, "logps/chosen": -245.40530395507812, "logps/rejected": -239.4207305908203, "loss": 0.6269, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05134139209985733, "rewards/margins": 0.15652242302894592, "rewards/margins_max": 0.22957094013690948, "rewards/margins_min": 0.08347393572330475, "rewards/margins_std": 0.10330617427825928, "rewards/rejected": -0.10518103837966919, "step": 2470 }, { "epoch": 0.78, "grad_norm": 0.328125, "learning_rate": 1.3875463649176282e-07, "logits/chosen": -1.3459253311157227, "logits/rejected": -1.2240893840789795, "logps/chosen": -160.60562133789062, "logps/rejected": -167.91712951660156, "loss": 0.6363, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03096461296081543, "rewards/margins": 0.10902484506368637, "rewards/margins_max": 0.15568535029888153, "rewards/margins_min": 0.062364332377910614, "rewards/margins_std": 0.06598792225122452, "rewards/rejected": -0.07806022465229034, "step": 2480 }, { "epoch": 0.78, "grad_norm": 0.462890625, "learning_rate": 1.34973983729465e-07, "logits/chosen": -1.245940089225769, "logits/rejected": -0.9054630994796753, "logps/chosen": -240.6822509765625, "logps/rejected": -227.9288787841797, "loss": 0.6368, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05721098184585571, "rewards/margins": 0.12238309532403946, "rewards/margins_max": 0.16339154541492462, "rewards/margins_min": 0.0813746303319931, "rewards/margins_std": 0.05799471214413643, "rewards/rejected": -0.06517211347818375, "step": 2490 }, { "epoch": 0.79, "grad_norm": 0.482421875, "learning_rate": 1.312374984632118e-07, "logits/chosen": -1.4160993099212646, "logits/rejected": -0.9629823565483093, "logps/chosen": -220.81332397460938, "logps/rejected": -166.48544311523438, "loss": 0.6145, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.025352340191602707, "rewards/margins": 0.1693764626979828, "rewards/margins_max": 0.24544978141784668, "rewards/margins_min": 0.0933031514286995, "rewards/margins_std": 0.1075839176774025, "rewards/rejected": -0.14402411878108978, "step": 2500 }, { "epoch": 0.79, "grad_norm": 0.5703125, "learning_rate": 1.2754563280104714e-07, "logits/chosen": -1.5705251693725586, "logits/rejected": -1.1976501941680908, "logps/chosen": -190.46922302246094, "logps/rejected": -252.851806640625, "loss": 0.6235, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.06647202372550964, "rewards/margins": 0.14776286482810974, "rewards/margins_max": 0.23496422171592712, "rewards/margins_min": 0.06056150048971176, "rewards/margins_std": 0.12332135438919067, "rewards/rejected": -0.0812908411026001, "step": 2510 }, { "epoch": 0.79, "grad_norm": 0.5859375, "learning_rate": 1.238988334521226e-07, "logits/chosen": -1.442922592163086, "logits/rejected": -1.111483097076416, "logps/chosen": -182.8762664794922, "logps/rejected": -199.7118682861328, "loss": 0.6118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.050991643220186234, "rewards/margins": 0.15598046779632568, "rewards/margins_max": 0.24103443324565887, "rewards/margins_min": 0.0709264725446701, "rewards/margins_std": 0.12028451263904572, "rewards/rejected": -0.10498883575201035, "step": 2520 }, { "epoch": 0.8, "grad_norm": 0.44140625, "learning_rate": 1.202975416726464e-07, "logits/chosen": -1.4351770877838135, "logits/rejected": -1.0629112720489502, "logps/chosen": -229.93716430664062, "logps/rejected": -258.378173828125, "loss": 0.6123, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.058566201478242874, "rewards/margins": 0.15714897215366364, "rewards/margins_max": 0.21117070317268372, "rewards/margins_min": 0.10312725603580475, "rewards/margins_std": 0.07639826089143753, "rewards/rejected": -0.09858278185129166, "step": 2530 }, { "epoch": 0.8, "grad_norm": 0.6484375, "learning_rate": 1.1674219321249212e-07, "logits/chosen": -1.4108346700668335, "logits/rejected": -1.0379550457000732, "logps/chosen": -225.57199096679688, "logps/rejected": -248.28396606445312, "loss": 0.6231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.042424093931913376, "rewards/margins": 0.16670480370521545, "rewards/margins_max": 0.23077841103076935, "rewards/margins_min": 0.10263122618198395, "rewards/margins_std": 0.0906137228012085, "rewards/rejected": -0.12428070604801178, "step": 2540 }, { "epoch": 0.8, "grad_norm": 0.3671875, "learning_rate": 1.1323321826247345e-07, "logits/chosen": -1.5495703220367432, "logits/rejected": -1.2719228267669678, "logps/chosen": -207.6171417236328, "logps/rejected": -208.72048950195312, "loss": 0.6275, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05759170651435852, "rewards/margins": 0.14356836676597595, "rewards/margins_max": 0.2183394432067871, "rewards/margins_min": 0.0687972754240036, "rewards/margins_std": 0.10574229806661606, "rewards/rejected": -0.08597666025161743, "step": 2550 }, { "epoch": 0.81, "grad_norm": 0.5859375, "learning_rate": 1.0977104140229265e-07, "logits/chosen": -1.298298954963684, "logits/rejected": -0.9742182493209839, "logps/chosen": -213.3368682861328, "logps/rejected": -246.1780548095703, "loss": 0.6203, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04572229087352753, "rewards/margins": 0.20839472115039825, "rewards/margins_max": 0.3573569357395172, "rewards/margins_min": 0.059432536363601685, "rewards/margins_std": 0.21066434681415558, "rewards/rejected": -0.16267244517803192, "step": 2560 }, { "epoch": 0.81, "grad_norm": 0.51953125, "learning_rate": 1.0635608154916647e-07, "logits/chosen": -1.477253794670105, "logits/rejected": -0.9998503923416138, "logps/chosen": -280.1942138671875, "logps/rejected": -280.0354919433594, "loss": 0.6302, "rewards/accuracies": 1.0, "rewards/chosen": 0.04247181490063667, "rewards/margins": 0.15468376874923706, "rewards/margins_max": 0.21085813641548157, "rewards/margins_min": 0.09850938618183136, "rewards/margins_std": 0.0794425681233406, "rewards/rejected": -0.11221196502447128, "step": 2570 }, { "epoch": 0.81, "grad_norm": 0.4921875, "learning_rate": 1.0298875190713801e-07, "logits/chosen": -1.3592890501022339, "logits/rejected": -0.968646228313446, "logps/chosen": -242.7200927734375, "logps/rejected": -223.07980346679688, "loss": 0.6299, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0529966726899147, "rewards/margins": 0.1338491290807724, "rewards/margins_max": 0.20661978423595428, "rewards/margins_min": 0.061078451573848724, "rewards/margins_std": 0.1029132753610611, "rewards/rejected": -0.0808524563908577, "step": 2580 }, { "epoch": 0.82, "grad_norm": 0.5390625, "learning_rate": 9.966945991708003e-08, "logits/chosen": -1.3368492126464844, "logits/rejected": -0.9701792597770691, "logps/chosen": -254.62747192382812, "logps/rejected": -180.07473754882812, "loss": 0.6205, "rewards/accuracies": 1.0, "rewards/chosen": 0.04608957841992378, "rewards/margins": 0.12919795513153076, "rewards/margins_max": 0.20184385776519775, "rewards/margins_min": 0.056552063673734665, "rewards/margins_std": 0.10273680835962296, "rewards/rejected": -0.08310838788747787, "step": 2590 }, { "epoch": 0.82, "grad_norm": 0.279296875, "learning_rate": 9.639860720739523e-08, "logits/chosen": -1.458553433418274, "logits/rejected": -1.2161775827407837, "logps/chosen": -211.18978881835938, "logps/rejected": -259.21673583984375, "loss": 0.6296, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04403019696474075, "rewards/margins": 0.13392777740955353, "rewards/margins_max": 0.21003444492816925, "rewards/margins_min": 0.057821135967969894, "rewards/margins_std": 0.10763102769851685, "rewards/rejected": -0.08989757299423218, "step": 2600 }, { "epoch": 0.82, "grad_norm": 0.6796875, "learning_rate": 9.31765895454199e-08, "logits/chosen": -1.5028635263442993, "logits/rejected": -1.0774997472763062, "logps/chosen": -204.22152709960938, "logps/rejected": -289.7691345214844, "loss": 0.6152, "rewards/accuracies": 1.0, "rewards/chosen": 0.056610412895679474, "rewards/margins": 0.1657501459121704, "rewards/margins_max": 0.2222793996334076, "rewards/margins_min": 0.10922084748744965, "rewards/margins_std": 0.07994447648525238, "rewards/rejected": -0.10913971811532974, "step": 2610 }, { "epoch": 0.83, "grad_norm": 0.41015625, "learning_rate": 9.000379678953667e-08, "logits/chosen": -1.4289562702178955, "logits/rejected": -1.1931815147399902, "logps/chosen": -211.5123291015625, "logps/rejected": -219.6265869140625, "loss": 0.6264, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029694851487874985, "rewards/margins": 0.12272848188877106, "rewards/margins_max": 0.18134915828704834, "rewards/margins_min": 0.06410779803991318, "rewards/margins_std": 0.08290217071771622, "rewards/rejected": -0.09303363412618637, "step": 2620 }, { "epoch": 0.83, "grad_norm": 0.5546875, "learning_rate": 8.688061284200265e-08, "logits/chosen": -1.5567282438278198, "logits/rejected": -1.3019843101501465, "logps/chosen": -263.53070068359375, "logps/rejected": -248.8045654296875, "loss": 0.6245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0438472144305706, "rewards/margins": 0.15204408764839172, "rewards/margins_max": 0.24873380362987518, "rewards/margins_min": 0.055354367941617966, "rewards/margins_std": 0.13673990964889526, "rewards/rejected": -0.10819686949253082, "step": 2630 }, { "epoch": 0.83, "grad_norm": 0.341796875, "learning_rate": 8.380741560249726e-08, "logits/chosen": -1.3044310808181763, "logits/rejected": -1.0237683057785034, "logps/chosen": -185.43040466308594, "logps/rejected": -250.0048828125, "loss": 0.6321, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.042631763964891434, "rewards/margins": 0.12767274677753448, "rewards/margins_max": 0.19831757247447968, "rewards/margins_min": 0.05702788755297661, "rewards/margins_std": 0.09990689903497696, "rewards/rejected": -0.08504097163677216, "step": 2640 }, { "epoch": 0.83, "grad_norm": 0.3984375, "learning_rate": 8.078457692239809e-08, "logits/chosen": -1.2682950496673584, "logits/rejected": -1.0586285591125488, "logps/chosen": -223.0974884033203, "logps/rejected": -226.95260620117188, "loss": 0.6261, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04470429569482803, "rewards/margins": 0.17180202901363373, "rewards/margins_max": 0.2570766806602478, "rewards/margins_min": 0.08652739226818085, "rewards/margins_std": 0.12059654295444489, "rewards/rejected": -0.1270977258682251, "step": 2650 }, { "epoch": 0.84, "grad_norm": 0.39453125, "learning_rate": 7.781246255978685e-08, "logits/chosen": -1.250165581703186, "logits/rejected": -1.0497492551803589, "logps/chosen": -234.21694946289062, "logps/rejected": -198.62652587890625, "loss": 0.6232, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024458685889840126, "rewards/margins": 0.16458727419376373, "rewards/margins_max": 0.2734147012233734, "rewards/margins_min": 0.055759839713573456, "rewards/margins_std": 0.1539052277803421, "rewards/rejected": -0.14012858271598816, "step": 2660 }, { "epoch": 0.84, "grad_norm": 0.42578125, "learning_rate": 7.4891432135193e-08, "logits/chosen": -1.3641499280929565, "logits/rejected": -1.0219401121139526, "logps/chosen": -206.70797729492188, "logps/rejected": -205.9700927734375, "loss": 0.6325, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.035322077572345734, "rewards/margins": 0.13450254499912262, "rewards/margins_max": 0.18772754073143005, "rewards/margins_min": 0.0812775120139122, "rewards/margins_std": 0.07527154684066772, "rewards/rejected": -0.0991804450750351, "step": 2670 }, { "epoch": 0.84, "grad_norm": 0.408203125, "learning_rate": 7.202183908808124e-08, "logits/chosen": -1.22100031375885, "logits/rejected": -0.9506253004074097, "logps/chosen": -237.82974243164062, "logps/rejected": -252.8350372314453, "loss": 0.6205, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03242502361536026, "rewards/margins": 0.1422858089208603, "rewards/margins_max": 0.2102351188659668, "rewards/margins_min": 0.07433655858039856, "rewards/margins_std": 0.0960947722196579, "rewards/rejected": -0.10986080020666122, "step": 2680 }, { "epoch": 0.85, "grad_norm": 0.5, "learning_rate": 6.920403063408526e-08, "logits/chosen": -1.39353346824646, "logits/rejected": -0.8322644233703613, "logps/chosen": -393.1592102050781, "logps/rejected": -256.74774169921875, "loss": 0.6135, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.043674807995557785, "rewards/margins": 0.1816544234752655, "rewards/margins_max": 0.2530291676521301, "rewards/margins_min": 0.11027966439723969, "rewards/margins_std": 0.10093915462493896, "rewards/rejected": -0.13797961175441742, "step": 2690 }, { "epoch": 0.85, "grad_norm": 0.44921875, "learning_rate": 6.643834772299544e-08, "logits/chosen": -1.4720598459243774, "logits/rejected": -1.0714927911758423, "logps/chosen": -207.9117889404297, "logps/rejected": -220.92147827148438, "loss": 0.6249, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0551542267203331, "rewards/margins": 0.12827317416667938, "rewards/margins_max": 0.2003893405199051, "rewards/margins_min": 0.05615702271461487, "rewards/margins_std": 0.10198765993118286, "rewards/rejected": -0.07311895489692688, "step": 2700 }, { "epoch": 0.85, "grad_norm": 0.46484375, "learning_rate": 6.372512499750471e-08, "logits/chosen": -1.4824740886688232, "logits/rejected": -1.176748275756836, "logps/chosen": -191.59780883789062, "logps/rejected": -214.6171875, "loss": 0.6386, "rewards/accuracies": 1.0, "rewards/chosen": 0.05065562576055527, "rewards/margins": 0.12029320001602173, "rewards/margins_max": 0.16716250777244568, "rewards/margins_min": 0.07342389971017838, "rewards/margins_std": 0.0662832111120224, "rewards/rejected": -0.06963758170604706, "step": 2710 }, { "epoch": 0.86, "grad_norm": 0.49609375, "learning_rate": 6.106469075271714e-08, "logits/chosen": -1.5376232862472534, "logits/rejected": -1.1263943910598755, "logps/chosen": -180.64654541015625, "logps/rejected": -257.4786071777344, "loss": 0.6108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.055960871279239655, "rewards/margins": 0.17603328824043274, "rewards/margins_max": 0.26209133863449097, "rewards/margins_min": 0.08997530490159988, "rewards/margins_std": 0.12170439958572388, "rewards/rejected": -0.12007243931293488, "step": 2720 }, { "epoch": 0.86, "grad_norm": 0.55859375, "learning_rate": 5.845736689642472e-08, "logits/chosen": -1.3399255275726318, "logits/rejected": -0.8938875198364258, "logps/chosen": -219.58438110351562, "logps/rejected": -196.69725036621094, "loss": 0.6215, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0496777668595314, "rewards/margins": 0.14592623710632324, "rewards/margins_max": 0.22191736102104187, "rewards/margins_min": 0.06993507593870163, "rewards/margins_std": 0.10746772587299347, "rewards/rejected": -0.09624846279621124, "step": 2730 }, { "epoch": 0.86, "grad_norm": 0.47265625, "learning_rate": 5.590346891015757e-08, "logits/chosen": -1.5089246034622192, "logits/rejected": -0.9761932492256165, "logps/chosen": -251.63235473632812, "logps/rejected": -270.4111328125, "loss": 0.626, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.06311796605587006, "rewards/margins": 0.16048596799373627, "rewards/margins_max": 0.2200118750333786, "rewards/margins_min": 0.10096003860235214, "rewards/margins_std": 0.08418238908052444, "rewards/rejected": -0.09736800938844681, "step": 2740 }, { "epoch": 0.87, "grad_norm": 0.546875, "learning_rate": 5.340330581101088e-08, "logits/chosen": -1.4080811738967896, "logits/rejected": -0.965406060218811, "logps/chosen": -207.83154296875, "logps/rejected": -219.427978515625, "loss": 0.6289, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04717772826552391, "rewards/margins": 0.13114681839942932, "rewards/margins_max": 0.18514610826969147, "rewards/margins_min": 0.07714752852916718, "rewards/margins_std": 0.07636652886867523, "rewards/rejected": -0.08396908640861511, "step": 2750 }, { "epoch": 0.87, "grad_norm": 0.427734375, "learning_rate": 5.0957180114254536e-08, "logits/chosen": -1.4429428577423096, "logits/rejected": -1.047139048576355, "logps/chosen": -210.0402069091797, "logps/rejected": -201.76473999023438, "loss": 0.6171, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04892607405781746, "rewards/margins": 0.1490209847688675, "rewards/margins_max": 0.2127954214811325, "rewards/margins_min": 0.08524654805660248, "rewards/margins_std": 0.09019068628549576, "rewards/rejected": -0.10009489953517914, "step": 2760 }, { "epoch": 0.87, "grad_norm": 0.490234375, "learning_rate": 4.8565387796728864e-08, "logits/chosen": -1.4443720579147339, "logits/rejected": -0.9906848073005676, "logps/chosen": -175.06602478027344, "logps/rejected": -188.9914093017578, "loss": 0.6216, "rewards/accuracies": 1.0, "rewards/chosen": 0.05647973343729973, "rewards/margins": 0.17043130099773407, "rewards/margins_max": 0.23241189122200012, "rewards/margins_min": 0.10845069587230682, "rewards/margins_std": 0.08765380084514618, "rewards/rejected": -0.11395156383514404, "step": 2770 }, { "epoch": 0.88, "grad_norm": 0.375, "learning_rate": 4.622821826103285e-08, "logits/chosen": -1.3417904376983643, "logits/rejected": -0.9641525149345398, "logps/chosen": -182.08653259277344, "logps/rejected": -219.1973876953125, "loss": 0.6257, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03520121052861214, "rewards/margins": 0.16146528720855713, "rewards/margins_max": 0.23596426844596863, "rewards/margins_min": 0.08696627616882324, "rewards/margins_std": 0.10535748302936554, "rewards/rejected": -0.1262640655040741, "step": 2780 }, { "epoch": 0.88, "grad_norm": 0.33984375, "learning_rate": 4.394595430050613e-08, "logits/chosen": -1.5283844470977783, "logits/rejected": -0.9562716484069824, "logps/chosen": -246.2748260498047, "logps/rejected": -299.6006774902344, "loss": 0.6238, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.05441068485379219, "rewards/margins": 0.1941131055355072, "rewards/margins_max": 0.32748061418533325, "rewards/margins_min": 0.06074561923742294, "rewards/margins_std": 0.18861012160778046, "rewards/rejected": -0.1397024393081665, "step": 2790 }, { "epoch": 0.88, "grad_norm": 0.6484375, "learning_rate": 4.17188720650119e-08, "logits/chosen": -1.4502097368240356, "logits/rejected": -1.1390039920806885, "logps/chosen": -188.01138305664062, "logps/rejected": -214.70059204101562, "loss": 0.6255, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04769737645983696, "rewards/margins": 0.1742611676454544, "rewards/margins_max": 0.24131028354167938, "rewards/margins_min": 0.10721202194690704, "rewards/margins_std": 0.09482181072235107, "rewards/rejected": -0.12656378746032715, "step": 2800 }, { "epoch": 0.89, "grad_norm": 0.337890625, "learning_rate": 3.954724102752316e-08, "logits/chosen": -1.4321156740188599, "logits/rejected": -1.0594213008880615, "logps/chosen": -205.7993927001953, "logps/rejected": -190.0954132080078, "loss": 0.6276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.05445154756307602, "rewards/margins": 0.1348884403705597, "rewards/margins_max": 0.20686748623847961, "rewards/margins_min": 0.06290940940380096, "rewards/margins_std": 0.10179372131824493, "rewards/rejected": -0.08043689280748367, "step": 2810 }, { "epoch": 0.89, "grad_norm": 0.4375, "learning_rate": 3.743132395151705e-08, "logits/chosen": -1.2473394870758057, "logits/rejected": -1.0283358097076416, "logps/chosen": -165.55337524414062, "logps/rejected": -248.9470977783203, "loss": 0.6066, "rewards/accuracies": 1.0, "rewards/chosen": 0.0528048500418663, "rewards/margins": 0.17907902598381042, "rewards/margins_max": 0.27151721715927124, "rewards/margins_min": 0.08664089441299438, "rewards/margins_std": 0.13072729110717773, "rewards/rejected": -0.12627418339252472, "step": 2820 }, { "epoch": 0.89, "grad_norm": 0.470703125, "learning_rate": 3.537137685918074e-08, "logits/chosen": -1.4500157833099365, "logits/rejected": -1.026012659072876, "logps/chosen": -259.57025146484375, "logps/rejected": -245.7373046875, "loss": 0.6139, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03459121286869049, "rewards/margins": 0.16094355285167694, "rewards/margins_max": 0.2114548683166504, "rewards/margins_min": 0.1104322299361229, "rewards/margins_std": 0.07143379747867584, "rewards/rejected": -0.12635232508182526, "step": 2830 }, { "epoch": 0.89, "grad_norm": 0.45703125, "learning_rate": 3.336764900043332e-08, "logits/chosen": -1.3304523229599, "logits/rejected": -1.061004400253296, "logps/chosen": -204.9029083251953, "logps/rejected": -177.60275268554688, "loss": 0.6227, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.052144668996334076, "rewards/margins": 0.1289752721786499, "rewards/margins_max": 0.19827334582805634, "rewards/margins_min": 0.05967719480395317, "rewards/margins_std": 0.09800229221582413, "rewards/rejected": -0.07683060318231583, "step": 2840 }, { "epoch": 0.9, "grad_norm": 0.59375, "learning_rate": 3.142038282276732e-08, "logits/chosen": -1.378488540649414, "logits/rejected": -1.1016395092010498, "logps/chosen": -169.7418212890625, "logps/rejected": -185.75503540039062, "loss": 0.631, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04850053787231445, "rewards/margins": 0.12216831743717194, "rewards/margins_max": 0.18565914034843445, "rewards/margins_min": 0.058677464723587036, "rewards/margins_std": 0.08978961408138275, "rewards/rejected": -0.07366776466369629, "step": 2850 }, { "epoch": 0.9, "grad_norm": 0.6015625, "learning_rate": 2.9529813941912284e-08, "logits/chosen": -1.3763916492462158, "logits/rejected": -1.0432021617889404, "logps/chosen": -192.54698181152344, "logps/rejected": -203.70191955566406, "loss": 0.6215, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.045077886432409286, "rewards/margins": 0.13763293623924255, "rewards/margins_max": 0.18970827758312225, "rewards/margins_min": 0.08555762469768524, "rewards/margins_std": 0.07364563643932343, "rewards/rejected": -0.09255506098270416, "step": 2860 }, { "epoch": 0.9, "grad_norm": 0.53515625, "learning_rate": 2.7696171113326394e-08, "logits/chosen": -1.315382480621338, "logits/rejected": -0.939505398273468, "logps/chosen": -267.24749755859375, "logps/rejected": -210.3874969482422, "loss": 0.6245, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03983648866415024, "rewards/margins": 0.14058911800384521, "rewards/margins_max": 0.20600366592407227, "rewards/margins_min": 0.07517461478710175, "rewards/margins_std": 0.09251008927822113, "rewards/rejected": -0.10075263679027557, "step": 2870 }, { "epoch": 0.91, "grad_norm": 0.453125, "learning_rate": 2.591967620451707e-08, "logits/chosen": -1.398828148841858, "logits/rejected": -1.0324281454086304, "logps/chosen": -286.6908264160156, "logps/rejected": -261.8348083496094, "loss": 0.6276, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.040006984025239944, "rewards/margins": 0.12854711711406708, "rewards/margins_max": 0.18966877460479736, "rewards/margins_min": 0.06742547452449799, "rewards/margins_std": 0.08643907308578491, "rewards/rejected": -0.08854014426469803, "step": 2880 }, { "epoch": 0.91, "grad_norm": 0.361328125, "learning_rate": 2.4200544168195557e-08, "logits/chosen": -1.3377050161361694, "logits/rejected": -1.0328240394592285, "logps/chosen": -182.38858032226562, "logps/rejected": -261.2072448730469, "loss": 0.6172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05574542284011841, "rewards/margins": 0.1974661648273468, "rewards/margins_max": 0.28793179988861084, "rewards/margins_min": 0.10700048506259918, "rewards/margins_std": 0.12793776392936707, "rewards/rejected": -0.1417207270860672, "step": 2890 }, { "epoch": 0.91, "grad_norm": 0.392578125, "learning_rate": 2.253898301626789e-08, "logits/chosen": -1.54763925075531, "logits/rejected": -0.9423719644546509, "logps/chosen": -311.39129638671875, "logps/rejected": -230.20443725585938, "loss": 0.6198, "rewards/accuracies": 1.0, "rewards/chosen": 0.05252306908369064, "rewards/margins": 0.1566629707813263, "rewards/margins_max": 0.22501006722450256, "rewards/margins_min": 0.08831588923931122, "rewards/margins_std": 0.09665738046169281, "rewards/rejected": -0.10413990169763565, "step": 2900 }, { "epoch": 0.92, "grad_norm": 0.470703125, "learning_rate": 2.0935193794666016e-08, "logits/chosen": -1.3872407674789429, "logits/rejected": -0.9147874116897583, "logps/chosen": -230.1566619873047, "logps/rejected": -230.2250518798828, "loss": 0.6168, "rewards/accuracies": 1.0, "rewards/chosen": 0.05687636137008667, "rewards/margins": 0.15954247117042542, "rewards/margins_max": 0.23747679591178894, "rewards/margins_min": 0.0816081315279007, "rewards/margins_std": 0.11021579802036285, "rewards/rejected": -0.10266611725091934, "step": 2910 }, { "epoch": 0.92, "grad_norm": 0.375, "learning_rate": 1.9389370559021345e-08, "logits/chosen": -1.3618733882904053, "logits/rejected": -1.2244329452514648, "logps/chosen": -185.58168029785156, "logps/rejected": -240.06930541992188, "loss": 0.6358, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.021337289363145828, "rewards/margins": 0.12384787946939468, "rewards/margins_max": 0.1760438084602356, "rewards/margins_min": 0.07165192067623138, "rewards/margins_std": 0.0738162100315094, "rewards/rejected": -0.10251058638095856, "step": 2920 }, { "epoch": 0.92, "grad_norm": 0.416015625, "learning_rate": 1.7901700351184655e-08, "logits/chosen": -1.4315520524978638, "logits/rejected": -1.1171866655349731, "logps/chosen": -231.65274047851562, "logps/rejected": -276.26422119140625, "loss": 0.634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05234185978770256, "rewards/margins": 0.13299915194511414, "rewards/margins_max": 0.18938472867012024, "rewards/margins_min": 0.07661359012126923, "rewards/margins_std": 0.0797412246465683, "rewards/rejected": -0.08065730333328247, "step": 2930 }, { "epoch": 0.93, "grad_norm": 0.484375, "learning_rate": 1.647236317659423e-08, "logits/chosen": -1.3440803289413452, "logits/rejected": -1.0078037977218628, "logps/chosen": -256.86236572265625, "logps/rejected": -324.974609375, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 0.029775535687804222, "rewards/margins": 0.18259739875793457, "rewards/margins_max": 0.273844838142395, "rewards/margins_min": 0.09134997427463531, "rewards/margins_std": 0.1290433555841446, "rewards/rejected": -0.1528218686580658, "step": 2940 }, { "epoch": 0.93, "grad_norm": 0.46484375, "learning_rate": 1.5101531982495308e-08, "logits/chosen": -1.383049726486206, "logits/rejected": -1.0008172988891602, "logps/chosen": -236.1011505126953, "logps/rejected": -224.7187042236328, "loss": 0.626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0493309423327446, "rewards/margins": 0.13633129000663757, "rewards/margins_max": 0.20582230389118195, "rewards/margins_min": 0.06684030592441559, "rewards/margins_std": 0.09827511012554169, "rewards/rejected": -0.08700035512447357, "step": 2950 }, { "epoch": 0.93, "grad_norm": 0.3671875, "learning_rate": 1.3789372637014129e-08, "logits/chosen": -1.3957499265670776, "logits/rejected": -1.0853421688079834, "logps/chosen": -206.4956817626953, "logps/rejected": -263.0644836425781, "loss": 0.6279, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.047060225158929825, "rewards/margins": 0.1514916718006134, "rewards/margins_max": 0.20764505863189697, "rewards/margins_min": 0.09533828496932983, "rewards/margins_std": 0.07941287755966187, "rewards/rejected": -0.10443145036697388, "step": 2960 }, { "epoch": 0.94, "grad_norm": 0.474609375, "learning_rate": 1.253604390908819e-08, "logits/chosen": -1.1620736122131348, "logits/rejected": -1.0084644556045532, "logps/chosen": -197.8514862060547, "logps/rejected": -299.61712646484375, "loss": 0.628, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03723246604204178, "rewards/margins": 0.1657179892063141, "rewards/margins_max": 0.2536987364292145, "rewards/margins_min": 0.0777372270822525, "rewards/margins_std": 0.12442357838153839, "rewards/rejected": -0.1284855306148529, "step": 2970 }, { "epoch": 0.94, "grad_norm": 0.578125, "learning_rate": 1.1341697449255061e-08, "logits/chosen": -1.484635829925537, "logits/rejected": -1.0910263061523438, "logps/chosen": -301.75091552734375, "logps/rejected": -205.1429443359375, "loss": 0.6268, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023064447566866875, "rewards/margins": 0.09722733497619629, "rewards/margins_max": 0.1512151062488556, "rewards/margins_min": 0.04323957860469818, "rewards/margins_std": 0.07635021954774857, "rewards/rejected": -0.07416288554668427, "step": 2980 }, { "epoch": 0.94, "grad_norm": 0.50390625, "learning_rate": 1.0206477771303234e-08, "logits/chosen": -1.451160192489624, "logits/rejected": -1.0006252527236938, "logps/chosen": -236.98428344726562, "logps/rejected": -230.22091674804688, "loss": 0.6202, "rewards/accuracies": 1.0, "rewards/chosen": 0.048841338604688644, "rewards/margins": 0.15246547758579254, "rewards/margins_max": 0.22143657505512238, "rewards/margins_min": 0.08349435031414032, "rewards/margins_std": 0.09753988683223724, "rewards/rejected": -0.103624127805233, "step": 2990 }, { "epoch": 0.95, "grad_norm": 0.5078125, "learning_rate": 9.130522234786497e-09, "logits/chosen": -1.5553325414657593, "logits/rejected": -1.1860215663909912, "logps/chosen": -237.86328125, "logps/rejected": -204.50979614257812, "loss": 0.6222, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.046629082411527634, "rewards/margins": 0.11921097338199615, "rewards/margins_max": 0.17536310851573944, "rewards/margins_min": 0.06305884569883347, "rewards/margins_std": 0.0794111043214798, "rewards/rejected": -0.07258189469575882, "step": 3000 }, { "epoch": 0.95, "grad_norm": 0.43359375, "learning_rate": 8.113961028402894e-09, "logits/chosen": -1.3244291543960571, "logits/rejected": -0.9829646944999695, "logps/chosen": -202.76742553710938, "logps/rejected": -225.45693969726562, "loss": 0.6174, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04439740255475044, "rewards/margins": 0.16769596934318542, "rewards/margins_max": 0.2617490887641907, "rewards/margins_min": 0.07364289462566376, "rewards/margins_std": 0.13301116228103638, "rewards/rejected": -0.12329860031604767, "step": 3010 }, { "epoch": 0.95, "grad_norm": 0.41015625, "learning_rate": 7.156917154243047e-09, "logits/chosen": -1.3500535488128662, "logits/rejected": -0.9996110796928406, "logps/chosen": -176.10202026367188, "logps/rejected": -185.0475311279297, "loss": 0.6364, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.034986961632966995, "rewards/margins": 0.11707176268100739, "rewards/margins_max": 0.17164544761180878, "rewards/margins_min": 0.0624980702996254, "rewards/margins_std": 0.07717885076999664, "rewards/rejected": -0.0820847898721695, "step": 3020 }, { "epoch": 0.95, "grad_norm": 0.419921875, "learning_rate": 6.259506412906402e-09, "logits/chosen": -1.493024230003357, "logits/rejected": -1.05169677734375, "logps/chosen": -234.1255340576172, "logps/rejected": -242.9952392578125, "loss": 0.6167, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.047915857285261154, "rewards/margins": 0.13081197440624237, "rewards/margins_max": 0.20202195644378662, "rewards/margins_min": 0.05960196256637573, "rewards/margins_std": 0.10070616006851196, "rewards/rejected": -0.08289609849452972, "step": 3030 }, { "epoch": 0.96, "grad_norm": 0.359375, "learning_rate": 5.4218373894898696e-09, "logits/chosen": -1.4888322353363037, "logits/rejected": -1.1974503993988037, "logps/chosen": -184.48179626464844, "logps/rejected": -212.9262237548828, "loss": 0.6319, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.042879484593868256, "rewards/margins": 0.12140518426895142, "rewards/margins_max": 0.1660289615392685, "rewards/margins_min": 0.07678140699863434, "rewards/margins_std": 0.06310755014419556, "rewards/rejected": -0.07852570712566376, "step": 3040 }, { "epoch": 0.96, "grad_norm": 0.40625, "learning_rate": 4.644011440449236e-09, "logits/chosen": -1.241875410079956, "logits/rejected": -0.8721693158149719, "logps/chosen": -175.67909240722656, "logps/rejected": -243.4571990966797, "loss": 0.624, "rewards/accuracies": 1.0, "rewards/chosen": 0.050394732505083084, "rewards/margins": 0.16494488716125488, "rewards/margins_max": 0.24216532707214355, "rewards/margins_min": 0.087724469602108, "rewards/margins_std": 0.10920616239309311, "rewards/rejected": -0.1145501583814621, "step": 3050 }, { "epoch": 0.96, "grad_norm": 0.421875, "learning_rate": 3.926122681335353e-09, "logits/chosen": -1.3381614685058594, "logits/rejected": -1.1396186351776123, "logps/chosen": -203.67967224121094, "logps/rejected": -181.2338104248047, "loss": 0.6357, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.034172505140304565, "rewards/margins": 0.09833844006061554, "rewards/margins_max": 0.14537864923477173, "rewards/margins_min": 0.05129823088645935, "rewards/margins_std": 0.06652490049600601, "rewards/rejected": -0.06416593492031097, "step": 3060 }, { "epoch": 0.97, "grad_norm": 0.57421875, "learning_rate": 3.268257975405697e-09, "logits/chosen": -1.248270034790039, "logits/rejected": -0.9826574325561523, "logps/chosen": -219.90811157226562, "logps/rejected": -227.92318725585938, "loss": 0.6273, "rewards/accuracies": 1.0, "rewards/chosen": 0.04911006614565849, "rewards/margins": 0.166696235537529, "rewards/margins_max": 0.2482796609401703, "rewards/margins_min": 0.08511278033256531, "rewards/margins_std": 0.11537641286849976, "rewards/rejected": -0.1175861582159996, "step": 3070 }, { "epoch": 0.97, "grad_norm": 0.439453125, "learning_rate": 2.67049692311494e-09, "logits/chosen": -1.5718332529067993, "logits/rejected": -1.1442277431488037, "logps/chosen": -270.47186279296875, "logps/rejected": -239.42147827148438, "loss": 0.6136, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05378204584121704, "rewards/margins": 0.14788804948329926, "rewards/margins_max": 0.21103155612945557, "rewards/margins_min": 0.08474452048540115, "rewards/margins_std": 0.08929841965436935, "rewards/rejected": -0.09410599619150162, "step": 3080 }, { "epoch": 0.97, "grad_norm": 0.482421875, "learning_rate": 2.132911852482766e-09, "logits/chosen": -1.3375688791275024, "logits/rejected": -1.0424758195877075, "logps/chosen": -217.1631317138672, "logps/rejected": -195.4293212890625, "loss": 0.6254, "rewards/accuracies": 1.0, "rewards/chosen": 0.041864458471536636, "rewards/margins": 0.16093741357326508, "rewards/margins_max": 0.2589831054210663, "rewards/margins_min": 0.06289170682430267, "rewards/margins_std": 0.1386575698852539, "rewards/rejected": -0.11907295137643814, "step": 3090 }, { "epoch": 0.98, "grad_norm": 0.58203125, "learning_rate": 1.6555678103425397e-09, "logits/chosen": -1.3077802658081055, "logits/rejected": -0.9605744481086731, "logps/chosen": -252.1176300048828, "logps/rejected": -258.5810241699219, "loss": 0.6259, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05393581837415695, "rewards/margins": 0.12319433689117432, "rewards/margins_max": 0.18517567217350006, "rewards/margins_min": 0.06121302396059036, "rewards/margins_std": 0.08765482902526855, "rewards/rejected": -0.06925852596759796, "step": 3100 }, { "epoch": 0.98, "grad_norm": 0.6015625, "learning_rate": 1.2385225544709887e-09, "logits/chosen": -1.3411720991134644, "logits/rejected": -1.007331132888794, "logps/chosen": -249.7186279296875, "logps/rejected": -262.7857360839844, "loss": 0.6249, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.051185525953769684, "rewards/margins": 0.15800713002681732, "rewards/margins_max": 0.23525485396385193, "rewards/margins_min": 0.0807594358921051, "rewards/margins_std": 0.10924477875232697, "rewards/rejected": -0.10682161897420883, "step": 3110 }, { "epoch": 0.98, "grad_norm": 0.33984375, "learning_rate": 8.818265465991293e-10, "logits/chosen": -1.4163812398910522, "logits/rejected": -1.0825564861297607, "logps/chosen": -185.9810028076172, "logps/rejected": -186.9814453125, "loss": 0.622, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06464166939258575, "rewards/margins": 0.1453738510608673, "rewards/margins_max": 0.21843528747558594, "rewards/margins_min": 0.07231242954730988, "rewards/margins_std": 0.10332445055246353, "rewards/rejected": -0.08073217421770096, "step": 3120 }, { "epoch": 0.99, "grad_norm": 0.5078125, "learning_rate": 5.855229463068712e-10, "logits/chosen": -1.3847262859344482, "logits/rejected": -1.087875485420227, "logps/chosen": -213.4091339111328, "logps/rejected": -301.65533447265625, "loss": 0.6268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04394926875829697, "rewards/margins": 0.15025286376476288, "rewards/margins_max": 0.2213151901960373, "rewards/margins_min": 0.07919053733348846, "rewards/margins_std": 0.10049732029438019, "rewards/rejected": -0.1063036099076271, "step": 3130 }, { "epoch": 0.99, "grad_norm": 0.4375, "learning_rate": 3.4964760580069585e-10, "logits/chosen": -1.5246410369873047, "logits/rejected": -1.0728760957717896, "logps/chosen": -187.12368774414062, "logps/rejected": -201.53306579589844, "loss": 0.625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04492334648966789, "rewards/margins": 0.1522952765226364, "rewards/margins_max": 0.2121496945619583, "rewards/margins_min": 0.09244086593389511, "rewards/margins_std": 0.08464692533016205, "rewards/rejected": -0.10737194120883942, "step": 3140 }, { "epoch": 0.99, "grad_norm": 0.6171875, "learning_rate": 1.742290655755707e-10, "logits/chosen": -1.3557556867599487, "logits/rejected": -0.9096490740776062, "logps/chosen": -241.07785034179688, "logps/rejected": -226.32803344726562, "loss": 0.6191, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.045760609209537506, "rewards/margins": 0.16645553708076477, "rewards/margins_max": 0.2321435958147049, "rewards/margins_min": 0.10076741874217987, "rewards/margins_std": 0.09289699047803879, "rewards/rejected": -0.12069491297006607, "step": 3150 }, { "epoch": 1.0, "grad_norm": 0.5, "learning_rate": 5.928855096154483e-11, "logits/chosen": -1.4452476501464844, "logits/rejected": -1.1195826530456543, "logps/chosen": -207.44882202148438, "logps/rejected": -200.09043884277344, "loss": 0.6394, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04787784069776535, "rewards/margins": 0.1137089729309082, "rewards/margins_max": 0.15814949572086334, "rewards/margins_min": 0.06926842778921127, "rewards/margins_std": 0.06284840404987335, "rewards/rejected": -0.06583113223314285, "step": 3160 }, { "epoch": 1.0, "grad_norm": 0.58984375, "learning_rate": 4.839969555581192e-12, "logits/chosen": -1.35175359249115, "logits/rejected": -0.9090153574943542, "logps/chosen": -226.45285034179688, "logps/rejected": -246.6789093017578, "loss": 0.6183, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06078491359949112, "rewards/margins": 0.1678098440170288, "rewards/margins_max": 0.27439039945602417, "rewards/margins_min": 0.061229269951581955, "rewards/margins_std": 0.15072770416736603, "rewards/rejected": -0.10702493041753769, "step": 3170 }, { "epoch": 1.0, "eval_logits/chosen": -1.034969449043274, "eval_logits/rejected": -0.9127383232116699, "eval_logps/chosen": -324.2853088378906, "eval_logps/rejected": -316.0207824707031, "eval_loss": 0.689082682132721, "eval_rewards/accuracies": 0.5569999814033508, "eval_rewards/chosen": 0.006784230004996061, "eval_rewards/margins": 0.010349688120186329, "eval_rewards/margins_max": 0.12646563351154327, "eval_rewards/margins_min": -0.10888691246509552, "eval_rewards/margins_std": 0.07831301540136337, "eval_rewards/rejected": -0.003565457882359624, "eval_runtime": 1446.1513, "eval_samples_per_second": 2.766, "eval_steps_per_second": 0.173, "step": 3174 }, { "epoch": 1.0, "step": 3174, "total_flos": 0.0, "train_loss": 0.6393037238208168, "train_runtime": 24580.6168, "train_samples_per_second": 1.033, "train_steps_per_second": 0.129 } ], "logging_steps": 10, "max_steps": 3174, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }