diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5773 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998424948810837, + "eval_steps": 100, + "global_step": 3174, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 3.1446540880503143e-09, + "logits/chosen": -1.3876760005950928, + "logits/rejected": -1.4584133625030518, + "logps/chosen": -148.11717224121094, + "logps/rejected": -197.28189086914062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.474609375, + "learning_rate": 3.1446540880503146e-08, + "logits/chosen": -1.2969070672988892, + "logits/rejected": -1.0069364309310913, + "logps/chosen": -190.5032196044922, + "logps/rejected": -182.00355529785156, + "loss": 0.6932, + "rewards/accuracies": 0.5555555820465088, + "rewards/chosen": 0.0009341875556856394, + "rewards/margins": 0.0010361968306824565, + "rewards/margins_max": 0.0031799401622265577, + "rewards/margins_min": -0.0011075465008616447, + "rewards/margins_std": 0.003031711094081402, + "rewards/rejected": -0.00010200924589298666, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.443359375, + "learning_rate": 6.289308176100629e-08, + "logits/chosen": -1.3659212589263916, + "logits/rejected": -1.052756667137146, + "logps/chosen": -225.5138397216797, + "logps/rejected": -200.11280822753906, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00012772370246239007, + "rewards/margins": -0.00018917841953225434, + "rewards/margins_max": 0.0013527333503589034, + "rewards/margins_min": -0.0017310904804617167, + "rewards/margins_std": 0.0021805930882692337, + "rewards/rejected": 6.145476072560996e-05, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.43359375, + "learning_rate": 9.433962264150943e-08, + "logits/chosen": -1.2631334066390991, + "logits/rejected": -0.9830008745193481, + "logps/chosen": -180.3957061767578, + "logps/rejected": -184.52642822265625, + "loss": 0.6929, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0003226413391530514, + "rewards/margins": 0.0009109593229368329, + "rewards/margins_max": 0.002635856857523322, + "rewards/margins_min": -0.0008139380952343345, + "rewards/margins_std": 0.0024393731728196144, + "rewards/rejected": -0.0005883178091607988, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.33984375, + "learning_rate": 1.2578616352201258e-07, + "logits/chosen": -1.4588565826416016, + "logits/rejected": -1.1574698686599731, + "logps/chosen": -225.439697265625, + "logps/rejected": -276.75872802734375, + "loss": 0.6935, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0006082096369937062, + "rewards/margins": 0.00012304118718020618, + "rewards/margins_max": 0.0016710966592654586, + "rewards/margins_min": -0.001425014459528029, + "rewards/margins_std": 0.002189281163737178, + "rewards/rejected": 0.00048516839160583913, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.515625, + "learning_rate": 1.5723270440251572e-07, + "logits/chosen": -1.3671318292617798, + "logits/rejected": -0.8632100820541382, + "logps/chosen": -331.68609619140625, + "logps/rejected": -205.90982055664062, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00018301008094567806, + "rewards/margins": 0.0002522182185202837, + "rewards/margins_max": 0.002131999935954809, + "rewards/margins_min": -0.0016275634989142418, + "rewards/margins_std": 0.002658412791788578, + "rewards/rejected": -0.0004352282849140465, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.453125, + "learning_rate": 1.8867924528301886e-07, + "logits/chosen": -1.1950219869613647, + "logits/rejected": -1.0118684768676758, + "logps/chosen": -203.71957397460938, + "logps/rejected": -264.69964599609375, + "loss": 0.693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0003052177489735186, + "rewards/margins": 0.00022837640426587313, + "rewards/margins_max": 0.0015303840627893806, + "rewards/margins_min": -0.0010736312251538038, + "rewards/margins_std": 0.001841316930949688, + "rewards/rejected": 7.684133015573025e-05, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.4453125, + "learning_rate": 2.20125786163522e-07, + "logits/chosen": -1.4155724048614502, + "logits/rejected": -1.0938544273376465, + "logps/chosen": -218.9034423828125, + "logps/rejected": -225.016845703125, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00026987362070940435, + "rewards/margins": 0.0008336820756085217, + "rewards/margins_max": 0.002929271664470434, + "rewards/margins_min": -0.0012619076296687126, + "rewards/margins_std": 0.0029636111576110125, + "rewards/rejected": -0.0005638084840029478, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 0.51953125, + "learning_rate": 2.5157232704402517e-07, + "logits/chosen": -1.2726242542266846, + "logits/rejected": -0.9936110377311707, + "logps/chosen": -285.13433837890625, + "logps/rejected": -266.5086364746094, + "loss": 0.6926, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0007069502025842667, + "rewards/margins": 0.0009692258317954838, + "rewards/margins_max": 0.0028219607193022966, + "rewards/margins_min": -0.0008835086482577026, + "rewards/margins_std": 0.002620162209495902, + "rewards/rejected": -0.00026227571652270854, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 0.59375, + "learning_rate": 2.830188679245283e-07, + "logits/chosen": -1.4592866897583008, + "logits/rejected": -1.1695111989974976, + "logps/chosen": -212.254638671875, + "logps/rejected": -219.2605743408203, + "loss": 0.6925, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0002520198468118906, + "rewards/margins": 0.001166980480775237, + "rewards/margins_max": 0.0038443871308118105, + "rewards/margins_min": -0.001510425703600049, + "rewards/margins_std": 0.0037864241749048233, + "rewards/rejected": -0.0009149607503786683, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.337890625, + "learning_rate": 3.1446540880503144e-07, + "logits/chosen": -1.405822992324829, + "logits/rejected": -0.9022544622421265, + "logps/chosen": -257.4349670410156, + "logps/rejected": -205.3658447265625, + "loss": 0.6924, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0013813646510243416, + "rewards/margins": 0.0018943824106827378, + "rewards/margins_max": 0.0051645501516759396, + "rewards/margins_min": -0.0013757857959717512, + "rewards/margins_std": 0.004624716006219387, + "rewards/rejected": -0.0005130176432430744, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 0.388671875, + "learning_rate": 3.4591194968553456e-07, + "logits/chosen": -1.250548005104065, + "logits/rejected": -0.9772456884384155, + "logps/chosen": -230.58889770507812, + "logps/rejected": -190.00180053710938, + "loss": 0.6915, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0019372021779417992, + "rewards/margins": 0.003521789563819766, + "rewards/margins_max": 0.0056365011259913445, + "rewards/margins_min": 0.001407077768817544, + "rewards/margins_std": 0.002990653971210122, + "rewards/rejected": -0.0015845870366320014, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.455078125, + "learning_rate": 3.773584905660377e-07, + "logits/chosen": -1.4136645793914795, + "logits/rejected": -1.0485485792160034, + "logps/chosen": -195.27610778808594, + "logps/rejected": -186.0947723388672, + "loss": 0.6921, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0026215934194624424, + "rewards/margins": 0.0021645757369697094, + "rewards/margins_max": 0.0044735753908753395, + "rewards/margins_min": -0.00014442438259720802, + "rewards/margins_std": 0.0032654188107699156, + "rewards/rejected": 0.00045701785711571574, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.57421875, + "learning_rate": 4.088050314465409e-07, + "logits/chosen": -1.1770480871200562, + "logits/rejected": -0.9447425603866577, + "logps/chosen": -219.49655151367188, + "logps/rejected": -248.679443359375, + "loss": 0.692, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.00221074465662241, + "rewards/margins": 0.0024410211481153965, + "rewards/margins_max": 0.005052028689533472, + "rewards/margins_min": -0.00016998658247757703, + "rewards/margins_std": 0.0036925221793353558, + "rewards/rejected": -0.00023027621500659734, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.41796875, + "learning_rate": 4.40251572327044e-07, + "logits/chosen": -1.1848294734954834, + "logits/rejected": -0.9620411992073059, + "logps/chosen": -267.6338195800781, + "logps/rejected": -216.468505859375, + "loss": 0.6916, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0020743615459650755, + "rewards/margins": 0.002421380952000618, + "rewards/margins_max": 0.004907554481178522, + "rewards/margins_min": -6.479259172920138e-05, + "rewards/margins_std": 0.0035159799735993147, + "rewards/rejected": -0.0003470192023087293, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 0.3125, + "learning_rate": 4.7169811320754717e-07, + "logits/chosen": -1.3419865369796753, + "logits/rejected": -0.8775084614753723, + "logps/chosen": -304.682861328125, + "logps/rejected": -234.85049438476562, + "loss": 0.6905, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.004905511625111103, + "rewards/margins": 0.006082520820200443, + "rewards/margins_max": 0.010440012440085411, + "rewards/margins_min": 0.0017250289674848318, + "rewards/margins_std": 0.00616242503747344, + "rewards/rejected": -0.0011770094279199839, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.349609375, + "learning_rate": 5.031446540880503e-07, + "logits/chosen": -1.362574815750122, + "logits/rejected": -1.171775221824646, + "logps/chosen": -168.5498504638672, + "logps/rejected": -229.1747589111328, + "loss": 0.6907, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0031735021620988846, + "rewards/margins": 0.004087474662810564, + "rewards/margins_max": 0.007105571683496237, + "rewards/margins_min": 0.001069377874955535, + "rewards/margins_std": 0.00426823366433382, + "rewards/rejected": -0.000913972791749984, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.326171875, + "learning_rate": 5.345911949685534e-07, + "logits/chosen": -1.237182378768921, + "logits/rejected": -0.9322627782821655, + "logps/chosen": -220.6244354248047, + "logps/rejected": -198.8372802734375, + "loss": 0.6904, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0044016363099217415, + "rewards/margins": 0.006395402364432812, + "rewards/margins_max": 0.008824339136481285, + "rewards/margins_min": 0.003966464661061764, + "rewards/margins_std": 0.0034350368659943342, + "rewards/rejected": -0.001993766753003001, + "step": 170 + }, + { + "epoch": 0.06, + "grad_norm": 0.5625, + "learning_rate": 5.660377358490566e-07, + "logits/chosen": -1.3780791759490967, + "logits/rejected": -1.0467432737350464, + "logps/chosen": -213.366455078125, + "logps/rejected": -216.13211059570312, + "loss": 0.6898, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.005849289242178202, + "rewards/margins": 0.007411675062030554, + "rewards/margins_max": 0.012342330068349838, + "rewards/margins_min": 0.0024810179602354765, + "rewards/margins_std": 0.006973001174628735, + "rewards/rejected": -0.0015623854706063867, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 0.43359375, + "learning_rate": 5.974842767295597e-07, + "logits/chosen": -1.2821629047393799, + "logits/rejected": -1.0755701065063477, + "logps/chosen": -196.32594299316406, + "logps/rejected": -216.0081787109375, + "loss": 0.6899, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.005713514052331448, + "rewards/margins": 0.0055726682767271996, + "rewards/margins_max": 0.009180756285786629, + "rewards/margins_min": 0.001964580500498414, + "rewards/margins_std": 0.005102606490254402, + "rewards/rejected": 0.00014084615395404398, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.43359375, + "learning_rate": 6.289308176100629e-07, + "logits/chosen": -1.3140381574630737, + "logits/rejected": -1.105791687965393, + "logps/chosen": -218.8059539794922, + "logps/rejected": -207.1342010498047, + "loss": 0.6897, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.003872637404128909, + "rewards/margins": 0.0057051111944019794, + "rewards/margins_max": 0.009732077829539776, + "rewards/margins_min": 0.0016781443264335394, + "rewards/margins_std": 0.005694991443306208, + "rewards/rejected": -0.0018324736738577485, + "step": 200 + }, + { + "epoch": 0.07, + "grad_norm": 0.38671875, + "learning_rate": 6.60377358490566e-07, + "logits/chosen": -1.430407166481018, + "logits/rejected": -1.0918631553649902, + "logps/chosen": -237.2591552734375, + "logps/rejected": -253.4399871826172, + "loss": 0.6891, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0073573291301727295, + "rewards/margins": 0.007854488678276539, + "rewards/margins_max": 0.012713620439171791, + "rewards/margins_min": 0.0029953576158732176, + "rewards/margins_std": 0.006871848367154598, + "rewards/rejected": -0.0004971598973497748, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 0.333984375, + "learning_rate": 6.918238993710691e-07, + "logits/chosen": -1.3923743963241577, + "logits/rejected": -1.1259468793869019, + "logps/chosen": -275.0027770996094, + "logps/rejected": -198.38681030273438, + "loss": 0.6889, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.007775217294692993, + "rewards/margins": 0.009523289278149605, + "rewards/margins_max": 0.01376691646873951, + "rewards/margins_min": 0.005279661156237125, + "rewards/margins_std": 0.0060013956390321255, + "rewards/rejected": -0.0017480704700574279, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.4765625, + "learning_rate": 7.232704402515722e-07, + "logits/chosen": -1.4120699167251587, + "logits/rejected": -1.116763710975647, + "logps/chosen": -253.22354125976562, + "logps/rejected": -201.98855590820312, + "loss": 0.6879, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.00820042286068201, + "rewards/margins": 0.010577328503131866, + "rewards/margins_max": 0.01556847058236599, + "rewards/margins_min": 0.005586187355220318, + "rewards/margins_std": 0.00705854082480073, + "rewards/rejected": -0.0023769072722643614, + "step": 230 + }, + { + "epoch": 0.08, + "grad_norm": 0.484375, + "learning_rate": 7.547169811320754e-07, + "logits/chosen": -1.3950035572052002, + "logits/rejected": -1.2659811973571777, + "logps/chosen": -176.2644805908203, + "logps/rejected": -260.3876953125, + "loss": 0.6873, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.00841821264475584, + "rewards/margins": 0.011863857507705688, + "rewards/margins_max": 0.01701103337109089, + "rewards/margins_min": 0.006716682109981775, + "rewards/margins_std": 0.007279204670339823, + "rewards/rejected": -0.0034456446301192045, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 0.392578125, + "learning_rate": 7.861635220125787e-07, + "logits/chosen": -1.4266269207000732, + "logits/rejected": -1.1792528629302979, + "logps/chosen": -264.8574523925781, + "logps/rejected": -217.40841674804688, + "loss": 0.6877, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.009459974244236946, + "rewards/margins": 0.011690459214150906, + "rewards/margins_max": 0.01826130598783493, + "rewards/margins_min": 0.005119613837450743, + "rewards/margins_std": 0.00929258018732071, + "rewards/rejected": -0.002230485901236534, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 0.40625, + "learning_rate": 8.176100628930818e-07, + "logits/chosen": -1.2674744129180908, + "logits/rejected": -0.7844586968421936, + "logps/chosen": -282.67913818359375, + "logps/rejected": -258.6161193847656, + "loss": 0.6864, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.011977099813520908, + "rewards/margins": 0.013891148380935192, + "rewards/margins_max": 0.020919669419527054, + "rewards/margins_min": 0.00686262920498848, + "rewards/margins_std": 0.009939828887581825, + "rewards/rejected": -0.0019140491494908929, + "step": 260 + }, + { + "epoch": 0.09, + "grad_norm": 0.419921875, + "learning_rate": 8.490566037735849e-07, + "logits/chosen": -1.3822617530822754, + "logits/rejected": -0.8383499383926392, + "logps/chosen": -259.55377197265625, + "logps/rejected": -242.09774780273438, + "loss": 0.6839, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013840844854712486, + "rewards/margins": 0.020429277792572975, + "rewards/margins_max": 0.029015129432082176, + "rewards/margins_min": 0.011843429878354073, + "rewards/margins_std": 0.012142224237322807, + "rewards/rejected": -0.0065884338691830635, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 0.37890625, + "learning_rate": 8.80503144654088e-07, + "logits/chosen": -1.5422178506851196, + "logits/rejected": -1.203775405883789, + "logps/chosen": -194.75625610351562, + "logps/rejected": -190.38040161132812, + "loss": 0.6865, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01227253396064043, + "rewards/margins": 0.013298863545060158, + "rewards/margins_max": 0.018432527780532837, + "rewards/margins_min": 0.008165198378264904, + "rewards/margins_std": 0.007260097656399012, + "rewards/rejected": -0.0010263302829116583, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.30859375, + "learning_rate": 9.119496855345912e-07, + "logits/chosen": -1.457471489906311, + "logits/rejected": -1.0255846977233887, + "logps/chosen": -241.8590087890625, + "logps/rejected": -208.22671508789062, + "loss": 0.6839, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014999980106949806, + "rewards/margins": 0.020009437575936317, + "rewards/margins_max": 0.029007185250520706, + "rewards/margins_min": 0.011011689901351929, + "rewards/margins_std": 0.012724736705422401, + "rewards/rejected": -0.005009456072002649, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.369140625, + "learning_rate": 9.433962264150943e-07, + "logits/chosen": -1.6220054626464844, + "logits/rejected": -1.1889787912368774, + "logps/chosen": -226.6202850341797, + "logps/rejected": -211.44381713867188, + "loss": 0.6829, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.019335268065333366, + "rewards/margins": 0.021433234214782715, + "rewards/margins_max": 0.03181144595146179, + "rewards/margins_min": 0.011055031791329384, + "rewards/margins_std": 0.014677000232040882, + "rewards/rejected": -0.0020979673136025667, + "step": 300 + }, + { + "epoch": 0.1, + "grad_norm": 0.61328125, + "learning_rate": 9.748427672955975e-07, + "logits/chosen": -1.3852521181106567, + "logits/rejected": -1.1346018314361572, + "logps/chosen": -199.05203247070312, + "logps/rejected": -216.6929473876953, + "loss": 0.6839, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.017914317548274994, + "rewards/margins": 0.018279287964105606, + "rewards/margins_max": 0.02675766311585903, + "rewards/margins_min": 0.00980091467499733, + "rewards/margins_std": 0.011990231461822987, + "rewards/rejected": -0.0003649710270110518, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.400390625, + "learning_rate": 9.99998790006147e-07, + "logits/chosen": -1.3931553363800049, + "logits/rejected": -1.0446799993515015, + "logps/chosen": -249.444580078125, + "logps/rejected": -237.3548583984375, + "loss": 0.6836, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.015047195367515087, + "rewards/margins": 0.01817159913480282, + "rewards/margins_max": 0.028817584738135338, + "rewards/margins_min": 0.007525615394115448, + "rewards/margins_std": 0.015055695548653603, + "rewards/rejected": -0.0031244028359651566, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.4140625, + "learning_rate": 9.999564408362052e-07, + "logits/chosen": -1.484261155128479, + "logits/rejected": -0.9981343150138855, + "logps/chosen": -238.7038116455078, + "logps/rejected": -257.0067443847656, + "loss": 0.6801, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.024171117693185806, + "rewards/margins": 0.03251297399401665, + "rewards/margins_max": 0.04922250285744667, + "rewards/margins_min": 0.015803448855876923, + "rewards/margins_std": 0.023630838841199875, + "rewards/rejected": -0.008341856300830841, + "step": 330 + }, + { + "epoch": 0.11, + "grad_norm": 0.4453125, + "learning_rate": 9.998535978298279e-07, + "logits/chosen": -1.316965103149414, + "logits/rejected": -0.97691810131073, + "logps/chosen": -180.54393005371094, + "logps/rejected": -181.31243896484375, + "loss": 0.6808, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.019655724987387657, + "rewards/margins": 0.022490406408905983, + "rewards/margins_max": 0.028128573670983315, + "rewards/margins_min": 0.01685223914682865, + "rewards/margins_std": 0.007973574101924896, + "rewards/rejected": -0.0028346790932118893, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 0.48828125, + "learning_rate": 9.996902734308345e-07, + "logits/chosen": -1.3453712463378906, + "logits/rejected": -0.9177592992782593, + "logps/chosen": -262.0094299316406, + "logps/rejected": -242.67770385742188, + "loss": 0.6785, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.021431343629956245, + "rewards/margins": 0.030059820041060448, + "rewards/margins_max": 0.04707712680101395, + "rewards/margins_min": 0.01304252166301012, + "rewards/margins_std": 0.024066099897027016, + "rewards/rejected": -0.0086284838616848, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 0.4921875, + "learning_rate": 9.994664874011861e-07, + "logits/chosen": -1.6302440166473389, + "logits/rejected": -1.2027785778045654, + "logps/chosen": -275.63800048828125, + "logps/rejected": -223.16134643554688, + "loss": 0.6786, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023763367906212807, + "rewards/margins": 0.034584060311317444, + "rewards/margins_max": 0.05510062724351883, + "rewards/margins_min": 0.014067496173083782, + "rewards/margins_std": 0.02901480160653591, + "rewards/rejected": -0.010820695199072361, + "step": 360 + }, + { + "epoch": 0.12, + "grad_norm": 0.3984375, + "learning_rate": 9.991822668185925e-07, + "logits/chosen": -1.4096801280975342, + "logits/rejected": -1.0252230167388916, + "logps/chosen": -229.8339080810547, + "logps/rejected": -172.35845947265625, + "loss": 0.6745, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.026459768414497375, + "rewards/margins": 0.039466358721256256, + "rewards/margins_max": 0.05379994958639145, + "rewards/margins_min": 0.025132764130830765, + "rewards/margins_std": 0.020270761102437973, + "rewards/rejected": -0.013006587512791157, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 0.359375, + "learning_rate": 9.988376460732366e-07, + "logits/chosen": -1.3159959316253662, + "logits/rejected": -1.0778305530548096, + "logps/chosen": -227.1737518310547, + "logps/rejected": -181.856201171875, + "loss": 0.6776, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02110281027853489, + "rewards/margins": 0.03095998801290989, + "rewards/margins_max": 0.04912665858864784, + "rewards/margins_min": 0.01279332023113966, + "rewards/margins_std": 0.025691548362374306, + "rewards/rejected": -0.009857181459665298, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 0.38671875, + "learning_rate": 9.98432666863613e-07, + "logits/chosen": -1.3005465269088745, + "logits/rejected": -0.8841564059257507, + "logps/chosen": -265.8480529785156, + "logps/rejected": -214.14306640625, + "loss": 0.6753, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023480424657464027, + "rewards/margins": 0.03407277166843414, + "rewards/margins_max": 0.05221151188015938, + "rewards/margins_min": 0.015934035181999207, + "rewards/margins_std": 0.025652050971984863, + "rewards/rejected": -0.01059234980493784, + "step": 390 + }, + { + "epoch": 0.13, + "grad_norm": 0.390625, + "learning_rate": 9.979673781914829e-07, + "logits/chosen": -1.3220188617706299, + "logits/rejected": -0.9904147982597351, + "logps/chosen": -208.8026580810547, + "logps/rejected": -199.86724853515625, + "loss": 0.6786, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023009879514575005, + "rewards/margins": 0.027196567505598068, + "rewards/margins_max": 0.04160440340638161, + "rewards/margins_min": 0.012788738124072552, + "rewards/margins_std": 0.020375750958919525, + "rewards/rejected": -0.004186691250652075, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 0.55078125, + "learning_rate": 9.974418363559443e-07, + "logits/chosen": -1.4779157638549805, + "logits/rejected": -1.2148815393447876, + "logps/chosen": -191.52413940429688, + "logps/rejected": -176.9311981201172, + "loss": 0.6768, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.024117572233080864, + "rewards/margins": 0.029250899329781532, + "rewards/margins_max": 0.04297717660665512, + "rewards/margins_min": 0.015524620190262794, + "rewards/margins_std": 0.019411887973546982, + "rewards/rejected": -0.005133326631039381, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 0.39453125, + "learning_rate": 9.968561049466213e-07, + "logits/chosen": -1.3753879070281982, + "logits/rejected": -0.9945527911186218, + "logps/chosen": -254.3845672607422, + "logps/rejected": -244.81082153320312, + "loss": 0.6703, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04066943749785423, + "rewards/margins": 0.043636471033096313, + "rewards/margins_max": 0.06465307623147964, + "rewards/margins_min": 0.02261987514793873, + "rewards/margins_std": 0.02972196415066719, + "rewards/rejected": -0.0029670395888388157, + "step": 420 + }, + { + "epoch": 0.14, + "grad_norm": 0.435546875, + "learning_rate": 9.96210254835968e-07, + "logits/chosen": -1.3385958671569824, + "logits/rejected": -1.0368238687515259, + "logps/chosen": -257.49481201171875, + "logps/rejected": -251.6103515625, + "loss": 0.6703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.038941044360399246, + "rewards/margins": 0.0429171659052372, + "rewards/margins_max": 0.07056744396686554, + "rewards/margins_min": 0.015266889706254005, + "rewards/margins_std": 0.039103396236896515, + "rewards/rejected": -0.003976122476160526, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.466796875, + "learning_rate": 9.95504364170694e-07, + "logits/chosen": -1.523626685142517, + "logits/rejected": -0.9943062663078308, + "logps/chosen": -205.248291015625, + "logps/rejected": -203.47445678710938, + "loss": 0.6712, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04165233299136162, + "rewards/margins": 0.04524458199739456, + "rewards/margins_max": 0.07039403915405273, + "rewards/margins_min": 0.020095128566026688, + "rewards/margins_std": 0.03556669503450394, + "rewards/rejected": -0.0035922485403716564, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 0.5, + "learning_rate": 9.947385183623096e-07, + "logits/chosen": -1.3122961521148682, + "logits/rejected": -1.1558181047439575, + "logps/chosen": -210.66714477539062, + "logps/rejected": -231.5789031982422, + "loss": 0.6726, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03287133574485779, + "rewards/margins": 0.042133878916502, + "rewards/margins_max": 0.06311113387346268, + "rewards/margins_min": 0.02115662209689617, + "rewards/margins_std": 0.029666315764188766, + "rewards/rejected": -0.009262535721063614, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.36328125, + "learning_rate": 9.9391281007679e-07, + "logits/chosen": -1.267585039138794, + "logits/rejected": -0.9590380787849426, + "logps/chosen": -170.42410278320312, + "logps/rejected": -212.1375274658203, + "loss": 0.6685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03331318497657776, + "rewards/margins": 0.0470331646502018, + "rewards/margins_max": 0.0685221329331398, + "rewards/margins_min": 0.0255441851913929, + "rewards/margins_std": 0.030389999970793724, + "rewards/rejected": -0.013719978742301464, + "step": 460 + }, + { + "epoch": 0.15, + "grad_norm": 0.4140625, + "learning_rate": 9.930273392233624e-07, + "logits/chosen": -1.3601120710372925, + "logits/rejected": -1.0370827913284302, + "logps/chosen": -211.78121948242188, + "logps/rejected": -261.12493896484375, + "loss": 0.6665, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04147142544388771, + "rewards/margins": 0.05635608360171318, + "rewards/margins_max": 0.07943321764469147, + "rewards/margins_min": 0.03327895328402519, + "rewards/margins_std": 0.032635994255542755, + "rewards/rejected": -0.01488465815782547, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 0.53125, + "learning_rate": 9.920822129424189e-07, + "logits/chosen": -1.3620996475219727, + "logits/rejected": -1.0208871364593506, + "logps/chosen": -181.54205322265625, + "logps/rejected": -218.82913208007812, + "loss": 0.6703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03728248178958893, + "rewards/margins": 0.047586847096681595, + "rewards/margins_max": 0.06849299371242523, + "rewards/margins_min": 0.026680713519454002, + "rewards/margins_std": 0.02956574596464634, + "rewards/rejected": -0.010304367169737816, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.4375, + "learning_rate": 9.910775455925517e-07, + "logits/chosen": -1.4793182611465454, + "logits/rejected": -1.2021210193634033, + "logps/chosen": -174.04185485839844, + "logps/rejected": -171.1067352294922, + "loss": 0.6724, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02975636161863804, + "rewards/margins": 0.03999444097280502, + "rewards/margins_max": 0.05971541255712509, + "rewards/margins_min": 0.020273465663194656, + "rewards/margins_std": 0.02788967452943325, + "rewards/rejected": -0.01023807656019926, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 0.39453125, + "learning_rate": 9.90013458736716e-07, + "logits/chosen": -1.6029140949249268, + "logits/rejected": -1.194563865661621, + "logps/chosen": -213.6199493408203, + "logps/rejected": -206.2983856201172, + "loss": 0.6673, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0341024175286293, + "rewards/margins": 0.05521542578935623, + "rewards/margins_max": 0.07687483727931976, + "rewards/margins_min": 0.0335560217499733, + "rewards/margins_std": 0.03063102997839451, + "rewards/rejected": -0.02111300453543663, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 0.373046875, + "learning_rate": 9.888900811275203e-07, + "logits/chosen": -1.3525406122207642, + "logits/rejected": -1.0656477212905884, + "logps/chosen": -206.7255401611328, + "logps/rejected": -196.1541290283203, + "loss": 0.6665, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.030226921662688255, + "rewards/margins": 0.05328403785824776, + "rewards/margins_max": 0.06973598897457123, + "rewards/margins_min": 0.036832086741924286, + "rewards/margins_std": 0.02326657809317112, + "rewards/rejected": -0.0230571199208498, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.36328125, + "learning_rate": 9.877075486916496e-07, + "logits/chosen": -1.3600490093231201, + "logits/rejected": -1.037398099899292, + "logps/chosen": -177.47750854492188, + "logps/rejected": -182.17776489257812, + "loss": 0.6732, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03540753945708275, + "rewards/margins": 0.04670108109712601, + "rewards/margins_max": 0.07210978865623474, + "rewards/margins_min": 0.02129237726330757, + "rewards/margins_std": 0.03593333810567856, + "rewards/rejected": -0.011293541640043259, + "step": 520 + }, + { + "epoch": 0.17, + "grad_norm": 0.4765625, + "learning_rate": 9.864660045134162e-07, + "logits/chosen": -1.3215343952178955, + "logits/rejected": -1.0994096994400024, + "logps/chosen": -210.7837371826172, + "logps/rejected": -205.7008056640625, + "loss": 0.6662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03428806737065315, + "rewards/margins": 0.06019355729222298, + "rewards/margins_max": 0.08791188150644302, + "rewards/margins_min": 0.03247522935271263, + "rewards/margins_std": 0.03919963538646698, + "rewards/rejected": -0.025905489921569824, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 0.45703125, + "learning_rate": 9.851655988174489e-07, + "logits/chosen": -1.47915780544281, + "logits/rejected": -0.9851093292236328, + "logps/chosen": -194.7159881591797, + "logps/rejected": -213.84048461914062, + "loss": 0.6662, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.041106559336185455, + "rewards/margins": 0.054338376969099045, + "rewards/margins_max": 0.0830872431397438, + "rewards/margins_min": 0.025589507073163986, + "rewards/margins_std": 0.04065703600645065, + "rewards/rejected": -0.013231811113655567, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.421875, + "learning_rate": 9.83806488950514e-07, + "logits/chosen": -1.4582946300506592, + "logits/rejected": -1.2372512817382812, + "logps/chosen": -231.83242797851562, + "logps/rejected": -204.83743286132812, + "loss": 0.6648, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03986522555351257, + "rewards/margins": 0.05911695212125778, + "rewards/margins_max": 0.09305725246667862, + "rewards/margins_min": 0.02517666481435299, + "rewards/margins_std": 0.0479988157749176, + "rewards/rejected": -0.019251730293035507, + "step": 550 + }, + { + "epoch": 0.18, + "grad_norm": 0.34765625, + "learning_rate": 9.82388839362478e-07, + "logits/chosen": -1.4161055088043213, + "logits/rejected": -1.1000274419784546, + "logps/chosen": -198.75758361816406, + "logps/rejected": -205.18826293945312, + "loss": 0.6659, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04866677150130272, + "rewards/margins": 0.0624736063182354, + "rewards/margins_max": 0.09315863996744156, + "rewards/margins_min": 0.03178856521844864, + "rewards/margins_std": 0.04339519888162613, + "rewards/rejected": -0.013806832954287529, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 0.462890625, + "learning_rate": 9.809128215864096e-07, + "logits/chosen": -1.2735754251480103, + "logits/rejected": -0.9051140546798706, + "logps/chosen": -270.4387512207031, + "logps/rejected": -241.47079467773438, + "loss": 0.6665, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03088981844484806, + "rewards/margins": 0.05677186697721481, + "rewards/margins_max": 0.08117306232452393, + "rewards/margins_min": 0.0323706679046154, + "rewards/margins_std": 0.03450850397348404, + "rewards/rejected": -0.025882050395011902, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 0.4453125, + "learning_rate": 9.79378614217823e-07, + "logits/chosen": -1.354640245437622, + "logits/rejected": -1.0751674175262451, + "logps/chosen": -227.79849243164062, + "logps/rejected": -248.08642578125, + "loss": 0.6527, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04743616655468941, + "rewards/margins": 0.0788937360048294, + "rewards/margins_max": 0.11354710906744003, + "rewards/margins_min": 0.04424036294221878, + "rewards/margins_std": 0.04900727421045303, + "rewards/rejected": -0.0314575657248497, + "step": 580 + }, + { + "epoch": 0.19, + "grad_norm": 0.4140625, + "learning_rate": 9.777864028930705e-07, + "logits/chosen": -1.322510004043579, + "logits/rejected": -1.0033257007598877, + "logps/chosen": -206.4609375, + "logps/rejected": -225.65158081054688, + "loss": 0.6621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03883618488907814, + "rewards/margins": 0.06851398199796677, + "rewards/margins_max": 0.08924350887537003, + "rewards/margins_min": 0.04778445512056351, + "rewards/margins_std": 0.029315978288650513, + "rewards/rejected": -0.029677793383598328, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 0.515625, + "learning_rate": 9.76136380266878e-07, + "logits/chosen": -1.431841492652893, + "logits/rejected": -1.2090071439743042, + "logps/chosen": -291.49847412109375, + "logps/rejected": -261.60888671875, + "loss": 0.6575, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.043944261968135834, + "rewards/margins": 0.07684491574764252, + "rewards/margins_max": 0.10341174900531769, + "rewards/margins_min": 0.05027808994054794, + "rewards/margins_std": 0.03757117688655853, + "rewards/rejected": -0.03290066123008728, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 0.423828125, + "learning_rate": 9.744287459890369e-07, + "logits/chosen": -1.2307569980621338, + "logits/rejected": -0.8211034536361694, + "logps/chosen": -216.7576141357422, + "logps/rejected": -194.070556640625, + "loss": 0.6609, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.039227746427059174, + "rewards/margins": 0.07264076173305511, + "rewards/margins_max": 0.10165262222290039, + "rewards/margins_min": 0.043628908693790436, + "rewards/margins_std": 0.04102896526455879, + "rewards/rejected": -0.03341301903128624, + "step": 610 + }, + { + "epoch": 0.2, + "grad_norm": 0.5, + "learning_rate": 9.726637066802446e-07, + "logits/chosen": -1.3241937160491943, + "logits/rejected": -0.8831518292427063, + "logps/chosen": -244.5774383544922, + "logps/rejected": -273.13427734375, + "loss": 0.6549, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04124955087900162, + "rewards/margins": 0.07716774940490723, + "rewards/margins_max": 0.12114681303501129, + "rewards/margins_min": 0.03318869322538376, + "rewards/margins_std": 0.062195777893066406, + "rewards/rejected": -0.03591819852590561, + "step": 620 + }, + { + "epoch": 0.2, + "grad_norm": 0.32421875, + "learning_rate": 9.708414759071057e-07, + "logits/chosen": -1.4460439682006836, + "logits/rejected": -1.0423699617385864, + "logps/chosen": -252.99484252929688, + "logps/rejected": -239.76608276367188, + "loss": 0.6609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03034689649939537, + "rewards/margins": 0.06652723252773285, + "rewards/margins_max": 0.0981488898396492, + "rewards/margins_min": 0.03490559384226799, + "rewards/margins_std": 0.044719766825437546, + "rewards/rejected": -0.03618033975362778, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 0.4140625, + "learning_rate": 9.689622741562891e-07, + "logits/chosen": -1.4432752132415771, + "logits/rejected": -1.0086311101913452, + "logps/chosen": -242.03866577148438, + "logps/rejected": -226.80911254882812, + "loss": 0.6593, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05991531163454056, + "rewards/margins": 0.08118681609630585, + "rewards/margins_max": 0.11632559448480606, + "rewards/margins_min": 0.046048033982515335, + "rewards/margins_std": 0.049693748354911804, + "rewards/rejected": -0.021271510049700737, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 0.455078125, + "learning_rate": 9.670263288078503e-07, + "logits/chosen": -1.4905498027801514, + "logits/rejected": -0.9651784896850586, + "logps/chosen": -337.162353515625, + "logps/rejected": -227.63101196289062, + "loss": 0.6499, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04678003862500191, + "rewards/margins": 0.09588982164859772, + "rewards/margins_max": 0.12638990581035614, + "rewards/margins_min": 0.0653897300362587, + "rewards/margins_std": 0.043133631348609924, + "rewards/rejected": -0.04910977929830551, + "step": 650 + }, + { + "epoch": 0.21, + "grad_norm": 0.337890625, + "learning_rate": 9.650338741077188e-07, + "logits/chosen": -1.2945762872695923, + "logits/rejected": -1.0813788175582886, + "logps/chosen": -229.5716552734375, + "logps/rejected": -247.0192108154297, + "loss": 0.6584, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04455048590898514, + "rewards/margins": 0.06444776803255081, + "rewards/margins_max": 0.10383981466293335, + "rewards/margins_min": 0.025055717676877975, + "rewards/margins_std": 0.05570877343416214, + "rewards/rejected": -0.019897282123565674, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 0.484375, + "learning_rate": 9.629851511393555e-07, + "logits/chosen": -1.4708611965179443, + "logits/rejected": -0.9937572479248047, + "logps/chosen": -286.068603515625, + "logps/rejected": -254.18637084960938, + "loss": 0.6544, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04879416525363922, + "rewards/margins": 0.07758750021457672, + "rewards/margins_max": 0.10908614099025726, + "rewards/margins_min": 0.04608884081244469, + "rewards/margins_std": 0.044545818120241165, + "rewards/rejected": -0.0287933312356472, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 0.5625, + "learning_rate": 9.608804077945797e-07, + "logits/chosen": -1.4359239339828491, + "logits/rejected": -1.0322556495666504, + "logps/chosen": -269.5269470214844, + "logps/rejected": -274.97137451171875, + "loss": 0.6522, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.051246218383312225, + "rewards/margins": 0.07667367160320282, + "rewards/margins_max": 0.10428421199321747, + "rewards/margins_min": 0.049063123762607574, + "rewards/margins_std": 0.03904721140861511, + "rewards/rejected": -0.025427449494600296, + "step": 680 + }, + { + "epoch": 0.22, + "grad_norm": 0.296875, + "learning_rate": 9.58719898743578e-07, + "logits/chosen": -1.348769187927246, + "logits/rejected": -0.9472673535346985, + "logps/chosen": -255.1873016357422, + "logps/rejected": -228.0542755126953, + "loss": 0.6554, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04409017041325569, + "rewards/margins": 0.08384691178798676, + "rewards/margins_max": 0.11428749561309814, + "rewards/margins_min": 0.05340634658932686, + "rewards/margins_std": 0.04304947704076767, + "rewards/rejected": -0.03975675255060196, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 0.5546875, + "learning_rate": 9.565038854040865e-07, + "logits/chosen": -1.3250253200531006, + "logits/rejected": -0.9937537908554077, + "logps/chosen": -203.81344604492188, + "logps/rejected": -221.4913787841797, + "loss": 0.6536, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.028742622584104538, + "rewards/margins": 0.07320950925350189, + "rewards/margins_max": 0.11118922382593155, + "rewards/margins_min": 0.03522980213165283, + "rewards/margins_std": 0.0537114143371582, + "rewards/rejected": -0.04446689039468765, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.341796875, + "learning_rate": 9.542326359097617e-07, + "logits/chosen": -1.405347228050232, + "logits/rejected": -0.9768081903457642, + "logps/chosen": -331.59649658203125, + "logps/rejected": -235.75888061523438, + "loss": 0.6429, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.046662431210279465, + "rewards/margins": 0.10430089384317398, + "rewards/margins_max": 0.14537875354290009, + "rewards/margins_min": 0.06322301924228668, + "rewards/margins_std": 0.058092884719371796, + "rewards/rejected": -0.05763845518231392, + "step": 710 + }, + { + "epoch": 0.23, + "grad_norm": 0.443359375, + "learning_rate": 9.51906425077736e-07, + "logits/chosen": -1.232671856880188, + "logits/rejected": -0.8695996999740601, + "logps/chosen": -260.71917724609375, + "logps/rejected": -237.81472778320312, + "loss": 0.6456, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06164161115884781, + "rewards/margins": 0.11577005684375763, + "rewards/margins_max": 0.18085698783397675, + "rewards/margins_min": 0.05068312957882881, + "rewards/margins_std": 0.092046819627285, + "rewards/rejected": -0.054128456860780716, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.42578125, + "learning_rate": 9.495255343753657e-07, + "logits/chosen": -1.4524829387664795, + "logits/rejected": -1.0947620868682861, + "logps/chosen": -204.1517333984375, + "logps/rejected": -222.0264129638672, + "loss": 0.6525, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04527204856276512, + "rewards/margins": 0.09181281924247742, + "rewards/margins_max": 0.13302180171012878, + "rewards/margins_min": 0.050603847950696945, + "rewards/margins_std": 0.058278292417526245, + "rewards/rejected": -0.046540774405002594, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 0.455078125, + "learning_rate": 9.470902518861731e-07, + "logits/chosen": -1.3769677877426147, + "logits/rejected": -1.1935275793075562, + "logps/chosen": -191.8888397216797, + "logps/rejected": -201.0779571533203, + "loss": 0.6528, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03776038438081741, + "rewards/margins": 0.08057109266519547, + "rewards/margins_max": 0.12366624921560287, + "rewards/margins_min": 0.03747590631246567, + "rewards/margins_std": 0.0609457865357399, + "rewards/rejected": -0.042810700833797455, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 0.455078125, + "learning_rate": 9.446008722749905e-07, + "logits/chosen": -1.3329544067382812, + "logits/rejected": -1.0925482511520386, + "logps/chosen": -228.0487060546875, + "logps/rejected": -202.3518524169922, + "loss": 0.6537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03692751005291939, + "rewards/margins": 0.08129950612783432, + "rewards/margins_max": 0.12913137674331665, + "rewards/margins_min": 0.03346762806177139, + "rewards/margins_std": 0.06764448434114456, + "rewards/rejected": -0.04437199607491493, + "step": 750 + }, + { + "epoch": 0.24, + "grad_norm": 0.392578125, + "learning_rate": 9.420576967523048e-07, + "logits/chosen": -1.1731605529785156, + "logits/rejected": -0.8517470359802246, + "logps/chosen": -237.1516571044922, + "logps/rejected": -215.74270629882812, + "loss": 0.6573, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.034002404659986496, + "rewards/margins": 0.08593714237213135, + "rewards/margins_max": 0.12129978835582733, + "rewards/margins_min": 0.05057450011372566, + "rewards/margins_std": 0.05001033470034599, + "rewards/rejected": -0.05193474888801575, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 0.423828125, + "learning_rate": 9.394610330378124e-07, + "logits/chosen": -1.4420990943908691, + "logits/rejected": -1.063472032546997, + "logps/chosen": -228.5890655517578, + "logps/rejected": -231.9294891357422, + "loss": 0.6502, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.049244366586208344, + "rewards/margins": 0.08763855695724487, + "rewards/margins_max": 0.13933846354484558, + "rewards/margins_min": 0.03593864664435387, + "rewards/margins_std": 0.07311470806598663, + "rewards/rejected": -0.03839418664574623, + "step": 770 + }, + { + "epoch": 0.25, + "grad_norm": 0.45703125, + "learning_rate": 9.368111953231847e-07, + "logits/chosen": -1.4083888530731201, + "logits/rejected": -0.9682759046554565, + "logps/chosen": -223.3924560546875, + "logps/rejected": -241.82778930664062, + "loss": 0.6447, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06056595593690872, + "rewards/margins": 0.12897726893424988, + "rewards/margins_max": 0.17253781855106354, + "rewards/margins_min": 0.08541673421859741, + "rewards/margins_std": 0.06160390377044678, + "rewards/rejected": -0.06841133534908295, + "step": 780 + }, + { + "epoch": 0.25, + "grad_norm": 0.56640625, + "learning_rate": 9.341085042340531e-07, + "logits/chosen": -1.4172804355621338, + "logits/rejected": -1.1236498355865479, + "logps/chosen": -171.3277130126953, + "logps/rejected": -193.0868682861328, + "loss": 0.6512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03927486762404442, + "rewards/margins": 0.08602278679609299, + "rewards/margins_max": 0.12318630516529083, + "rewards/margins_min": 0.04885926470160484, + "rewards/margins_std": 0.05255715921521187, + "rewards/rejected": -0.04674791917204857, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 0.451171875, + "learning_rate": 9.313532867912124e-07, + "logits/chosen": -1.413651466369629, + "logits/rejected": -0.906880259513855, + "logps/chosen": -246.77468872070312, + "logps/rejected": -223.56912231445312, + "loss": 0.6408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.055256299674510956, + "rewards/margins": 0.12263475358486176, + "rewards/margins_max": 0.17601759731769562, + "rewards/margins_min": 0.06925191730260849, + "rewards/margins_std": 0.07549472898244858, + "rewards/rejected": -0.0673784539103508, + "step": 800 + }, + { + "epoch": 0.26, + "grad_norm": 0.36328125, + "learning_rate": 9.285458763710523e-07, + "logits/chosen": -1.436037302017212, + "logits/rejected": -1.0014102458953857, + "logps/chosen": -231.8644561767578, + "logps/rejected": -184.65553283691406, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06529637426137924, + "rewards/margins": 0.0870235487818718, + "rewards/margins_max": 0.12472760677337646, + "rewards/margins_min": 0.04931949824094772, + "rewards/margins_std": 0.05332158878445625, + "rewards/rejected": -0.021727172657847404, + "step": 810 + }, + { + "epoch": 0.26, + "grad_norm": 0.5, + "learning_rate": 9.256866126652199e-07, + "logits/chosen": -1.33461594581604, + "logits/rejected": -0.9908684492111206, + "logps/chosen": -201.02322387695312, + "logps/rejected": -202.8111572265625, + "loss": 0.6461, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.031793173402547836, + "rewards/margins": 0.08713125437498093, + "rewards/margins_max": 0.12756696343421936, + "rewards/margins_min": 0.04669555649161339, + "rewards/margins_std": 0.05718470364809036, + "rewards/rejected": -0.05533808469772339, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 0.4453125, + "learning_rate": 9.227758416395169e-07, + "logits/chosen": -1.4254963397979736, + "logits/rejected": -1.0587215423583984, + "logps/chosen": -221.55386352539062, + "logps/rejected": -192.1359405517578, + "loss": 0.6463, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05291753262281418, + "rewards/margins": 0.09469744563102722, + "rewards/margins_max": 0.14799687266349792, + "rewards/margins_min": 0.04139800742268562, + "rewards/margins_std": 0.07537679374217987, + "rewards/rejected": -0.04177991300821304, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 0.451171875, + "learning_rate": 9.198139154920388e-07, + "logits/chosen": -1.4358642101287842, + "logits/rejected": -1.1785621643066406, + "logps/chosen": -278.039306640625, + "logps/rejected": -247.2188720703125, + "loss": 0.6388, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06873653084039688, + "rewards/margins": 0.11557067930698395, + "rewards/margins_max": 0.16702882945537567, + "rewards/margins_min": 0.06411250680685043, + "rewards/margins_std": 0.07277283817529678, + "rewards/rejected": -0.04683414846658707, + "step": 840 + }, + { + "epoch": 0.27, + "grad_norm": 0.36328125, + "learning_rate": 9.168011926105597e-07, + "logits/chosen": -1.5257022380828857, + "logits/rejected": -1.1070044040679932, + "logps/chosen": -201.38771057128906, + "logps/rejected": -222.58096313476562, + "loss": 0.6444, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05585918575525284, + "rewards/margins": 0.10483954101800919, + "rewards/margins_max": 0.1490497589111328, + "rewards/margins_min": 0.060629308223724365, + "rewards/margins_std": 0.06252270191907883, + "rewards/rejected": -0.04898035526275635, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 0.50390625, + "learning_rate": 9.137380375291677e-07, + "logits/chosen": -1.4331508874893188, + "logits/rejected": -1.0888936519622803, + "logps/chosen": -208.6669921875, + "logps/rejected": -228.1973419189453, + "loss": 0.6464, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.053120195865631104, + "rewards/margins": 0.1023564338684082, + "rewards/margins_max": 0.15425564348697662, + "rewards/margins_min": 0.05045723915100098, + "rewards/margins_std": 0.07339654862880707, + "rewards/rejected": -0.0492362454533577, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 0.55078125, + "learning_rate": 9.106248208841568e-07, + "logits/chosen": -1.3399418592453003, + "logits/rejected": -1.1214873790740967, + "logps/chosen": -204.1221466064453, + "logps/rejected": -213.8696746826172, + "loss": 0.6472, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03532610088586807, + "rewards/margins": 0.10521572828292847, + "rewards/margins_max": 0.14728297293186188, + "rewards/margins_min": 0.06314849108457565, + "rewards/margins_std": 0.05949206277728081, + "rewards/rejected": -0.06988963484764099, + "step": 870 + }, + { + "epoch": 0.28, + "grad_norm": 0.376953125, + "learning_rate": 9.07461919369181e-07, + "logits/chosen": -1.4420771598815918, + "logits/rejected": -1.1662547588348389, + "logps/chosen": -201.39486694335938, + "logps/rejected": -187.26492309570312, + "loss": 0.6419, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.051577143371105194, + "rewards/margins": 0.10872938483953476, + "rewards/margins_max": 0.15409475564956665, + "rewards/margins_min": 0.06336402893066406, + "rewards/margins_std": 0.06415630877017975, + "rewards/rejected": -0.05715225264430046, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.380859375, + "learning_rate": 9.042497156896746e-07, + "logits/chosen": -1.311943769454956, + "logits/rejected": -1.2416942119598389, + "logps/chosen": -165.70957946777344, + "logps/rejected": -237.78829956054688, + "loss": 0.6482, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.036909494549036026, + "rewards/margins": 0.07109662890434265, + "rewards/margins_max": 0.10757263749837875, + "rewards/margins_min": 0.03462062403559685, + "rewards/margins_std": 0.05158485844731331, + "rewards/rejected": -0.034187134355306625, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 0.4296875, + "learning_rate": 9.009885985165464e-07, + "logits/chosen": -1.3028219938278198, + "logits/rejected": -1.0853294134140015, + "logps/chosen": -199.5616912841797, + "logps/rejected": -219.22830200195312, + "loss": 0.6427, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05228592827916145, + "rewards/margins": 0.08949766308069229, + "rewards/margins_max": 0.13855160772800446, + "rewards/margins_min": 0.04044371843338013, + "rewards/margins_std": 0.0693727508187294, + "rewards/rejected": -0.03721173480153084, + "step": 900 + }, + { + "epoch": 0.29, + "grad_norm": 0.38671875, + "learning_rate": 8.976789624391497e-07, + "logits/chosen": -1.3990576267242432, + "logits/rejected": -1.134710669517517, + "logps/chosen": -168.1616668701172, + "logps/rejected": -225.7877197265625, + "loss": 0.6429, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.051915861666202545, + "rewards/margins": 0.10637024790048599, + "rewards/margins_max": 0.1688658893108368, + "rewards/margins_min": 0.043874599039554596, + "rewards/margins_std": 0.08838219195604324, + "rewards/rejected": -0.054454393684864044, + "step": 910 + }, + { + "epoch": 0.29, + "grad_norm": 0.494140625, + "learning_rate": 8.94321207917539e-07, + "logits/chosen": -1.4574586153030396, + "logits/rejected": -1.2704302072525024, + "logps/chosen": -248.19015502929688, + "logps/rejected": -254.5595245361328, + "loss": 0.6465, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.051451824605464935, + "rewards/margins": 0.08263147622346878, + "rewards/margins_max": 0.12576524913311005, + "rewards/margins_min": 0.03949769213795662, + "rewards/margins_std": 0.06100037693977356, + "rewards/rejected": -0.03117964044213295, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.3515625, + "learning_rate": 8.909157412340149e-07, + "logits/chosen": -1.4800761938095093, + "logits/rejected": -1.0025885105133057, + "logps/chosen": -189.4911651611328, + "logps/rejected": -236.02151489257812, + "loss": 0.6394, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0598456934094429, + "rewards/margins": 0.11991927772760391, + "rewards/margins_max": 0.1739063560962677, + "rewards/margins_min": 0.06593217700719833, + "rewards/margins_std": 0.07634927332401276, + "rewards/rejected": -0.060073576867580414, + "step": 930 + }, + { + "epoch": 0.3, + "grad_norm": 0.51953125, + "learning_rate": 8.874629744439637e-07, + "logits/chosen": -1.3226550817489624, + "logits/rejected": -1.000619888305664, + "logps/chosen": -201.4481964111328, + "logps/rejected": -239.4997100830078, + "loss": 0.6387, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.051911354064941406, + "rewards/margins": 0.14231614768505096, + "rewards/margins_max": 0.22301022708415985, + "rewards/margins_min": 0.061622101813554764, + "rewards/margins_std": 0.11411863565444946, + "rewards/rejected": -0.09040482342243195, + "step": 940 + }, + { + "epoch": 0.3, + "grad_norm": 0.55078125, + "learning_rate": 8.839633253260005e-07, + "logits/chosen": -1.5083190202713013, + "logits/rejected": -1.0948280096054077, + "logps/chosen": -222.4230194091797, + "logps/rejected": -244.3561248779297, + "loss": 0.6331, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05216183513402939, + "rewards/margins": 0.12567996978759766, + "rewards/margins_max": 0.18674659729003906, + "rewards/margins_min": 0.06461338698863983, + "rewards/margins_std": 0.08636122196912766, + "rewards/rejected": -0.07351814210414886, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 0.462890625, + "learning_rate": 8.804172173314184e-07, + "logits/chosen": -1.5267778635025024, + "logits/rejected": -1.0082361698150635, + "logps/chosen": -195.0020294189453, + "logps/rejected": -216.0598907470703, + "loss": 0.6426, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04607005417346954, + "rewards/margins": 0.10273507982492447, + "rewards/margins_max": 0.15412285923957825, + "rewards/margins_min": 0.05134730786085129, + "rewards/margins_std": 0.07267327606678009, + "rewards/rejected": -0.05666501447558403, + "step": 960 + }, + { + "epoch": 0.31, + "grad_norm": 0.49609375, + "learning_rate": 8.768250795329517e-07, + "logits/chosen": -1.4357895851135254, + "logits/rejected": -1.077530860900879, + "logps/chosen": -193.4352264404297, + "logps/rejected": -194.35830688476562, + "loss": 0.6363, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.06332554668188095, + "rewards/margins": 0.12346392869949341, + "rewards/margins_max": 0.18960857391357422, + "rewards/margins_min": 0.0573192834854126, + "rewards/margins_std": 0.09354265034198761, + "rewards/rejected": -0.06013838201761246, + "step": 970 + }, + { + "epoch": 0.31, + "grad_norm": 0.30078125, + "learning_rate": 8.731873465728583e-07, + "logits/chosen": -1.2073941230773926, + "logits/rejected": -1.077758550643921, + "logps/chosen": -210.8450164794922, + "logps/rejected": -259.5902404785156, + "loss": 0.6416, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.039309777319431305, + "rewards/margins": 0.09090758115053177, + "rewards/margins_max": 0.13156287372112274, + "rewards/margins_min": 0.050252266228199005, + "rewards/margins_std": 0.05749528482556343, + "rewards/rejected": -0.05159779265522957, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 0.55078125, + "learning_rate": 8.695044586103295e-07, + "logits/chosen": -1.404300332069397, + "logits/rejected": -1.14784836769104, + "logps/chosen": -208.1896209716797, + "logps/rejected": -215.95730590820312, + "loss": 0.6418, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0341053307056427, + "rewards/margins": 0.09708726406097412, + "rewards/margins_max": 0.14782105386257172, + "rewards/margins_min": 0.04635345935821533, + "rewards/margins_std": 0.07174843549728394, + "rewards/rejected": -0.06298193335533142, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 0.40234375, + "learning_rate": 8.657768612682315e-07, + "logits/chosen": -1.6107196807861328, + "logits/rejected": -1.1812701225280762, + "logps/chosen": -211.580810546875, + "logps/rejected": -204.4039764404297, + "loss": 0.636, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04763215780258179, + "rewards/margins": 0.11304394900798798, + "rewards/margins_max": 0.16250091791152954, + "rewards/margins_min": 0.06358698755502701, + "rewards/margins_std": 0.06994272023439407, + "rewards/rejected": -0.06541179120540619, + "step": 1000 + }, + { + "epoch": 0.32, + "grad_norm": 0.453125, + "learning_rate": 8.62005005579185e-07, + "logits/chosen": -1.4514292478561401, + "logits/rejected": -1.1223483085632324, + "logps/chosen": -213.6953125, + "logps/rejected": -233.45425415039062, + "loss": 0.6357, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.055231235921382904, + "rewards/margins": 0.11501912772655487, + "rewards/margins_max": 0.16858163475990295, + "rewards/margins_min": 0.06145660951733589, + "rewards/margins_std": 0.07574883848428726, + "rewards/rejected": -0.05978789180517197, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 0.48046875, + "learning_rate": 8.581893479309924e-07, + "logits/chosen": -1.3202977180480957, + "logits/rejected": -0.9375591278076172, + "logps/chosen": -244.69229125976562, + "logps/rejected": -231.681640625, + "loss": 0.6353, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04148910194635391, + "rewards/margins": 0.11631004512310028, + "rewards/margins_max": 0.1614200919866562, + "rewards/margins_min": 0.07120002061128616, + "rewards/margins_std": 0.06379522383213043, + "rewards/rejected": -0.07482095062732697, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 0.34375, + "learning_rate": 8.543303500114141e-07, + "logits/chosen": -1.4135452508926392, + "logits/rejected": -1.2024281024932861, + "logps/chosen": -185.69488525390625, + "logps/rejected": -222.9813995361328, + "loss": 0.6506, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.036480437964200974, + "rewards/margins": 0.0974053293466568, + "rewards/margins_max": 0.14427343010902405, + "rewards/margins_min": 0.050537217408418655, + "rewards/margins_std": 0.0662815272808075, + "rewards/rejected": -0.06092488765716553, + "step": 1030 + }, + { + "epoch": 0.33, + "grad_norm": 0.5390625, + "learning_rate": 8.504284787523066e-07, + "logits/chosen": -1.4746744632720947, + "logits/rejected": -1.0196959972381592, + "logps/chosen": -249.49453735351562, + "logps/rejected": -290.43304443359375, + "loss": 0.6352, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04987958446145058, + "rewards/margins": 0.13131950795650482, + "rewards/margins_max": 0.1925027072429657, + "rewards/margins_min": 0.07013632357120514, + "rewards/margins_std": 0.08652608841657639, + "rewards/rejected": -0.08143992722034454, + "step": 1040 + }, + { + "epoch": 0.33, + "grad_norm": 0.416015625, + "learning_rate": 8.464842062731234e-07, + "logits/chosen": -1.3476040363311768, + "logits/rejected": -1.1598175764083862, + "logps/chosen": -179.7411651611328, + "logps/rejected": -220.35177612304688, + "loss": 0.6394, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03946131095290184, + "rewards/margins": 0.12150558084249496, + "rewards/margins_max": 0.17859359085559845, + "rewards/margins_min": 0.06441758573055267, + "rewards/margins_std": 0.08073462545871735, + "rewards/rejected": -0.08204427361488342, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 0.37890625, + "learning_rate": 8.424980098237902e-07, + "logits/chosen": -1.4661829471588135, + "logits/rejected": -1.1622331142425537, + "logps/chosen": -197.33993530273438, + "logps/rejected": -245.0823211669922, + "loss": 0.64, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04446860030293465, + "rewards/margins": 0.1190033107995987, + "rewards/margins_max": 0.17709189653396606, + "rewards/margins_min": 0.06091470643877983, + "rewards/margins_std": 0.0821496844291687, + "rewards/rejected": -0.07453471422195435, + "step": 1060 + }, + { + "epoch": 0.34, + "grad_norm": 0.345703125, + "learning_rate": 8.384703717269583e-07, + "logits/chosen": -1.4809454679489136, + "logits/rejected": -1.0145283937454224, + "logps/chosen": -208.355712890625, + "logps/rejected": -197.4699249267578, + "loss": 0.6417, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04945468157529831, + "rewards/margins": 0.12217775732278824, + "rewards/margins_max": 0.17732298374176025, + "rewards/margins_min": 0.06703253090381622, + "rewards/margins_std": 0.07798713445663452, + "rewards/rejected": -0.07272306829690933, + "step": 1070 + }, + { + "epoch": 0.34, + "grad_norm": 0.53125, + "learning_rate": 8.344017793196442e-07, + "logits/chosen": -1.3273048400878906, + "logits/rejected": -1.0948415994644165, + "logps/chosen": -199.98806762695312, + "logps/rejected": -194.45005798339844, + "loss": 0.6375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04742967337369919, + "rewards/margins": 0.13943785429000854, + "rewards/margins_max": 0.18036355078220367, + "rewards/margins_min": 0.0985121950507164, + "rewards/margins_std": 0.057877641171216965, + "rewards/rejected": -0.09200819581747055, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 0.435546875, + "learning_rate": 8.302927248942626e-07, + "logits/chosen": -1.347947359085083, + "logits/rejected": -0.8450009226799011, + "logps/chosen": -309.2386474609375, + "logps/rejected": -231.5939483642578, + "loss": 0.6228, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06135648488998413, + "rewards/margins": 0.17102745175361633, + "rewards/margins_max": 0.2658650279045105, + "rewards/margins_min": 0.07618991285562515, + "rewards/margins_std": 0.13412055373191833, + "rewards/rejected": -0.1096709817647934, + "step": 1090 + }, + { + "epoch": 0.35, + "grad_norm": 0.40234375, + "learning_rate": 8.261437056390606e-07, + "logits/chosen": -1.509397268295288, + "logits/rejected": -1.0450458526611328, + "logps/chosen": -227.46694946289062, + "logps/rejected": -230.9382781982422, + "loss": 0.6316, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.06662856787443161, + "rewards/margins": 0.14657410979270935, + "rewards/margins_max": 0.20933778584003448, + "rewards/margins_min": 0.0838104858994484, + "rewards/margins_std": 0.08876121044158936, + "rewards/rejected": -0.07994556427001953, + "step": 1100 + }, + { + "epoch": 0.35, + "grad_norm": 0.345703125, + "learning_rate": 8.219552235779578e-07, + "logits/chosen": -1.3927332162857056, + "logits/rejected": -0.9824946522712708, + "logps/chosen": -254.88949584960938, + "logps/rejected": -268.16265869140625, + "loss": 0.6306, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06077701598405838, + "rewards/margins": 0.13684354722499847, + "rewards/margins_max": 0.18205778300762177, + "rewards/margins_min": 0.09162933379411697, + "rewards/margins_std": 0.0639425665140152, + "rewards/rejected": -0.07606653869152069, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 0.380859375, + "learning_rate": 8.177277855098032e-07, + "logits/chosen": -1.4236234426498413, + "logits/rejected": -1.0470139980316162, + "logps/chosen": -255.99038696289062, + "logps/rejected": -229.17282104492188, + "loss": 0.6393, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.044217340648174286, + "rewards/margins": 0.12237548828125, + "rewards/margins_max": 0.1730232536792755, + "rewards/margins_min": 0.0717277079820633, + "rewards/margins_std": 0.07162677496671677, + "rewards/rejected": -0.07815815508365631, + "step": 1120 + }, + { + "epoch": 0.36, + "grad_norm": 0.375, + "learning_rate": 8.134619029470533e-07, + "logits/chosen": -1.3011561632156372, + "logits/rejected": -0.9734943509101868, + "logps/chosen": -252.0817413330078, + "logps/rejected": -222.7303466796875, + "loss": 0.6262, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04272838681936264, + "rewards/margins": 0.1569289267063141, + "rewards/margins_max": 0.21832945942878723, + "rewards/margins_min": 0.09552840888500214, + "rewards/margins_std": 0.08683343231678009, + "rewards/rejected": -0.11420054733753204, + "step": 1130 + }, + { + "epoch": 0.36, + "grad_norm": 0.451171875, + "learning_rate": 8.09158092053879e-07, + "logits/chosen": -1.3196508884429932, + "logits/rejected": -0.9734094738960266, + "logps/chosen": -220.8783416748047, + "logps/rejected": -209.29800415039062, + "loss": 0.6299, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04809530824422836, + "rewards/margins": 0.12365426868200302, + "rewards/margins_max": 0.19381490349769592, + "rewards/margins_min": 0.05349363014101982, + "rewards/margins_std": 0.09922213107347488, + "rewards/rejected": -0.07555896788835526, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 0.5078125, + "learning_rate": 8.04816873583712e-07, + "logits/chosen": -1.3901035785675049, + "logits/rejected": -1.029416799545288, + "logps/chosen": -232.9812469482422, + "logps/rejected": -244.72299194335938, + "loss": 0.631, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05810508131980896, + "rewards/margins": 0.15548478066921234, + "rewards/margins_max": 0.22354455292224884, + "rewards/margins_min": 0.08742500096559525, + "rewards/margins_std": 0.09625106304883957, + "rewards/rejected": -0.09737969934940338, + "step": 1150 + }, + { + "epoch": 0.37, + "grad_norm": 0.796875, + "learning_rate": 8.004387728162343e-07, + "logits/chosen": -1.3547106981277466, + "logits/rejected": -0.9800487756729126, + "logps/chosen": -220.08432006835938, + "logps/rejected": -366.79425048828125, + "loss": 0.6261, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05310923978686333, + "rewards/margins": 0.16483795642852783, + "rewards/margins_max": 0.24075451493263245, + "rewards/margins_min": 0.0889214277267456, + "rewards/margins_std": 0.10736221075057983, + "rewards/rejected": -0.1117287278175354, + "step": 1160 + }, + { + "epoch": 0.37, + "grad_norm": 0.3515625, + "learning_rate": 7.96024319493819e-07, + "logits/chosen": -1.4385137557983398, + "logits/rejected": -1.0657742023468018, + "logps/chosen": -221.01040649414062, + "logps/rejected": -200.88296508789062, + "loss": 0.6401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.036314692348241806, + "rewards/margins": 0.1054370254278183, + "rewards/margins_max": 0.14237162470817566, + "rewards/margins_min": 0.06850244104862213, + "rewards/margins_std": 0.05223340913653374, + "rewards/rejected": -0.06912233680486679, + "step": 1170 + }, + { + "epoch": 0.37, + "grad_norm": 0.44921875, + "learning_rate": 7.915740477574347e-07, + "logits/chosen": -1.5004260540008545, + "logits/rejected": -1.0796833038330078, + "logps/chosen": -198.33592224121094, + "logps/rejected": -186.93319702148438, + "loss": 0.6383, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.06108757108449936, + "rewards/margins": 0.11955182254314423, + "rewards/margins_max": 0.1842404305934906, + "rewards/margins_min": 0.05486319214105606, + "rewards/margins_std": 0.09148352593183517, + "rewards/rejected": -0.05846424773335457, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 0.359375, + "learning_rate": 7.870884960820129e-07, + "logits/chosen": -1.3537534475326538, + "logits/rejected": -1.0633940696716309, + "logps/chosen": -178.4303436279297, + "logps/rejected": -234.31103515625, + "loss": 0.6258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06088218092918396, + "rewards/margins": 0.12319563329219818, + "rewards/margins_max": 0.18492364883422852, + "rewards/margins_min": 0.061467599123716354, + "rewards/margins_std": 0.08729662001132965, + "rewards/rejected": -0.062313444912433624, + "step": 1190 + }, + { + "epoch": 0.38, + "grad_norm": 0.291015625, + "learning_rate": 7.825682072112959e-07, + "logits/chosen": -1.464988112449646, + "logits/rejected": -0.8578370809555054, + "logps/chosen": -237.8139190673828, + "logps/rejected": -234.4029998779297, + "loss": 0.6403, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04808584973216057, + "rewards/margins": 0.13337358832359314, + "rewards/margins_max": 0.18245692551136017, + "rewards/margins_min": 0.0842902883887291, + "rewards/margins_std": 0.06941428035497665, + "rewards/rejected": -0.08528774976730347, + "step": 1200 + }, + { + "epoch": 0.38, + "grad_norm": 0.384765625, + "learning_rate": 7.780137280921635e-07, + "logits/chosen": -1.4263708591461182, + "logits/rejected": -1.171579122543335, + "logps/chosen": -246.2294464111328, + "logps/rejected": -251.85107421875, + "loss": 0.6281, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.035905174911022186, + "rewards/margins": 0.14339666068553925, + "rewards/margins_max": 0.2217930257320404, + "rewards/margins_min": 0.06500030308961868, + "rewards/margins_std": 0.11086919158697128, + "rewards/rejected": -0.10749147832393646, + "step": 1210 + }, + { + "epoch": 0.38, + "grad_norm": 0.365234375, + "learning_rate": 7.734256098084551e-07, + "logits/chosen": -1.5084072351455688, + "logits/rejected": -1.2369990348815918, + "logps/chosen": -172.4056854248047, + "logps/rejected": -221.0992431640625, + "loss": 0.6373, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04651130363345146, + "rewards/margins": 0.12719354033470154, + "rewards/margins_max": 0.17494919896125793, + "rewards/margins_min": 0.07943786680698395, + "rewards/margins_std": 0.06753672659397125, + "rewards/rejected": -0.08068223297595978, + "step": 1220 + }, + { + "epoch": 0.39, + "grad_norm": 0.3828125, + "learning_rate": 7.688044075142887e-07, + "logits/chosen": -1.452924370765686, + "logits/rejected": -0.9414194822311401, + "logps/chosen": -194.08250427246094, + "logps/rejected": -194.6898651123047, + "loss": 0.6331, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.023848187178373337, + "rewards/margins": 0.1178591400384903, + "rewards/margins_max": 0.18268531560897827, + "rewards/margins_min": 0.053032953292131424, + "rewards/margins_std": 0.09167807549238205, + "rewards/rejected": -0.09401094913482666, + "step": 1230 + }, + { + "epoch": 0.39, + "grad_norm": 0.462890625, + "learning_rate": 7.641506803668887e-07, + "logits/chosen": -1.487422227859497, + "logits/rejected": -1.252956509590149, + "logps/chosen": -224.56399536132812, + "logps/rejected": -222.06387329101562, + "loss": 0.6238, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.031578294932842255, + "rewards/margins": 0.12413300573825836, + "rewards/margins_max": 0.1808335930109024, + "rewards/margins_min": 0.06743242591619492, + "rewards/margins_std": 0.08018673211336136, + "rewards/rejected": -0.09255470335483551, + "step": 1240 + }, + { + "epoch": 0.39, + "grad_norm": 0.3984375, + "learning_rate": 7.594649914589286e-07, + "logits/chosen": -1.4718706607818604, + "logits/rejected": -1.1124763488769531, + "logps/chosen": -196.78677368164062, + "logps/rejected": -192.97157287597656, + "loss": 0.6286, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03744439780712128, + "rewards/margins": 0.0985964983701706, + "rewards/margins_max": 0.15701943635940552, + "rewards/margins_min": 0.04017355293035507, + "rewards/margins_std": 0.08262249827384949, + "rewards/rejected": -0.06115208938717842, + "step": 1250 + }, + { + "epoch": 0.4, + "grad_norm": 0.40234375, + "learning_rate": 7.547479077503976e-07, + "logits/chosen": -1.475367546081543, + "logits/rejected": -1.0469552278518677, + "logps/chosen": -244.2837371826172, + "logps/rejected": -213.0482635498047, + "loss": 0.6273, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04668278247117996, + "rewards/margins": 0.10903759300708771, + "rewards/margins_max": 0.15385808050632477, + "rewards/margins_min": 0.06421708315610886, + "rewards/margins_std": 0.0633857473731041, + "rewards/rejected": -0.06235479563474655, + "step": 1260 + }, + { + "epoch": 0.4, + "grad_norm": 0.47265625, + "learning_rate": 7.5e-07, + "logits/chosen": -1.4103657007217407, + "logits/rejected": -0.9712256193161011, + "logps/chosen": -338.1557922363281, + "logps/rejected": -219.37918090820312, + "loss": 0.6395, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04872440919280052, + "rewards/margins": 0.11783953756093979, + "rewards/margins_max": 0.171439990401268, + "rewards/margins_min": 0.06423909962177277, + "rewards/margins_std": 0.07580247521400452, + "rewards/rejected": -0.06911513209342957, + "step": 1270 + }, + { + "epoch": 0.4, + "grad_norm": 0.4375, + "learning_rate": 7.452218426960939e-07, + "logits/chosen": -1.3297299146652222, + "logits/rejected": -1.0080119371414185, + "logps/chosen": -203.5293731689453, + "logps/rejected": -218.4529571533203, + "loss": 0.6359, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05152253434062004, + "rewards/margins": 0.13503798842430115, + "rewards/margins_max": 0.2129654586315155, + "rewards/margins_min": 0.05711054056882858, + "rewards/margins_std": 0.11020606756210327, + "rewards/rejected": -0.083515465259552, + "step": 1280 + }, + { + "epoch": 0.41, + "grad_norm": 0.431640625, + "learning_rate": 7.404140139871796e-07, + "logits/chosen": -1.4835853576660156, + "logits/rejected": -1.0349833965301514, + "logps/chosen": -226.5528564453125, + "logps/rejected": -228.1876220703125, + "loss": 0.6331, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04554816335439682, + "rewards/margins": 0.12928791344165802, + "rewards/margins_max": 0.17952772974967957, + "rewards/margins_min": 0.07904806733131409, + "rewards/margins_std": 0.07104986160993576, + "rewards/rejected": -0.0837397426366806, + "step": 1290 + }, + { + "epoch": 0.41, + "grad_norm": 0.6640625, + "learning_rate": 7.355770956119443e-07, + "logits/chosen": -1.307871699333191, + "logits/rejected": -1.0118768215179443, + "logps/chosen": -220.9732208251953, + "logps/rejected": -188.6660919189453, + "loss": 0.6339, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.043763868510723114, + "rewards/margins": 0.12307041883468628, + "rewards/margins_max": 0.17446239292621613, + "rewards/margins_min": 0.07167843729257584, + "rewards/margins_std": 0.07267922908067703, + "rewards/rejected": -0.07930653542280197, + "step": 1300 + }, + { + "epoch": 0.41, + "grad_norm": 0.5234375, + "learning_rate": 7.307116728288726e-07, + "logits/chosen": -1.4419422149658203, + "logits/rejected": -1.1987005472183228, + "logps/chosen": -180.5616455078125, + "logps/rejected": -223.50927734375, + "loss": 0.6274, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.056863777339458466, + "rewards/margins": 0.13441364467144012, + "rewards/margins_max": 0.20376619696617126, + "rewards/margins_min": 0.0650610700249672, + "rewards/margins_std": 0.09807933866977692, + "rewards/rejected": -0.07754985243082047, + "step": 1310 + }, + { + "epoch": 0.42, + "grad_norm": 0.462890625, + "learning_rate": 7.258183343454318e-07, + "logits/chosen": -1.4006013870239258, + "logits/rejected": -0.9503320455551147, + "logps/chosen": -235.5669708251953, + "logps/rejected": -239.7013702392578, + "loss": 0.6273, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.047770898789167404, + "rewards/margins": 0.14523038268089294, + "rewards/margins_max": 0.217434361577034, + "rewards/margins_min": 0.07302640378475189, + "rewards/margins_std": 0.10211183875799179, + "rewards/rejected": -0.09745948016643524, + "step": 1320 + }, + { + "epoch": 0.42, + "grad_norm": 0.55078125, + "learning_rate": 7.208976722468391e-07, + "logits/chosen": -1.514934778213501, + "logits/rejected": -1.2314091920852661, + "logps/chosen": -173.3019256591797, + "logps/rejected": -180.0929718017578, + "loss": 0.6342, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.058720219880342484, + "rewards/margins": 0.11582016944885254, + "rewards/margins_max": 0.16832241415977478, + "rewards/margins_min": 0.0633179321885109, + "rewards/margins_std": 0.07424938678741455, + "rewards/rejected": -0.057099949568510056, + "step": 1330 + }, + { + "epoch": 0.42, + "grad_norm": 0.4296875, + "learning_rate": 7.159502819244205e-07, + "logits/chosen": -1.470738410949707, + "logits/rejected": -1.1212527751922607, + "logps/chosen": -232.70425415039062, + "logps/rejected": -220.24850463867188, + "loss": 0.635, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.042904727160930634, + "rewards/margins": 0.1314111053943634, + "rewards/margins_max": 0.20116429030895233, + "rewards/margins_min": 0.061657924205064774, + "rewards/margins_std": 0.0986458957195282, + "rewards/rejected": -0.08850638568401337, + "step": 1340 + }, + { + "epoch": 0.43, + "grad_norm": 0.41015625, + "learning_rate": 7.109767620035688e-07, + "logits/chosen": -1.3368486166000366, + "logits/rejected": -1.093397855758667, + "logps/chosen": -180.45089721679688, + "logps/rejected": -187.77438354492188, + "loss": 0.6422, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.041960351169109344, + "rewards/margins": 0.09945043921470642, + "rewards/margins_max": 0.1653245985507965, + "rewards/margins_min": 0.03357627987861633, + "rewards/margins_std": 0.09316011518239975, + "rewards/rejected": -0.057490088045597076, + "step": 1350 + }, + { + "epoch": 0.43, + "grad_norm": 0.359375, + "learning_rate": 7.059777142713122e-07, + "logits/chosen": -1.273991584777832, + "logits/rejected": -1.0232566595077515, + "logps/chosen": -249.97781372070312, + "logps/rejected": -238.1562957763672, + "loss": 0.6404, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.030306842178106308, + "rewards/margins": 0.09815285354852676, + "rewards/margins_max": 0.15446361899375916, + "rewards/margins_min": 0.04184209182858467, + "rewards/margins_std": 0.07963544130325317, + "rewards/rejected": -0.06784601509571075, + "step": 1360 + }, + { + "epoch": 0.43, + "grad_norm": 0.51953125, + "learning_rate": 7.00953743603498e-07, + "logits/chosen": -1.3965544700622559, + "logits/rejected": -0.992017388343811, + "logps/chosen": -297.09332275390625, + "logps/rejected": -215.32479858398438, + "loss": 0.6317, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04624886438250542, + "rewards/margins": 0.11734838783740997, + "rewards/margins_max": 0.16934475302696228, + "rewards/margins_min": 0.06535204499959946, + "rewards/margins_std": 0.07353393733501434, + "rewards/rejected": -0.07109951227903366, + "step": 1370 + }, + { + "epoch": 0.43, + "grad_norm": 0.376953125, + "learning_rate": 6.959054578916042e-07, + "logits/chosen": -1.3886009454727173, + "logits/rejected": -1.0957224369049072, + "logps/chosen": -183.44839477539062, + "logps/rejected": -231.7744903564453, + "loss": 0.6268, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03902550786733627, + "rewards/margins": 0.14433899521827698, + "rewards/margins_max": 0.2244710922241211, + "rewards/margins_min": 0.06420690566301346, + "rewards/margins_std": 0.11332390457391739, + "rewards/rejected": -0.10531347990036011, + "step": 1380 + }, + { + "epoch": 0.44, + "grad_norm": 0.453125, + "learning_rate": 6.908334679691863e-07, + "logits/chosen": -1.3139946460723877, + "logits/rejected": -1.1197882890701294, + "logps/chosen": -209.2381134033203, + "logps/rejected": -235.80380249023438, + "loss": 0.6347, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.027201693505048752, + "rewards/margins": 0.11521643400192261, + "rewards/margins_max": 0.15470090508460999, + "rewards/margins_min": 0.07573194801807404, + "rewards/margins_std": 0.05583949014544487, + "rewards/rejected": -0.08801472932100296, + "step": 1390 + }, + { + "epoch": 0.44, + "grad_norm": 0.5625, + "learning_rate": 6.857383875379661e-07, + "logits/chosen": -1.2649810314178467, + "logits/rejected": -0.9927312731742859, + "logps/chosen": -219.4835968017578, + "logps/rejected": -233.77133178710938, + "loss": 0.6318, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0319049134850502, + "rewards/margins": 0.11133754253387451, + "rewards/margins_max": 0.15614716708660126, + "rewards/margins_min": 0.06652794033288956, + "rewards/margins_std": 0.06337036192417145, + "rewards/rejected": -0.07943262904882431, + "step": 1400 + }, + { + "epoch": 0.44, + "grad_norm": 0.609375, + "learning_rate": 6.806208330935766e-07, + "logits/chosen": -1.4187123775482178, + "logits/rejected": -1.213805913925171, + "logps/chosen": -275.88079833984375, + "logps/rejected": -248.2821502685547, + "loss": 0.6253, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04402131587266922, + "rewards/margins": 0.12928104400634766, + "rewards/margins_max": 0.21939381957054138, + "rewards/margins_min": 0.03916824236512184, + "rewards/margins_std": 0.1274387240409851, + "rewards/rejected": -0.08525971323251724, + "step": 1410 + }, + { + "epoch": 0.45, + "grad_norm": 0.53125, + "learning_rate": 6.754814238509652e-07, + "logits/chosen": -1.3234349489212036, + "logits/rejected": -0.9899675250053406, + "logps/chosen": -261.2273254394531, + "logps/rejected": -228.1154327392578, + "loss": 0.6347, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03984065353870392, + "rewards/margins": 0.15004102885723114, + "rewards/margins_max": 0.22247202694416046, + "rewards/margins_min": 0.07761004567146301, + "rewards/margins_std": 0.10243289172649384, + "rewards/rejected": -0.11020038276910782, + "step": 1420 + }, + { + "epoch": 0.45, + "grad_norm": 0.61328125, + "learning_rate": 6.703207816694718e-07, + "logits/chosen": -1.4122707843780518, + "logits/rejected": -1.0776605606079102, + "logps/chosen": -243.84994506835938, + "logps/rejected": -214.42257690429688, + "loss": 0.6313, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.028529832139611244, + "rewards/margins": 0.09134817868471146, + "rewards/margins_max": 0.12654173374176025, + "rewards/margins_min": 0.05615462735295296, + "rewards/margins_std": 0.049771200865507126, + "rewards/rejected": -0.06281835585832596, + "step": 1430 + }, + { + "epoch": 0.45, + "grad_norm": 0.48828125, + "learning_rate": 6.651395309775836e-07, + "logits/chosen": -1.481340765953064, + "logits/rejected": -1.1970375776290894, + "logps/chosen": -167.13339233398438, + "logps/rejected": -208.0955810546875, + "loss": 0.63, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05509837716817856, + "rewards/margins": 0.1572759747505188, + "rewards/margins_max": 0.2205711305141449, + "rewards/margins_min": 0.09398078173398972, + "rewards/margins_std": 0.0895129069685936, + "rewards/rejected": -0.10217758268117905, + "step": 1440 + }, + { + "epoch": 0.46, + "grad_norm": 0.45703125, + "learning_rate": 6.599382986973807e-07, + "logits/chosen": -1.4374146461486816, + "logits/rejected": -0.964760422706604, + "logps/chosen": -208.99783325195312, + "logps/rejected": -257.08526611328125, + "loss": 0.6281, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.07429121434688568, + "rewards/margins": 0.14184515178203583, + "rewards/margins_max": 0.22351637482643127, + "rewards/margins_min": 0.06017392873764038, + "rewards/margins_std": 0.1155005469918251, + "rewards/rejected": -0.06755392998456955, + "step": 1450 + }, + { + "epoch": 0.46, + "grad_norm": 0.41796875, + "learning_rate": 6.547177141686798e-07, + "logits/chosen": -1.3717443943023682, + "logits/rejected": -0.8923895955085754, + "logps/chosen": -218.4834442138672, + "logps/rejected": -211.63876342773438, + "loss": 0.6241, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05127422884106636, + "rewards/margins": 0.14416508376598358, + "rewards/margins_max": 0.21262860298156738, + "rewards/margins_min": 0.07570157200098038, + "rewards/margins_std": 0.09682202339172363, + "rewards/rejected": -0.09289085119962692, + "step": 1460 + }, + { + "epoch": 0.46, + "grad_norm": 0.6015625, + "learning_rate": 6.494784090728851e-07, + "logits/chosen": -1.381131887435913, + "logits/rejected": -1.1379430294036865, + "logps/chosen": -200.09182739257812, + "logps/rejected": -255.30020141601562, + "loss": 0.6205, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0368528813123703, + "rewards/margins": 0.14170141518115997, + "rewards/margins_max": 0.2006821632385254, + "rewards/margins_min": 0.08272063732147217, + "rewards/margins_std": 0.08341138064861298, + "rewards/rejected": -0.10484850406646729, + "step": 1470 + }, + { + "epoch": 0.47, + "grad_norm": 0.43359375, + "learning_rate": 6.442210173565561e-07, + "logits/chosen": -1.4332636594772339, + "logits/rejected": -1.0423600673675537, + "logps/chosen": -216.1157684326172, + "logps/rejected": -278.70843505859375, + "loss": 0.6277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.051350273191928864, + "rewards/margins": 0.15817813575267792, + "rewards/margins_max": 0.2108217179775238, + "rewards/margins_min": 0.10553457587957382, + "rewards/margins_std": 0.07444924116134644, + "rewards/rejected": -0.10682785511016846, + "step": 1480 + }, + { + "epoch": 0.47, + "grad_norm": 0.380859375, + "learning_rate": 6.389461751547008e-07, + "logits/chosen": -1.3558521270751953, + "logits/rejected": -1.085033655166626, + "logps/chosen": -225.6574249267578, + "logps/rejected": -217.3332977294922, + "loss": 0.6273, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03980935364961624, + "rewards/margins": 0.11275990307331085, + "rewards/margins_max": 0.16510936617851257, + "rewards/margins_min": 0.06041043996810913, + "rewards/margins_std": 0.07403331995010376, + "rewards/rejected": -0.07295055687427521, + "step": 1490 + }, + { + "epoch": 0.47, + "grad_norm": 0.4765625, + "learning_rate": 6.33654520713805e-07, + "logits/chosen": -1.426582932472229, + "logits/rejected": -0.8861996531486511, + "logps/chosen": -253.96957397460938, + "logps/rejected": -198.83224487304688, + "loss": 0.6299, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04899178445339203, + "rewards/margins": 0.14212216436862946, + "rewards/margins_max": 0.2016027718782425, + "rewards/margins_min": 0.08264155685901642, + "rewards/margins_std": 0.08411829173564911, + "rewards/rejected": -0.09313038736581802, + "step": 1500 + }, + { + "epoch": 0.48, + "grad_norm": 0.44140625, + "learning_rate": 6.283466943146051e-07, + "logits/chosen": -1.3297570943832397, + "logits/rejected": -1.0105302333831787, + "logps/chosen": -273.25579833984375, + "logps/rejected": -214.6565399169922, + "loss": 0.6199, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04776372015476227, + "rewards/margins": 0.12771016359329224, + "rewards/margins_max": 0.1960275024175644, + "rewards/margins_min": 0.05939285084605217, + "rewards/margins_std": 0.0966152772307396, + "rewards/rejected": -0.07994645088911057, + "step": 1510 + }, + { + "epoch": 0.48, + "grad_norm": 0.345703125, + "learning_rate": 6.230233381946162e-07, + "logits/chosen": -1.2974770069122314, + "logits/rejected": -1.0966564416885376, + "logps/chosen": -169.97372436523438, + "logps/rejected": -194.61965942382812, + "loss": 0.6351, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03883309289813042, + "rewards/margins": 0.130173459649086, + "rewards/margins_max": 0.20102062821388245, + "rewards/margins_min": 0.05932629853487015, + "rewards/margins_std": 0.10019302368164062, + "rewards/rejected": -0.09134037047624588, + "step": 1520 + }, + { + "epoch": 0.48, + "grad_norm": 0.4765625, + "learning_rate": 6.176850964704212e-07, + "logits/chosen": -1.3526862859725952, + "logits/rejected": -0.9741379618644714, + "logps/chosen": -177.74484252929688, + "logps/rejected": -170.9863739013672, + "loss": 0.6383, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03418565168976784, + "rewards/margins": 0.13273611664772034, + "rewards/margins_max": 0.19749236106872559, + "rewards/margins_min": 0.06797986477613449, + "rewards/margins_std": 0.09157915413379669, + "rewards/rejected": -0.0985504761338234, + "step": 1530 + }, + { + "epoch": 0.49, + "grad_norm": 0.62109375, + "learning_rate": 6.12332615059735e-07, + "logits/chosen": -1.5363820791244507, + "logits/rejected": -1.1499285697937012, + "logps/chosen": -249.35995483398438, + "logps/rejected": -278.024169921875, + "loss": 0.6262, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05033854395151138, + "rewards/margins": 0.12232635915279388, + "rewards/margins_max": 0.1788022816181183, + "rewards/margins_min": 0.0658503919839859, + "rewards/margins_std": 0.07986906915903091, + "rewards/rejected": -0.07198779284954071, + "step": 1540 + }, + { + "epoch": 0.49, + "grad_norm": 0.48046875, + "learning_rate": 6.069665416032486e-07, + "logits/chosen": -1.5334231853485107, + "logits/rejected": -1.0874627828598022, + "logps/chosen": -260.1651916503906, + "logps/rejected": -216.29342651367188, + "loss": 0.6185, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04319532588124275, + "rewards/margins": 0.11832845211029053, + "rewards/margins_max": 0.17873892188072205, + "rewards/margins_min": 0.05791795998811722, + "rewards/margins_std": 0.08543331921100616, + "rewards/rejected": -0.07513312250375748, + "step": 1550 + }, + { + "epoch": 0.49, + "grad_norm": 0.462890625, + "learning_rate": 6.015875253862671e-07, + "logits/chosen": -1.4311401844024658, + "logits/rejected": -1.0909273624420166, + "logps/chosen": -206.8611297607422, + "logps/rejected": -273.14886474609375, + "loss": 0.6263, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.040190037339925766, + "rewards/margins": 0.12395022064447403, + "rewards/margins_max": 0.1803322434425354, + "rewards/margins_min": 0.06756818294525146, + "rewards/margins_std": 0.07973622530698776, + "rewards/rejected": -0.08376017212867737, + "step": 1560 + }, + { + "epoch": 0.49, + "grad_norm": 0.353515625, + "learning_rate": 5.961962172601457e-07, + "logits/chosen": -1.459913730621338, + "logits/rejected": -1.0906808376312256, + "logps/chosen": -199.64151000976562, + "logps/rejected": -214.5613555908203, + "loss": 0.6352, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05259844660758972, + "rewards/margins": 0.11418597400188446, + "rewards/margins_max": 0.16717670857906342, + "rewards/margins_min": 0.061195243149995804, + "rewards/margins_std": 0.07494021207094193, + "rewards/rejected": -0.06158752366900444, + "step": 1570 + }, + { + "epoch": 0.5, + "grad_norm": 0.62890625, + "learning_rate": 5.907932695635389e-07, + "logits/chosen": -1.2819427251815796, + "logits/rejected": -1.1336212158203125, + "logps/chosen": -206.28005981445312, + "logps/rejected": -259.9476013183594, + "loss": 0.6302, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.026775460690259933, + "rewards/margins": 0.14914140105247498, + "rewards/margins_max": 0.24029541015625, + "rewards/margins_min": 0.057987384498119354, + "rewards/margins_std": 0.12891125679016113, + "rewards/rejected": -0.12236593663692474, + "step": 1580 + }, + { + "epoch": 0.5, + "grad_norm": 0.515625, + "learning_rate": 5.853793360434687e-07, + "logits/chosen": -1.4024218320846558, + "logits/rejected": -0.9892303347587585, + "logps/chosen": -269.4578857421875, + "logps/rejected": -227.0701141357422, + "loss": 0.6304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.046691518276929855, + "rewards/margins": 0.12285880744457245, + "rewards/margins_max": 0.18432477116584778, + "rewards/margins_min": 0.06139283627271652, + "rewards/margins_std": 0.08692601323127747, + "rewards/rejected": -0.0761672854423523, + "step": 1590 + }, + { + "epoch": 0.5, + "grad_norm": 0.52734375, + "learning_rate": 5.79955071776222e-07, + "logits/chosen": -1.3737763166427612, + "logits/rejected": -1.0089573860168457, + "logps/chosen": -176.09317016601562, + "logps/rejected": -192.53468322753906, + "loss": 0.6285, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.042514294385910034, + "rewards/margins": 0.17080818116664886, + "rewards/margins_max": 0.24430468678474426, + "rewards/margins_min": 0.09731169044971466, + "rewards/margins_std": 0.1039397269487381, + "rewards/rejected": -0.12829387187957764, + "step": 1600 + }, + { + "epoch": 0.51, + "grad_norm": 0.412109375, + "learning_rate": 5.745211330880872e-07, + "logits/chosen": -1.3686656951904297, + "logits/rejected": -1.1941941976547241, + "logps/chosen": -168.5927734375, + "logps/rejected": -232.0594024658203, + "loss": 0.6176, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04633691906929016, + "rewards/margins": 0.15676456689834595, + "rewards/margins_max": 0.2298036366701126, + "rewards/margins_min": 0.08372551202774048, + "rewards/margins_std": 0.10329282283782959, + "rewards/rejected": -0.11042765527963638, + "step": 1610 + }, + { + "epoch": 0.51, + "grad_norm": 0.408203125, + "learning_rate": 5.690781774759412e-07, + "logits/chosen": -1.4598623514175415, + "logits/rejected": -1.1373037099838257, + "logps/chosen": -224.392333984375, + "logps/rejected": -206.09756469726562, + "loss": 0.6346, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04373541846871376, + "rewards/margins": 0.12310346215963364, + "rewards/margins_max": 0.1717989593744278, + "rewards/margins_min": 0.07440796494483948, + "rewards/margins_std": 0.0688658356666565, + "rewards/rejected": -0.07936803251504898, + "step": 1620 + }, + { + "epoch": 0.51, + "grad_norm": 0.58203125, + "learning_rate": 5.636268635276917e-07, + "logits/chosen": -1.237717866897583, + "logits/rejected": -1.0289928913116455, + "logps/chosen": -194.26791381835938, + "logps/rejected": -347.3554382324219, + "loss": 0.6174, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02769922837615013, + "rewards/margins": 0.17334696650505066, + "rewards/margins_max": 0.23788447678089142, + "rewards/margins_min": 0.1088094711303711, + "rewards/margins_std": 0.09126981347799301, + "rewards/rejected": -0.14564773440361023, + "step": 1630 + }, + { + "epoch": 0.52, + "grad_norm": 0.455078125, + "learning_rate": 5.581678508425907e-07, + "logits/chosen": -1.5093035697937012, + "logits/rejected": -1.1646114587783813, + "logps/chosen": -225.5840606689453, + "logps/rejected": -273.4601135253906, + "loss": 0.625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.044809166342020035, + "rewards/margins": 0.16066548228263855, + "rewards/margins_max": 0.22280371189117432, + "rewards/margins_min": 0.09852725267410278, + "rewards/margins_std": 0.08787672221660614, + "rewards/rejected": -0.11585632711648941, + "step": 1640 + }, + { + "epoch": 0.52, + "grad_norm": 0.443359375, + "learning_rate": 5.527017999514238e-07, + "logits/chosen": -1.434874176979065, + "logits/rejected": -1.2691529989242554, + "logps/chosen": -214.0901336669922, + "logps/rejected": -304.82098388671875, + "loss": 0.6337, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05037333816289902, + "rewards/margins": 0.13265621662139893, + "rewards/margins_max": 0.1799599826335907, + "rewards/margins_min": 0.08535243570804596, + "rewards/margins_std": 0.06689763814210892, + "rewards/rejected": -0.0822828859090805, + "step": 1650 + }, + { + "epoch": 0.52, + "grad_norm": 0.439453125, + "learning_rate": 5.472293722365865e-07, + "logits/chosen": -1.5142433643341064, + "logits/rejected": -1.1502716541290283, + "logps/chosen": -199.11727905273438, + "logps/rejected": -236.5269317626953, + "loss": 0.6188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04829222336411476, + "rewards/margins": 0.15703055262565613, + "rewards/margins_max": 0.24611596763134003, + "rewards/margins_min": 0.06794511526823044, + "rewards/margins_std": 0.12598583102226257, + "rewards/rejected": -0.10873832553625107, + "step": 1660 + }, + { + "epoch": 0.53, + "grad_norm": 0.30859375, + "learning_rate": 5.417512298520584e-07, + "logits/chosen": -1.43448007106781, + "logits/rejected": -1.0308877229690552, + "logps/chosen": -233.7875518798828, + "logps/rejected": -209.86734008789062, + "loss": 0.6302, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04728182405233383, + "rewards/margins": 0.1253185272216797, + "rewards/margins_max": 0.18824602663516998, + "rewards/margins_min": 0.062391042709350586, + "rewards/margins_std": 0.08899290859699249, + "rewards/rejected": -0.07803670316934586, + "step": 1670 + }, + { + "epoch": 0.53, + "grad_norm": 0.65234375, + "learning_rate": 5.362680356432846e-07, + "logits/chosen": -1.6510565280914307, + "logits/rejected": -1.247899055480957, + "logps/chosen": -236.86453247070312, + "logps/rejected": -262.64324951171875, + "loss": 0.6231, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0422038659453392, + "rewards/margins": 0.1669834703207016, + "rewards/margins_max": 0.2213924676179886, + "rewards/margins_min": 0.11257448047399521, + "rewards/margins_std": 0.07694593816995621, + "rewards/rejected": -0.1247796043753624, + "step": 1680 + }, + { + "epoch": 0.53, + "grad_norm": 0.5546875, + "learning_rate": 5.307804530669715e-07, + "logits/chosen": -1.3352489471435547, + "logits/rejected": -1.0017801523208618, + "logps/chosen": -203.45884704589844, + "logps/rejected": -276.4349060058594, + "loss": 0.6183, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04065268859267235, + "rewards/margins": 0.1817699670791626, + "rewards/margins_max": 0.28018659353256226, + "rewards/margins_min": 0.08335334062576294, + "rewards/margins_std": 0.13918212056159973, + "rewards/rejected": -0.14111728966236115, + "step": 1690 + }, + { + "epoch": 0.54, + "grad_norm": 0.416015625, + "learning_rate": 5.2528914611081e-07, + "logits/chosen": -1.5503590106964111, + "logits/rejected": -1.1069653034210205, + "logps/chosen": -192.2991943359375, + "logps/rejected": -210.0384979248047, + "loss": 0.6254, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05085741728544235, + "rewards/margins": 0.17244575917720795, + "rewards/margins_max": 0.2783913016319275, + "rewards/margins_min": 0.06650026142597198, + "rewards/margins_std": 0.14982958137989044, + "rewards/rejected": -0.12158836424350739, + "step": 1700 + }, + { + "epoch": 0.54, + "grad_norm": 0.515625, + "learning_rate": 5.197947792131348e-07, + "logits/chosen": -1.1360037326812744, + "logits/rejected": -0.9705570936203003, + "logps/chosen": -245.7555694580078, + "logps/rejected": -288.6032409667969, + "loss": 0.6258, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0327492281794548, + "rewards/margins": 0.16705994307994843, + "rewards/margins_max": 0.22337321937084198, + "rewards/margins_min": 0.11074666678905487, + "rewards/margins_std": 0.0796389952301979, + "rewards/rejected": -0.13431070744991302, + "step": 1710 + }, + { + "epoch": 0.54, + "grad_norm": 0.345703125, + "learning_rate": 5.142980171825276e-07, + "logits/chosen": -1.3152287006378174, + "logits/rejected": -0.9166573286056519, + "logps/chosen": -222.42495727539062, + "logps/rejected": -229.1804962158203, + "loss": 0.6292, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03760639578104019, + "rewards/margins": 0.13950268924236298, + "rewards/margins_max": 0.1909518986940384, + "rewards/margins_min": 0.08805350959300995, + "rewards/margins_std": 0.07276014238595963, + "rewards/rejected": -0.10189630836248398, + "step": 1720 + }, + { + "epoch": 0.54, + "grad_norm": 0.60546875, + "learning_rate": 5.087995251173769e-07, + "logits/chosen": -1.3346498012542725, + "logits/rejected": -0.980597198009491, + "logps/chosen": -201.7723388671875, + "logps/rejected": -276.3261413574219, + "loss": 0.6213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.025363069027662277, + "rewards/margins": 0.17956769466400146, + "rewards/margins_max": 0.2888151705265045, + "rewards/margins_min": 0.07032018154859543, + "rewards/margins_std": 0.15449929237365723, + "rewards/rejected": -0.1542046070098877, + "step": 1730 + }, + { + "epoch": 0.55, + "grad_norm": 0.490234375, + "learning_rate": 5.032999683254027e-07, + "logits/chosen": -1.4949066638946533, + "logits/rejected": -1.2108285427093506, + "logps/chosen": -225.4260711669922, + "logps/rejected": -247.3677215576172, + "loss": 0.6252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.027850795537233353, + "rewards/margins": 0.1680610179901123, + "rewards/margins_max": 0.2702367603778839, + "rewards/margins_min": 0.06588525325059891, + "rewards/margins_std": 0.14449834823608398, + "rewards/rejected": -0.14021022617816925, + "step": 1740 + }, + { + "epoch": 0.55, + "grad_norm": 0.4609375, + "learning_rate": 4.97800012243155e-07, + "logits/chosen": -1.400756597518921, + "logits/rejected": -1.0894193649291992, + "logps/chosen": -211.5016326904297, + "logps/rejected": -224.2008056640625, + "loss": 0.6291, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.043883297592401505, + "rewards/margins": 0.13359126448631287, + "rewards/margins_max": 0.20171597599983215, + "rewards/margins_min": 0.06546656787395477, + "rewards/margins_std": 0.09634287655353546, + "rewards/rejected": -0.08970797061920166, + "step": 1750 + }, + { + "epoch": 0.55, + "grad_norm": 0.39453125, + "learning_rate": 4.923003223554966e-07, + "logits/chosen": -1.346853494644165, + "logits/rejected": -1.0727789402008057, + "logps/chosen": -197.31280517578125, + "logps/rejected": -219.85580444335938, + "loss": 0.6402, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03675428405404091, + "rewards/margins": 0.10769043117761612, + "rewards/margins_max": 0.15701591968536377, + "rewards/margins_min": 0.05836494639515877, + "rewards/margins_std": 0.06975677609443665, + "rewards/rejected": -0.0709361582994461, + "step": 1760 + }, + { + "epoch": 0.56, + "grad_norm": 0.359375, + "learning_rate": 4.868015641150819e-07, + "logits/chosen": -1.489463448524475, + "logits/rejected": -1.0774259567260742, + "logps/chosen": -228.04257202148438, + "logps/rejected": -254.78701782226562, + "loss": 0.6211, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03930111974477768, + "rewards/margins": 0.162466898560524, + "rewards/margins_max": 0.2493637055158615, + "rewards/margins_min": 0.07557009160518646, + "rewards/margins_std": 0.12289062887430191, + "rewards/rejected": -0.1231658011674881, + "step": 1770 + }, + { + "epoch": 0.56, + "grad_norm": 0.427734375, + "learning_rate": 4.813044028618372e-07, + "logits/chosen": -1.3595679998397827, + "logits/rejected": -1.0415996313095093, + "logps/chosen": -217.2411651611328, + "logps/rejected": -226.8505401611328, + "loss": 0.6318, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03175649791955948, + "rewards/margins": 0.13404306769371033, + "rewards/margins_max": 0.1951790750026703, + "rewards/margins_min": 0.07290709763765335, + "rewards/margins_std": 0.08645935356616974, + "rewards/rejected": -0.10228659212589264, + "step": 1780 + }, + { + "epoch": 0.56, + "grad_norm": 0.443359375, + "learning_rate": 4.7580950374245664e-07, + "logits/chosen": -1.3718421459197998, + "logits/rejected": -1.0821744203567505, + "logps/chosen": -202.16091918945312, + "logps/rejected": -192.60728454589844, + "loss": 0.6311, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.041288070380687714, + "rewards/margins": 0.14421400427818298, + "rewards/margins_max": 0.2147628366947174, + "rewards/margins_min": 0.07366515696048737, + "rewards/margins_std": 0.09977111220359802, + "rewards/rejected": -0.10292591899633408, + "step": 1790 + }, + { + "epoch": 0.57, + "grad_norm": 0.44140625, + "learning_rate": 4.703175316299196e-07, + "logits/chosen": -1.4759793281555176, + "logits/rejected": -0.9586470723152161, + "logps/chosen": -306.47882080078125, + "logps/rejected": -301.02825927734375, + "loss": 0.6237, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.050575144588947296, + "rewards/margins": 0.13904833793640137, + "rewards/margins_max": 0.204876109957695, + "rewards/margins_min": 0.07322058826684952, + "rewards/margins_std": 0.09309452027082443, + "rewards/rejected": -0.08847320824861526, + "step": 1800 + }, + { + "epoch": 0.57, + "grad_norm": 0.421875, + "learning_rate": 4.6482915104304373e-07, + "logits/chosen": -1.3048204183578491, + "logits/rejected": -0.8675423860549927, + "logps/chosen": -255.3675079345703, + "logps/rejected": -200.3520050048828, + "loss": 0.6279, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04115379601716995, + "rewards/margins": 0.16566821932792664, + "rewards/margins_max": 0.2521827518939972, + "rewards/margins_min": 0.07915371656417847, + "rewards/margins_std": 0.12234999984502792, + "rewards/rejected": -0.12451444566249847, + "step": 1810 + }, + { + "epoch": 0.57, + "grad_norm": 0.50390625, + "learning_rate": 4.593450260660775e-07, + "logits/chosen": -1.3136993646621704, + "logits/rejected": -1.0857809782028198, + "logps/chosen": -173.83749389648438, + "logps/rejected": -204.83389282226562, + "loss": 0.6326, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.031986989080905914, + "rewards/margins": 0.11422686278820038, + "rewards/margins_max": 0.1664976328611374, + "rewards/margins_min": 0.061956118792295456, + "rewards/margins_std": 0.07392201572656631, + "rewards/rejected": -0.08223988860845566, + "step": 1820 + }, + { + "epoch": 0.58, + "grad_norm": 0.30859375, + "learning_rate": 4.5386582026834904e-07, + "logits/chosen": -1.407066822052002, + "logits/rejected": -1.1559934616088867, + "logps/chosen": -194.22482299804688, + "logps/rejected": -226.273193359375, + "loss": 0.6325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.06259353458881378, + "rewards/margins": 0.1343492567539215, + "rewards/margins_max": 0.2052960842847824, + "rewards/margins_min": 0.06340241432189941, + "rewards/margins_std": 0.10033398866653442, + "rewards/rejected": -0.07175572961568832, + "step": 1830 + }, + { + "epoch": 0.58, + "grad_norm": 0.49609375, + "learning_rate": 4.483921966239739e-07, + "logits/chosen": -1.3300710916519165, + "logits/rejected": -1.107683539390564, + "logps/chosen": -202.5210723876953, + "logps/rejected": -293.78265380859375, + "loss": 0.6212, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0330859050154686, + "rewards/margins": 0.18205811083316803, + "rewards/margins_max": 0.25554248690605164, + "rewards/margins_min": 0.10857371240854263, + "rewards/margins_std": 0.10392262041568756, + "rewards/rejected": -0.14897218346595764, + "step": 1840 + }, + { + "epoch": 0.58, + "grad_norm": 0.55078125, + "learning_rate": 4.429248174316375e-07, + "logits/chosen": -1.50588059425354, + "logits/rejected": -1.1238529682159424, + "logps/chosen": -256.1008605957031, + "logps/rejected": -309.0824279785156, + "loss": 0.6336, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0622735433280468, + "rewards/margins": 0.16536816954612732, + "rewards/margins_max": 0.22724337875843048, + "rewards/margins_min": 0.10349295288324356, + "rewards/margins_std": 0.08750475943088531, + "rewards/rejected": -0.10309461504220963, + "step": 1850 + }, + { + "epoch": 0.59, + "grad_norm": 0.46484375, + "learning_rate": 4.374643442344576e-07, + "logits/chosen": -1.2853957414627075, + "logits/rejected": -0.9773873090744019, + "logps/chosen": -202.54776000976562, + "logps/rejected": -197.9080352783203, + "loss": 0.6202, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03965754434466362, + "rewards/margins": 0.17071697115898132, + "rewards/margins_max": 0.264207661151886, + "rewards/margins_min": 0.07722628116607666, + "rewards/margins_std": 0.13221579790115356, + "rewards/rejected": -0.1310594230890274, + "step": 1860 + }, + { + "epoch": 0.59, + "grad_norm": 0.333984375, + "learning_rate": 4.3201143773993864e-07, + "logits/chosen": -1.4333173036575317, + "logits/rejected": -0.9873960614204407, + "logps/chosen": -254.46841430664062, + "logps/rejected": -246.49880981445312, + "loss": 0.6263, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.041944604367017746, + "rewards/margins": 0.1601918637752533, + "rewards/margins_max": 0.23515033721923828, + "rewards/margins_min": 0.08523334562778473, + "rewards/margins_std": 0.10600732266902924, + "rewards/rejected": -0.11824724823236465, + "step": 1870 + }, + { + "epoch": 0.59, + "grad_norm": 0.373046875, + "learning_rate": 4.2656675774002773e-07, + "logits/chosen": -1.3904842138290405, + "logits/rejected": -0.9373539686203003, + "logps/chosen": -249.3865203857422, + "logps/rejected": -224.6757354736328, + "loss": 0.6203, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04652171581983566, + "rewards/margins": 0.18809521198272705, + "rewards/margins_max": 0.27226054668426514, + "rewards/margins_min": 0.10392986238002777, + "rewards/margins_std": 0.1190277710556984, + "rewards/rejected": -0.1415734887123108, + "step": 1880 + }, + { + "epoch": 0.6, + "grad_norm": 0.66796875, + "learning_rate": 4.211309630312812e-07, + "logits/chosen": -1.2958967685699463, + "logits/rejected": -0.9569181203842163, + "logps/chosen": -210.689453125, + "logps/rejected": -248.34646606445312, + "loss": 0.6234, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.046597760170698166, + "rewards/margins": 0.1351958066225052, + "rewards/margins_max": 0.1851089894771576, + "rewards/margins_min": 0.08528260141611099, + "rewards/margins_std": 0.07058792561292648, + "rewards/rejected": -0.08859803527593613, + "step": 1890 + }, + { + "epoch": 0.6, + "grad_norm": 0.5078125, + "learning_rate": 4.1570471133515033e-07, + "logits/chosen": -1.539520025253296, + "logits/rejected": -1.1000487804412842, + "logps/chosen": -274.437744140625, + "logps/rejected": -278.19781494140625, + "loss": 0.626, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.029712975025177002, + "rewards/margins": 0.13053396344184875, + "rewards/margins_max": 0.2025536596775055, + "rewards/margins_min": 0.058514248579740524, + "rewards/margins_std": 0.10185122489929199, + "rewards/rejected": -0.10082097351551056, + "step": 1900 + }, + { + "epoch": 0.6, + "grad_norm": 0.46484375, + "learning_rate": 4.102886592183995e-07, + "logits/chosen": -1.347434401512146, + "logits/rejected": -0.91209876537323, + "logps/chosen": -275.15252685546875, + "logps/rejected": -234.12081909179688, + "loss": 0.6245, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.043122995644807816, + "rewards/margins": 0.12455437332391739, + "rewards/margins_max": 0.19914036989212036, + "rewards/margins_min": 0.049968358129262924, + "rewards/margins_std": 0.10548055171966553, + "rewards/rejected": -0.08143137395381927, + "step": 1910 + }, + { + "epoch": 0.6, + "grad_norm": 0.40234375, + "learning_rate": 4.048834620136618e-07, + "logits/chosen": -1.3737701177597046, + "logits/rejected": -1.062455415725708, + "logps/chosen": -242.2274169921875, + "logps/rejected": -230.4326629638672, + "loss": 0.6262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04452799633145332, + "rewards/margins": 0.16601449251174927, + "rewards/margins_max": 0.25104230642318726, + "rewards/margins_min": 0.08098666369915009, + "rewards/margins_std": 0.1202474981546402, + "rewards/rejected": -0.12148649990558624, + "step": 1920 + }, + { + "epoch": 0.61, + "grad_norm": 0.427734375, + "learning_rate": 3.9948977374014545e-07, + "logits/chosen": -1.3358091115951538, + "logits/rejected": -0.9706098437309265, + "logps/chosen": -188.8526153564453, + "logps/rejected": -214.7004852294922, + "loss": 0.6333, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04406962916254997, + "rewards/margins": 0.1181538924574852, + "rewards/margins_max": 0.17844833433628082, + "rewards/margins_min": 0.057859472930431366, + "rewards/margins_std": 0.08526919782161713, + "rewards/rejected": -0.07408426702022552, + "step": 1930 + }, + { + "epoch": 0.61, + "grad_norm": 0.349609375, + "learning_rate": 3.941082470244987e-07, + "logits/chosen": -1.3635414838790894, + "logits/rejected": -1.0531474351882935, + "logps/chosen": -260.57159423828125, + "logps/rejected": -218.0052032470703, + "loss": 0.6226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05570930987596512, + "rewards/margins": 0.17012056708335876, + "rewards/margins_max": 0.2493591010570526, + "rewards/margins_min": 0.09088209271430969, + "rewards/margins_std": 0.11206014454364777, + "rewards/rejected": -0.11441127955913544, + "step": 1940 + }, + { + "epoch": 0.61, + "grad_norm": 0.5859375, + "learning_rate": 3.8873953302184283e-07, + "logits/chosen": -1.468860387802124, + "logits/rejected": -1.0624154806137085, + "logps/chosen": -321.9212951660156, + "logps/rejected": -267.7339782714844, + "loss": 0.617, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0559014193713665, + "rewards/margins": 0.16720013320446014, + "rewards/margins_max": 0.22909954190254211, + "rewards/margins_min": 0.10530078411102295, + "rewards/margins_std": 0.08753892034292221, + "rewards/rejected": -0.11129872500896454, + "step": 1950 + }, + { + "epoch": 0.62, + "grad_norm": 0.41796875, + "learning_rate": 3.8338428133698396e-07, + "logits/chosen": -1.351555585861206, + "logits/rejected": -1.0230903625488281, + "logps/chosen": -206.9521026611328, + "logps/rejected": -223.57754516601562, + "loss": 0.6306, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03831537812948227, + "rewards/margins": 0.11696416139602661, + "rewards/margins_max": 0.1850701868534088, + "rewards/margins_min": 0.04885811731219292, + "rewards/margins_std": 0.09631648659706116, + "rewards/rejected": -0.07864876836538315, + "step": 1960 + }, + { + "epoch": 0.62, + "grad_norm": 0.392578125, + "learning_rate": 3.780431399458114e-07, + "logits/chosen": -1.3833266496658325, + "logits/rejected": -0.972434401512146, + "logps/chosen": -220.4893035888672, + "logps/rejected": -226.40487670898438, + "loss": 0.6258, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03797771409153938, + "rewards/margins": 0.14182588458061218, + "rewards/margins_max": 0.19501110911369324, + "rewards/margins_min": 0.08864064514636993, + "rewards/margins_std": 0.07521527260541916, + "rewards/rejected": -0.1038481742143631, + "step": 1970 + }, + { + "epoch": 0.62, + "grad_norm": 0.419921875, + "learning_rate": 3.7271675511689473e-07, + "logits/chosen": -1.570255160331726, + "logits/rejected": -1.1389049291610718, + "logps/chosen": -190.87924194335938, + "logps/rejected": -196.2376251220703, + "loss": 0.6235, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05351024121046066, + "rewards/margins": 0.14232777059078217, + "rewards/margins_max": 0.20520441234111786, + "rewards/margins_min": 0.07945115119218826, + "rewards/margins_std": 0.08892098814249039, + "rewards/rejected": -0.0888175368309021, + "step": 1980 + }, + { + "epoch": 0.63, + "grad_norm": 0.3671875, + "learning_rate": 3.674057713332852e-07, + "logits/chosen": -1.418208122253418, + "logits/rejected": -1.0676791667938232, + "logps/chosen": -222.4821014404297, + "logps/rejected": -205.3859405517578, + "loss": 0.6245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04518379643559456, + "rewards/margins": 0.15014731884002686, + "rewards/margins_max": 0.23736615478992462, + "rewards/margins_min": 0.06292847543954849, + "rewards/margins_std": 0.12334605306386948, + "rewards/rejected": -0.104963518679142, + "step": 1990 + }, + { + "epoch": 0.63, + "grad_norm": 0.408203125, + "learning_rate": 3.6211083121453566e-07, + "logits/chosen": -1.5330979824066162, + "logits/rejected": -1.023696780204773, + "logps/chosen": -215.0279541015625, + "logps/rejected": -207.7816162109375, + "loss": 0.6251, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.048994213342666626, + "rewards/margins": 0.15364879369735718, + "rewards/margins_max": 0.23372311890125275, + "rewards/margins_min": 0.07357443124055862, + "rewards/margins_std": 0.11324223130941391, + "rewards/rejected": -0.10465456545352936, + "step": 2000 + }, + { + "epoch": 0.63, + "grad_norm": 0.376953125, + "learning_rate": 3.568325754389437e-07, + "logits/chosen": -1.4234205484390259, + "logits/rejected": -0.9812124371528625, + "logps/chosen": -233.41748046875, + "logps/rejected": -208.1199493408203, + "loss": 0.6291, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0440436489880085, + "rewards/margins": 0.12118716537952423, + "rewards/margins_max": 0.17412233352661133, + "rewards/margins_min": 0.06825198978185654, + "rewards/margins_std": 0.07486163079738617, + "rewards/rejected": -0.07714351266622543, + "step": 2010 + }, + { + "epoch": 0.64, + "grad_norm": 0.3828125, + "learning_rate": 3.515716426660314e-07, + "logits/chosen": -1.2047122716903687, + "logits/rejected": -0.977331280708313, + "logps/chosen": -228.931396484375, + "logps/rejected": -291.92071533203125, + "loss": 0.6243, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04894689470529556, + "rewards/margins": 0.1800243854522705, + "rewards/margins_max": 0.2859472930431366, + "rewards/margins_min": 0.07410150021314621, + "rewards/margins_std": 0.14979760348796844, + "rewards/rejected": -0.13107749819755554, + "step": 2020 + }, + { + "epoch": 0.64, + "grad_norm": 0.5, + "learning_rate": 3.463286694592685e-07, + "logits/chosen": -1.5284559726715088, + "logits/rejected": -1.197145700454712, + "logps/chosen": -280.45672607421875, + "logps/rejected": -251.2618865966797, + "loss": 0.6211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.049861326813697815, + "rewards/margins": 0.14641502499580383, + "rewards/margins_max": 0.20501860976219177, + "rewards/margins_min": 0.0878114253282547, + "rewards/margins_std": 0.08287801593542099, + "rewards/rejected": -0.09655369818210602, + "step": 2030 + }, + { + "epoch": 0.64, + "grad_norm": 0.375, + "learning_rate": 3.4110429020904916e-07, + "logits/chosen": -1.3765310049057007, + "logits/rejected": -0.8682848811149597, + "logps/chosen": -331.6241760253906, + "logps/rejected": -291.6063232421875, + "loss": 0.6177, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.040959432721138, + "rewards/margins": 0.19673797488212585, + "rewards/margins_max": 0.30799782276153564, + "rewards/margins_min": 0.08547808974981308, + "rewards/margins_std": 0.1573452204465866, + "rewards/rejected": -0.15577852725982666, + "step": 2040 + }, + { + "epoch": 0.65, + "grad_norm": 0.427734375, + "learning_rate": 3.358991370559323e-07, + "logits/chosen": -1.4426238536834717, + "logits/rejected": -1.0889427661895752, + "logps/chosen": -212.64932250976562, + "logps/rejected": -232.1751708984375, + "loss": 0.617, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.061146706342697144, + "rewards/margins": 0.1886657178401947, + "rewards/margins_max": 0.285667359828949, + "rewards/margins_min": 0.09166404604911804, + "rewards/margins_std": 0.13718107342720032, + "rewards/rejected": -0.12751901149749756, + "step": 2050 + }, + { + "epoch": 0.65, + "grad_norm": 0.33203125, + "learning_rate": 3.307138398141528e-07, + "logits/chosen": -1.4597798585891724, + "logits/rejected": -1.02079176902771, + "logps/chosen": -205.18765258789062, + "logps/rejected": -242.97683715820312, + "loss": 0.6111, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0566553995013237, + "rewards/margins": 0.18628427386283875, + "rewards/margins_max": 0.2999788224697113, + "rewards/margins_min": 0.07258973270654678, + "rewards/margins_std": 0.1607883721590042, + "rewards/rejected": -0.12962886691093445, + "step": 2060 + }, + { + "epoch": 0.65, + "grad_norm": 0.61328125, + "learning_rate": 3.2554902589541664e-07, + "logits/chosen": -1.4502575397491455, + "logits/rejected": -1.0646450519561768, + "logps/chosen": -172.86062622070312, + "logps/rejected": -165.33616638183594, + "loss": 0.6217, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04238683357834816, + "rewards/margins": 0.11539296805858612, + "rewards/margins_max": 0.17349234223365784, + "rewards/margins_min": 0.05729362368583679, + "rewards/margins_std": 0.08216488361358643, + "rewards/rejected": -0.07300613820552826, + "step": 2070 + }, + { + "epoch": 0.66, + "grad_norm": 0.40625, + "learning_rate": 3.204053202329835e-07, + "logits/chosen": -1.3668615818023682, + "logits/rejected": -0.9603246450424194, + "logps/chosen": -245.4371795654297, + "logps/rejected": -241.2156524658203, + "loss": 0.6261, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03958984464406967, + "rewards/margins": 0.15117119252681732, + "rewards/margins_max": 0.22184331715106964, + "rewards/margins_min": 0.080499067902565, + "rewards/margins_std": 0.09994547069072723, + "rewards/rejected": -0.11158134043216705, + "step": 2080 + }, + { + "epoch": 0.66, + "grad_norm": 0.4296875, + "learning_rate": 3.1528334520605216e-07, + "logits/chosen": -1.282301425933838, + "logits/rejected": -1.1172327995300293, + "logps/chosen": -212.07632446289062, + "logps/rejected": -271.19989013671875, + "loss": 0.6307, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03222063556313515, + "rewards/margins": 0.157441645860672, + "rewards/margins_max": 0.22982993721961975, + "rewards/margins_min": 0.08505336940288544, + "rewards/margins_std": 0.10237250477075577, + "rewards/rejected": -0.12522102892398834, + "step": 2090 + }, + { + "epoch": 0.66, + "grad_norm": 0.43359375, + "learning_rate": 3.1018372056445305e-07, + "logits/chosen": -1.3811043500900269, + "logits/rejected": -0.9078986048698425, + "logps/chosen": -239.8875274658203, + "logps/rejected": -216.6652069091797, + "loss": 0.6223, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.046834252774715424, + "rewards/margins": 0.18769201636314392, + "rewards/margins_max": 0.2837710678577423, + "rewards/margins_min": 0.09161292761564255, + "rewards/margins_std": 0.13587632775306702, + "rewards/rejected": -0.1408577710390091, + "step": 2100 + }, + { + "epoch": 0.66, + "grad_norm": 0.421875, + "learning_rate": 3.0510706335366034e-07, + "logits/chosen": -1.4970591068267822, + "logits/rejected": -1.091909408569336, + "logps/chosen": -195.78086853027344, + "logps/rejected": -198.8799591064453, + "loss": 0.6334, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.054044730961322784, + "rewards/margins": 0.12333653122186661, + "rewards/margins_max": 0.1747966706752777, + "rewards/margins_min": 0.0718763917684555, + "rewards/margins_std": 0.07277561724185944, + "rewards/rejected": -0.06929179280996323, + "step": 2110 + }, + { + "epoch": 0.67, + "grad_norm": 0.474609375, + "learning_rate": 3.000539878401296e-07, + "logits/chosen": -1.4365322589874268, + "logits/rejected": -1.1631158590316772, + "logps/chosen": -179.49465942382812, + "logps/rejected": -198.1630401611328, + "loss": 0.6298, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04496608301997185, + "rewards/margins": 0.11149311065673828, + "rewards/margins_max": 0.1663789451122284, + "rewards/margins_min": 0.05660729482769966, + "rewards/margins_std": 0.07762027531862259, + "rewards/rejected": -0.06652702391147614, + "step": 2120 + }, + { + "epoch": 0.67, + "grad_norm": 0.41796875, + "learning_rate": 2.9502510543697323e-07, + "logits/chosen": -1.4205095767974854, + "logits/rejected": -1.1358397006988525, + "logps/chosen": -190.26431274414062, + "logps/rejected": -216.4228057861328, + "loss": 0.6252, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0446227490901947, + "rewards/margins": 0.13534995913505554, + "rewards/margins_max": 0.2004757821559906, + "rewards/margins_min": 0.07022411376237869, + "rewards/margins_std": 0.09210184216499329, + "rewards/rejected": -0.09072719514369965, + "step": 2130 + }, + { + "epoch": 0.67, + "grad_norm": 0.482421875, + "learning_rate": 2.900210246299808e-07, + "logits/chosen": -1.3750842809677124, + "logits/rejected": -0.9162171483039856, + "logps/chosen": -235.06005859375, + "logps/rejected": -295.83526611328125, + "loss": 0.6131, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06156709045171738, + "rewards/margins": 0.19232510030269623, + "rewards/margins_max": 0.29850488901138306, + "rewards/margins_min": 0.0861453041434288, + "rewards/margins_std": 0.15016090869903564, + "rewards/rejected": -0.13075801730155945, + "step": 2140 + }, + { + "epoch": 0.68, + "grad_norm": 0.404296875, + "learning_rate": 2.8504235090399275e-07, + "logits/chosen": -1.246057391166687, + "logits/rejected": -1.2714288234710693, + "logps/chosen": -128.15052795410156, + "logps/rejected": -195.8909454345703, + "loss": 0.6329, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03380041942000389, + "rewards/margins": 0.14983999729156494, + "rewards/margins_max": 0.23497620224952698, + "rewards/margins_min": 0.06470384448766708, + "rewards/margins_std": 0.12040072679519653, + "rewards/rejected": -0.11603958904743195, + "step": 2150 + }, + { + "epoch": 0.68, + "grad_norm": 0.4609375, + "learning_rate": 2.800896866696382e-07, + "logits/chosen": -1.375130534172058, + "logits/rejected": -1.0531413555145264, + "logps/chosen": -180.38128662109375, + "logps/rejected": -198.62501525878906, + "loss": 0.6281, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04046819731593132, + "rewards/margins": 0.1361556351184845, + "rewards/margins_max": 0.22056671977043152, + "rewards/margins_min": 0.05174453184008598, + "rewards/margins_std": 0.1193753108382225, + "rewards/rejected": -0.09568743407726288, + "step": 2160 + }, + { + "epoch": 0.68, + "grad_norm": 0.408203125, + "learning_rate": 2.7516363119044437e-07, + "logits/chosen": -1.3951034545898438, + "logits/rejected": -1.1207072734832764, + "logps/chosen": -194.72064208984375, + "logps/rejected": -230.1772003173828, + "loss": 0.6286, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04992605373263359, + "rewards/margins": 0.12686730921268463, + "rewards/margins_max": 0.18470284342765808, + "rewards/margins_min": 0.06903177499771118, + "rewards/margins_std": 0.08179178088903427, + "rewards/rejected": -0.07694125175476074, + "step": 2170 + }, + { + "epoch": 0.69, + "grad_norm": 0.50390625, + "learning_rate": 2.702647805103262e-07, + "logits/chosen": -1.4403280019760132, + "logits/rejected": -1.1164398193359375, + "logps/chosen": -186.88186645507812, + "logps/rejected": -217.31051635742188, + "loss": 0.6124, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.049945756793022156, + "rewards/margins": 0.16797830164432526, + "rewards/margins_max": 0.266406774520874, + "rewards/margins_min": 0.0695497915148735, + "rewards/margins_std": 0.1391989141702652, + "rewards/rejected": -0.1180325299501419, + "step": 2180 + }, + { + "epoch": 0.69, + "grad_norm": 0.51953125, + "learning_rate": 2.6539372738146694e-07, + "logits/chosen": -1.4543806314468384, + "logits/rejected": -0.953220009803772, + "logps/chosen": -255.16073608398438, + "logps/rejected": -233.70040893554688, + "loss": 0.6168, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03919634595513344, + "rewards/margins": 0.20791907608509064, + "rewards/margins_max": 0.3329697251319885, + "rewards/margins_min": 0.08286843448877335, + "rewards/margins_std": 0.17684832215309143, + "rewards/rejected": -0.1687227189540863, + "step": 2190 + }, + { + "epoch": 0.69, + "grad_norm": 0.6328125, + "learning_rate": 2.605510611925955e-07, + "logits/chosen": -1.2929773330688477, + "logits/rejected": -0.9437387585639954, + "logps/chosen": -202.4819793701172, + "logps/rejected": -246.94662475585938, + "loss": 0.6224, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04549327492713928, + "rewards/margins": 0.17824196815490723, + "rewards/margins_max": 0.2709406912326813, + "rewards/margins_min": 0.08554325997829437, + "rewards/margins_std": 0.131095752120018, + "rewards/rejected": -0.13274869322776794, + "step": 2200 + }, + { + "epoch": 0.7, + "grad_norm": 0.462890625, + "learning_rate": 2.557373678976723e-07, + "logits/chosen": -1.408676266670227, + "logits/rejected": -0.9201229214668274, + "logps/chosen": -237.0033721923828, + "logps/rejected": -179.01229858398438, + "loss": 0.6269, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.043176356703042984, + "rewards/margins": 0.1315043866634369, + "rewards/margins_max": 0.21515369415283203, + "rewards/margins_min": 0.047855086624622345, + "rewards/margins_std": 0.11829797923564911, + "rewards/rejected": -0.0883280336856842, + "step": 2210 + }, + { + "epoch": 0.7, + "grad_norm": 0.318359375, + "learning_rate": 2.5095322994498846e-07, + "logits/chosen": -1.5247070789337158, + "logits/rejected": -1.1096593141555786, + "logps/chosen": -250.4240264892578, + "logps/rejected": -194.3154296875, + "loss": 0.6146, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04496272653341293, + "rewards/margins": 0.1702985316514969, + "rewards/margins_max": 0.27228885889053345, + "rewards/margins_min": 0.06830821931362152, + "rewards/margins_std": 0.14423608779907227, + "rewards/rejected": -0.12533581256866455, + "step": 2220 + }, + { + "epoch": 0.7, + "grad_norm": 0.421875, + "learning_rate": 2.4619922620669215e-07, + "logits/chosen": -1.4068553447723389, + "logits/rejected": -1.0456701517105103, + "logps/chosen": -208.51882934570312, + "logps/rejected": -200.5174102783203, + "loss": 0.6294, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05011656880378723, + "rewards/margins": 0.14396075904369354, + "rewards/margins_max": 0.22980065643787384, + "rewards/margins_min": 0.058120857924222946, + "rewards/margins_std": 0.12139594554901123, + "rewards/rejected": -0.09384419023990631, + "step": 2230 + }, + { + "epoch": 0.71, + "grad_norm": 0.51171875, + "learning_rate": 2.414759319087452e-07, + "logits/chosen": -1.5388705730438232, + "logits/rejected": -1.0942213535308838, + "logps/chosen": -234.5261688232422, + "logps/rejected": -196.3878631591797, + "loss": 0.6207, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04927833378314972, + "rewards/margins": 0.12129292637109756, + "rewards/margins_max": 0.17791934311389923, + "rewards/margins_min": 0.06466653198003769, + "rewards/margins_std": 0.08008182793855667, + "rewards/rejected": -0.07201460003852844, + "step": 2240 + }, + { + "epoch": 0.71, + "grad_norm": 0.51171875, + "learning_rate": 2.3678391856132202e-07, + "logits/chosen": -1.5716922283172607, + "logits/rejected": -1.0814467668533325, + "logps/chosen": -236.632080078125, + "logps/rejected": -212.31906127929688, + "loss": 0.6342, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.044985391199588776, + "rewards/margins": 0.15734454989433289, + "rewards/margins_max": 0.2386656105518341, + "rewards/margins_min": 0.07602350413799286, + "rewards/margins_std": 0.11500532925128937, + "rewards/rejected": -0.1123591810464859, + "step": 2250 + }, + { + "epoch": 0.71, + "grad_norm": 0.478515625, + "learning_rate": 2.321237538896579e-07, + "logits/chosen": -1.3435510396957397, + "logits/rejected": -1.0200581550598145, + "logps/chosen": -340.9153747558594, + "logps/rejected": -310.7401428222656, + "loss": 0.6111, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0401308573782444, + "rewards/margins": 0.1969205141067505, + "rewards/margins_max": 0.30011191964149475, + "rewards/margins_min": 0.09372911602258682, + "rewards/margins_std": 0.14593467116355896, + "rewards/rejected": -0.1567896604537964, + "step": 2260 + }, + { + "epoch": 0.72, + "grad_norm": 0.58984375, + "learning_rate": 2.2749600176535533e-07, + "logits/chosen": -1.4122085571289062, + "logits/rejected": -0.8751947283744812, + "logps/chosen": -255.9865264892578, + "logps/rejected": -214.06289672851562, + "loss": 0.6122, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04100383445620537, + "rewards/margins": 0.16915416717529297, + "rewards/margins_max": 0.25182080268859863, + "rewards/margins_min": 0.08648748695850372, + "rewards/margins_std": 0.11690831184387207, + "rewards/rejected": -0.1281503140926361, + "step": 2270 + }, + { + "epoch": 0.72, + "grad_norm": 0.43359375, + "learning_rate": 2.2290122213815605e-07, + "logits/chosen": -1.3835846185684204, + "logits/rejected": -1.1778188943862915, + "logps/chosen": -212.3909149169922, + "logps/rejected": -286.0127868652344, + "loss": 0.6164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05380062013864517, + "rewards/margins": 0.19799622893333435, + "rewards/margins_max": 0.3002737760543823, + "rewards/margins_min": 0.09571869671344757, + "rewards/margins_std": 0.14464230835437775, + "rewards/rejected": -0.14419563114643097, + "step": 2280 + }, + { + "epoch": 0.72, + "grad_norm": 0.46484375, + "learning_rate": 2.1833997096818895e-07, + "logits/chosen": -1.2950363159179688, + "logits/rejected": -1.0533965826034546, + "logps/chosen": -207.44741821289062, + "logps/rejected": -266.3078918457031, + "loss": 0.6148, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04672679305076599, + "rewards/margins": 0.18707481026649475, + "rewards/margins_max": 0.26824522018432617, + "rewards/margins_min": 0.10590440034866333, + "rewards/margins_std": 0.11479228734970093, + "rewards/rejected": -0.14034804701805115, + "step": 2290 + }, + { + "epoch": 0.72, + "grad_norm": 0.64453125, + "learning_rate": 2.1381280015869956e-07, + "logits/chosen": -1.3823282718658447, + "logits/rejected": -0.8315450549125671, + "logps/chosen": -262.98089599609375, + "logps/rejected": -228.6239013671875, + "loss": 0.6403, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04306049272418022, + "rewards/margins": 0.13284507393836975, + "rewards/margins_max": 0.195632204413414, + "rewards/margins_min": 0.07005792111158371, + "rewards/margins_std": 0.08879442512989044, + "rewards/rejected": -0.08978457748889923, + "step": 2300 + }, + { + "epoch": 0.73, + "grad_norm": 0.494140625, + "learning_rate": 2.0932025748927014e-07, + "logits/chosen": -1.300336480140686, + "logits/rejected": -1.0065299272537231, + "logps/chosen": -266.71722412109375, + "logps/rejected": -252.8298797607422, + "loss": 0.6336, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.045079831033945084, + "rewards/margins": 0.14786089956760406, + "rewards/margins_max": 0.21760694682598114, + "rewards/margins_min": 0.0781148225069046, + "rewards/margins_std": 0.09863585978746414, + "rewards/rejected": -0.10278107225894928, + "step": 2310 + }, + { + "epoch": 0.73, + "grad_norm": 0.51953125, + "learning_rate": 2.0486288654954027e-07, + "logits/chosen": -1.4983582496643066, + "logits/rejected": -1.009060025215149, + "logps/chosen": -221.823486328125, + "logps/rejected": -218.0142364501953, + "loss": 0.6192, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06399594247341156, + "rewards/margins": 0.17152294516563416, + "rewards/margins_max": 0.24920019507408142, + "rewards/margins_min": 0.0938456803560257, + "rewards/margins_std": 0.10985223203897476, + "rewards/rejected": -0.1075270026922226, + "step": 2320 + }, + { + "epoch": 0.73, + "grad_norm": 0.392578125, + "learning_rate": 2.0044122667343295e-07, + "logits/chosen": -1.349254846572876, + "logits/rejected": -0.9567793011665344, + "logps/chosen": -220.45394897460938, + "logps/rejected": -214.1141815185547, + "loss": 0.6235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.055171359330415726, + "rewards/margins": 0.16959527134895325, + "rewards/margins_max": 0.24443653225898743, + "rewards/margins_min": 0.09475398808717728, + "rewards/margins_std": 0.10584155470132828, + "rewards/rejected": -0.11442389339208603, + "step": 2330 + }, + { + "epoch": 0.74, + "grad_norm": 0.53125, + "learning_rate": 1.9605581287389633e-07, + "logits/chosen": -1.3867130279541016, + "logits/rejected": -1.2174745798110962, + "logps/chosen": -196.43331909179688, + "logps/rejected": -213.40365600585938, + "loss": 0.6256, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03618159890174866, + "rewards/margins": 0.15966863930225372, + "rewards/margins_max": 0.2609175443649292, + "rewards/margins_min": 0.05841972678899765, + "rewards/margins_std": 0.14318758249282837, + "rewards/rejected": -0.12348704040050507, + "step": 2340 + }, + { + "epoch": 0.74, + "grad_norm": 0.52734375, + "learning_rate": 1.9170717577816786e-07, + "logits/chosen": -1.4080374240875244, + "logits/rejected": -1.087418794631958, + "logps/chosen": -227.89010620117188, + "logps/rejected": -259.26116943359375, + "loss": 0.6217, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03795627877116203, + "rewards/margins": 0.15768983960151672, + "rewards/margins_max": 0.24818861484527588, + "rewards/margins_min": 0.06719101220369339, + "rewards/margins_std": 0.1279846429824829, + "rewards/rejected": -0.1197335347533226, + "step": 2350 + }, + { + "epoch": 0.74, + "grad_norm": 0.50390625, + "learning_rate": 1.873958415635698e-07, + "logits/chosen": -1.5294129848480225, + "logits/rejected": -1.2719072103500366, + "logps/chosen": -238.9661102294922, + "logps/rejected": -242.7552947998047, + "loss": 0.6315, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04142700880765915, + "rewards/margins": 0.14358402788639069, + "rewards/margins_max": 0.21057486534118652, + "rewards/margins_min": 0.07659320533275604, + "rewards/margins_std": 0.09473933279514313, + "rewards/rejected": -0.10215701907873154, + "step": 2360 + }, + { + "epoch": 0.75, + "grad_norm": 0.65625, + "learning_rate": 1.8312233189384192e-07, + "logits/chosen": -1.48410964012146, + "logits/rejected": -0.9692693948745728, + "logps/chosen": -229.7482452392578, + "logps/rejected": -221.9854278564453, + "loss": 0.6061, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.059097446501255035, + "rewards/margins": 0.17855104804039001, + "rewards/margins_max": 0.27842646837234497, + "rewards/margins_min": 0.07867564260959625, + "rewards/margins_std": 0.14124515652656555, + "rewards/rejected": -0.11945360898971558, + "step": 2370 + }, + { + "epoch": 0.75, + "grad_norm": 0.412109375, + "learning_rate": 1.7888716385602205e-07, + "logits/chosen": -1.4760620594024658, + "logits/rejected": -0.9696518182754517, + "logps/chosen": -222.237548828125, + "logps/rejected": -202.98983764648438, + "loss": 0.6081, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04904181510210037, + "rewards/margins": 0.19830942153930664, + "rewards/margins_max": 0.2989969253540039, + "rewards/margins_min": 0.09762193262577057, + "rewards/margins_std": 0.14239361882209778, + "rewards/rejected": -0.14926761388778687, + "step": 2380 + }, + { + "epoch": 0.75, + "grad_norm": 0.462890625, + "learning_rate": 1.7469084989787908e-07, + "logits/chosen": -1.5105534791946411, + "logits/rejected": -1.268873929977417, + "logps/chosen": -218.3324432373047, + "logps/rejected": -277.1694030761719, + "loss": 0.6269, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05644305422902107, + "rewards/margins": 0.1589832305908203, + "rewards/margins_max": 0.2425994873046875, + "rewards/margins_min": 0.07536697387695312, + "rewards/margins_std": 0.11825122684240341, + "rewards/rejected": -0.10254015773534775, + "step": 2390 + }, + { + "epoch": 0.76, + "grad_norm": 0.5546875, + "learning_rate": 1.705338977659071e-07, + "logits/chosen": -1.3833867311477661, + "logits/rejected": -1.0862524509429932, + "logps/chosen": -222.6822967529297, + "logps/rejected": -247.6257781982422, + "loss": 0.6138, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04524652659893036, + "rewards/margins": 0.17377988994121552, + "rewards/margins_max": 0.2417949140071869, + "rewards/margins_min": 0.10576488077640533, + "rewards/margins_std": 0.09618774056434631, + "rewards/rejected": -0.12853336334228516, + "step": 2400 + }, + { + "epoch": 0.76, + "grad_norm": 0.462890625, + "learning_rate": 1.664168104438901e-07, + "logits/chosen": -1.3712468147277832, + "logits/rejected": -1.1276142597198486, + "logps/chosen": -232.34817504882812, + "logps/rejected": -245.1379852294922, + "loss": 0.616, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04633709043264389, + "rewards/margins": 0.16087999939918518, + "rewards/margins_max": 0.24791808426380157, + "rewards/margins_min": 0.07384191453456879, + "rewards/margins_std": 0.123090460896492, + "rewards/rejected": -0.11454291641712189, + "step": 2410 + }, + { + "epoch": 0.76, + "grad_norm": 0.443359375, + "learning_rate": 1.6234008609204104e-07, + "logits/chosen": -1.4258971214294434, + "logits/rejected": -1.0054690837860107, + "logps/chosen": -274.0898132324219, + "logps/rejected": -227.77304077148438, + "loss": 0.6347, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04632297903299332, + "rewards/margins": 0.1191258653998375, + "rewards/margins_max": 0.17983102798461914, + "rewards/margins_min": 0.058420680463314056, + "rewards/margins_std": 0.0858500748872757, + "rewards/rejected": -0.07280287891626358, + "step": 2420 + }, + { + "epoch": 0.77, + "grad_norm": 0.412109375, + "learning_rate": 1.5830421798672565e-07, + "logits/chosen": -1.5638043880462646, + "logits/rejected": -1.2185170650482178, + "logps/chosen": -212.51797485351562, + "logps/rejected": -266.51495361328125, + "loss": 0.6243, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06803154945373535, + "rewards/margins": 0.15410315990447998, + "rewards/margins_max": 0.22409594058990479, + "rewards/margins_min": 0.08411036431789398, + "rewards/margins_std": 0.09898475557565689, + "rewards/rejected": -0.08607159554958344, + "step": 2430 + }, + { + "epoch": 0.77, + "grad_norm": 0.5703125, + "learning_rate": 1.5430969446077675e-07, + "logits/chosen": -1.206554651260376, + "logits/rejected": -0.9525250196456909, + "logps/chosen": -220.4082489013672, + "logps/rejected": -265.2112731933594, + "loss": 0.6248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.025963077321648598, + "rewards/margins": 0.16358208656311035, + "rewards/margins_max": 0.23860347270965576, + "rewards/margins_min": 0.08856071531772614, + "rewards/margins_std": 0.10609626770019531, + "rewards/rejected": -0.1376190185546875, + "step": 2440 + }, + { + "epoch": 0.77, + "grad_norm": 0.486328125, + "learning_rate": 1.5035699884440695e-07, + "logits/chosen": -1.298842191696167, + "logits/rejected": -0.9138208627700806, + "logps/chosen": -207.61990356445312, + "logps/rejected": -222.8286590576172, + "loss": 0.6145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.034749679267406464, + "rewards/margins": 0.13267698884010315, + "rewards/margins_max": 0.20068855583667755, + "rewards/margins_min": 0.06466543674468994, + "rewards/margins_std": 0.09618286788463593, + "rewards/rejected": -0.09792731702327728, + "step": 2450 + }, + { + "epoch": 0.77, + "grad_norm": 0.5234375, + "learning_rate": 1.4644660940672627e-07, + "logits/chosen": -1.326765775680542, + "logits/rejected": -1.1471151113510132, + "logps/chosen": -204.8334197998047, + "logps/rejected": -205.65042114257812, + "loss": 0.6283, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.018175512552261353, + "rewards/margins": 0.12012593448162079, + "rewards/margins_max": 0.1839393973350525, + "rewards/margins_min": 0.05631248280405998, + "rewards/margins_std": 0.09024585783481598, + "rewards/rejected": -0.10195042937994003, + "step": 2460 + }, + { + "epoch": 0.78, + "grad_norm": 0.451171875, + "learning_rate": 1.4257899929787292e-07, + "logits/chosen": -1.4262887239456177, + "logits/rejected": -1.038913607597351, + "logps/chosen": -245.40530395507812, + "logps/rejected": -239.4207305908203, + "loss": 0.6269, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05134139209985733, + "rewards/margins": 0.15652242302894592, + "rewards/margins_max": 0.22957094013690948, + "rewards/margins_min": 0.08347393572330475, + "rewards/margins_std": 0.10330617427825928, + "rewards/rejected": -0.10518103837966919, + "step": 2470 + }, + { + "epoch": 0.78, + "grad_norm": 0.328125, + "learning_rate": 1.3875463649176282e-07, + "logits/chosen": -1.3459253311157227, + "logits/rejected": -1.2240893840789795, + "logps/chosen": -160.60562133789062, + "logps/rejected": -167.91712951660156, + "loss": 0.6363, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03096461296081543, + "rewards/margins": 0.10902484506368637, + "rewards/margins_max": 0.15568535029888153, + "rewards/margins_min": 0.062364332377910614, + "rewards/margins_std": 0.06598792225122452, + "rewards/rejected": -0.07806022465229034, + "step": 2480 + }, + { + "epoch": 0.78, + "grad_norm": 0.462890625, + "learning_rate": 1.34973983729465e-07, + "logits/chosen": -1.245940089225769, + "logits/rejected": -0.9054630994796753, + "logps/chosen": -240.6822509765625, + "logps/rejected": -227.9288787841797, + "loss": 0.6368, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05721098184585571, + "rewards/margins": 0.12238309532403946, + "rewards/margins_max": 0.16339154541492462, + "rewards/margins_min": 0.0813746303319931, + "rewards/margins_std": 0.05799471214413643, + "rewards/rejected": -0.06517211347818375, + "step": 2490 + }, + { + "epoch": 0.79, + "grad_norm": 0.482421875, + "learning_rate": 1.312374984632118e-07, + "logits/chosen": -1.4160993099212646, + "logits/rejected": -0.9629823565483093, + "logps/chosen": -220.81332397460938, + "logps/rejected": -166.48544311523438, + "loss": 0.6145, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.025352340191602707, + "rewards/margins": 0.1693764626979828, + "rewards/margins_max": 0.24544978141784668, + "rewards/margins_min": 0.0933031514286995, + "rewards/margins_std": 0.1075839176774025, + "rewards/rejected": -0.14402411878108978, + "step": 2500 + }, + { + "epoch": 0.79, + "grad_norm": 0.5703125, + "learning_rate": 1.2754563280104714e-07, + "logits/chosen": -1.5705251693725586, + "logits/rejected": -1.1976501941680908, + "logps/chosen": -190.46922302246094, + "logps/rejected": -252.851806640625, + "loss": 0.6235, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.06647202372550964, + "rewards/margins": 0.14776286482810974, + "rewards/margins_max": 0.23496422171592712, + "rewards/margins_min": 0.06056150048971176, + "rewards/margins_std": 0.12332135438919067, + "rewards/rejected": -0.0812908411026001, + "step": 2510 + }, + { + "epoch": 0.79, + "grad_norm": 0.5859375, + "learning_rate": 1.238988334521226e-07, + "logits/chosen": -1.442922592163086, + "logits/rejected": -1.111483097076416, + "logps/chosen": -182.8762664794922, + "logps/rejected": -199.7118682861328, + "loss": 0.6118, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.050991643220186234, + "rewards/margins": 0.15598046779632568, + "rewards/margins_max": 0.24103443324565887, + "rewards/margins_min": 0.0709264725446701, + "rewards/margins_std": 0.12028451263904572, + "rewards/rejected": -0.10498883575201035, + "step": 2520 + }, + { + "epoch": 0.8, + "grad_norm": 0.44140625, + "learning_rate": 1.202975416726464e-07, + "logits/chosen": -1.4351770877838135, + "logits/rejected": -1.0629112720489502, + "logps/chosen": -229.93716430664062, + "logps/rejected": -258.378173828125, + "loss": 0.6123, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.058566201478242874, + "rewards/margins": 0.15714897215366364, + "rewards/margins_max": 0.21117070317268372, + "rewards/margins_min": 0.10312725603580475, + "rewards/margins_std": 0.07639826089143753, + "rewards/rejected": -0.09858278185129166, + "step": 2530 + }, + { + "epoch": 0.8, + "grad_norm": 0.6484375, + "learning_rate": 1.1674219321249212e-07, + "logits/chosen": -1.4108346700668335, + "logits/rejected": -1.0379550457000732, + "logps/chosen": -225.57199096679688, + "logps/rejected": -248.28396606445312, + "loss": 0.6231, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.042424093931913376, + "rewards/margins": 0.16670480370521545, + "rewards/margins_max": 0.23077841103076935, + "rewards/margins_min": 0.10263122618198395, + "rewards/margins_std": 0.0906137228012085, + "rewards/rejected": -0.12428070604801178, + "step": 2540 + }, + { + "epoch": 0.8, + "grad_norm": 0.3671875, + "learning_rate": 1.1323321826247345e-07, + "logits/chosen": -1.5495703220367432, + "logits/rejected": -1.2719228267669678, + "logps/chosen": -207.6171417236328, + "logps/rejected": -208.72048950195312, + "loss": 0.6275, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05759170651435852, + "rewards/margins": 0.14356836676597595, + "rewards/margins_max": 0.2183394432067871, + "rewards/margins_min": 0.0687972754240036, + "rewards/margins_std": 0.10574229806661606, + "rewards/rejected": -0.08597666025161743, + "step": 2550 + }, + { + "epoch": 0.81, + "grad_norm": 0.5859375, + "learning_rate": 1.0977104140229265e-07, + "logits/chosen": -1.298298954963684, + "logits/rejected": -0.9742182493209839, + "logps/chosen": -213.3368682861328, + "logps/rejected": -246.1780548095703, + "loss": 0.6203, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04572229087352753, + "rewards/margins": 0.20839472115039825, + "rewards/margins_max": 0.3573569357395172, + "rewards/margins_min": 0.059432536363601685, + "rewards/margins_std": 0.21066434681415558, + "rewards/rejected": -0.16267244517803192, + "step": 2560 + }, + { + "epoch": 0.81, + "grad_norm": 0.51953125, + "learning_rate": 1.0635608154916647e-07, + "logits/chosen": -1.477253794670105, + "logits/rejected": -0.9998503923416138, + "logps/chosen": -280.1942138671875, + "logps/rejected": -280.0354919433594, + "loss": 0.6302, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04247181490063667, + "rewards/margins": 0.15468376874923706, + "rewards/margins_max": 0.21085813641548157, + "rewards/margins_min": 0.09850938618183136, + "rewards/margins_std": 0.0794425681233406, + "rewards/rejected": -0.11221196502447128, + "step": 2570 + }, + { + "epoch": 0.81, + "grad_norm": 0.4921875, + "learning_rate": 1.0298875190713801e-07, + "logits/chosen": -1.3592890501022339, + "logits/rejected": -0.968646228313446, + "logps/chosen": -242.7200927734375, + "logps/rejected": -223.07980346679688, + "loss": 0.6299, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0529966726899147, + "rewards/margins": 0.1338491290807724, + "rewards/margins_max": 0.20661978423595428, + "rewards/margins_min": 0.061078451573848724, + "rewards/margins_std": 0.1029132753610611, + "rewards/rejected": -0.0808524563908577, + "step": 2580 + }, + { + "epoch": 0.82, + "grad_norm": 0.5390625, + "learning_rate": 9.966945991708003e-08, + "logits/chosen": -1.3368492126464844, + "logits/rejected": -0.9701792597770691, + "logps/chosen": -254.62747192382812, + "logps/rejected": -180.07473754882812, + "loss": 0.6205, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04608957841992378, + "rewards/margins": 0.12919795513153076, + "rewards/margins_max": 0.20184385776519775, + "rewards/margins_min": 0.056552063673734665, + "rewards/margins_std": 0.10273680835962296, + "rewards/rejected": -0.08310838788747787, + "step": 2590 + }, + { + "epoch": 0.82, + "grad_norm": 0.279296875, + "learning_rate": 9.639860720739523e-08, + "logits/chosen": -1.458553433418274, + "logits/rejected": -1.2161775827407837, + "logps/chosen": -211.18978881835938, + "logps/rejected": -259.21673583984375, + "loss": 0.6296, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04403019696474075, + "rewards/margins": 0.13392777740955353, + "rewards/margins_max": 0.21003444492816925, + "rewards/margins_min": 0.057821135967969894, + "rewards/margins_std": 0.10763102769851685, + "rewards/rejected": -0.08989757299423218, + "step": 2600 + }, + { + "epoch": 0.82, + "grad_norm": 0.6796875, + "learning_rate": 9.31765895454199e-08, + "logits/chosen": -1.5028635263442993, + "logits/rejected": -1.0774997472763062, + "logps/chosen": -204.22152709960938, + "logps/rejected": -289.7691345214844, + "loss": 0.6152, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.056610412895679474, + "rewards/margins": 0.1657501459121704, + "rewards/margins_max": 0.2222793996334076, + "rewards/margins_min": 0.10922084748744965, + "rewards/margins_std": 0.07994447648525238, + "rewards/rejected": -0.10913971811532974, + "step": 2610 + }, + { + "epoch": 0.83, + "grad_norm": 0.41015625, + "learning_rate": 9.000379678953667e-08, + "logits/chosen": -1.4289562702178955, + "logits/rejected": -1.1931815147399902, + "logps/chosen": -211.5123291015625, + "logps/rejected": -219.6265869140625, + "loss": 0.6264, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.029694851487874985, + "rewards/margins": 0.12272848188877106, + "rewards/margins_max": 0.18134915828704834, + "rewards/margins_min": 0.06410779803991318, + "rewards/margins_std": 0.08290217071771622, + "rewards/rejected": -0.09303363412618637, + "step": 2620 + }, + { + "epoch": 0.83, + "grad_norm": 0.5546875, + "learning_rate": 8.688061284200265e-08, + "logits/chosen": -1.5567282438278198, + "logits/rejected": -1.3019843101501465, + "logps/chosen": -263.53070068359375, + "logps/rejected": -248.8045654296875, + "loss": 0.6245, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0438472144305706, + "rewards/margins": 0.15204408764839172, + "rewards/margins_max": 0.24873380362987518, + "rewards/margins_min": 0.055354367941617966, + "rewards/margins_std": 0.13673990964889526, + "rewards/rejected": -0.10819686949253082, + "step": 2630 + }, + { + "epoch": 0.83, + "grad_norm": 0.341796875, + "learning_rate": 8.380741560249726e-08, + "logits/chosen": -1.3044310808181763, + "logits/rejected": -1.0237683057785034, + "logps/chosen": -185.43040466308594, + "logps/rejected": -250.0048828125, + "loss": 0.6321, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.042631763964891434, + "rewards/margins": 0.12767274677753448, + "rewards/margins_max": 0.19831757247447968, + "rewards/margins_min": 0.05702788755297661, + "rewards/margins_std": 0.09990689903497696, + "rewards/rejected": -0.08504097163677216, + "step": 2640 + }, + { + "epoch": 0.83, + "grad_norm": 0.3984375, + "learning_rate": 8.078457692239809e-08, + "logits/chosen": -1.2682950496673584, + "logits/rejected": -1.0586285591125488, + "logps/chosen": -223.0974884033203, + "logps/rejected": -226.95260620117188, + "loss": 0.6261, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04470429569482803, + "rewards/margins": 0.17180202901363373, + "rewards/margins_max": 0.2570766806602478, + "rewards/margins_min": 0.08652739226818085, + "rewards/margins_std": 0.12059654295444489, + "rewards/rejected": -0.1270977258682251, + "step": 2650 + }, + { + "epoch": 0.84, + "grad_norm": 0.39453125, + "learning_rate": 7.781246255978685e-08, + "logits/chosen": -1.250165581703186, + "logits/rejected": -1.0497492551803589, + "logps/chosen": -234.21694946289062, + "logps/rejected": -198.62652587890625, + "loss": 0.6232, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.024458685889840126, + "rewards/margins": 0.16458727419376373, + "rewards/margins_max": 0.2734147012233734, + "rewards/margins_min": 0.055759839713573456, + "rewards/margins_std": 0.1539052277803421, + "rewards/rejected": -0.14012858271598816, + "step": 2660 + }, + { + "epoch": 0.84, + "grad_norm": 0.42578125, + "learning_rate": 7.4891432135193e-08, + "logits/chosen": -1.3641499280929565, + "logits/rejected": -1.0219401121139526, + "logps/chosen": -206.70797729492188, + "logps/rejected": -205.9700927734375, + "loss": 0.6325, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.035322077572345734, + "rewards/margins": 0.13450254499912262, + "rewards/margins_max": 0.18772754073143005, + "rewards/margins_min": 0.0812775120139122, + "rewards/margins_std": 0.07527154684066772, + "rewards/rejected": -0.0991804450750351, + "step": 2670 + }, + { + "epoch": 0.84, + "grad_norm": 0.408203125, + "learning_rate": 7.202183908808124e-08, + "logits/chosen": -1.22100031375885, + "logits/rejected": -0.9506253004074097, + "logps/chosen": -237.82974243164062, + "logps/rejected": -252.8350372314453, + "loss": 0.6205, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03242502361536026, + "rewards/margins": 0.1422858089208603, + "rewards/margins_max": 0.2102351188659668, + "rewards/margins_min": 0.07433655858039856, + "rewards/margins_std": 0.0960947722196579, + "rewards/rejected": -0.10986080020666122, + "step": 2680 + }, + { + "epoch": 0.85, + "grad_norm": 0.5, + "learning_rate": 6.920403063408526e-08, + "logits/chosen": -1.39353346824646, + "logits/rejected": -0.8322644233703613, + "logps/chosen": -393.1592102050781, + "logps/rejected": -256.74774169921875, + "loss": 0.6135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.043674807995557785, + "rewards/margins": 0.1816544234752655, + "rewards/margins_max": 0.2530291676521301, + "rewards/margins_min": 0.11027966439723969, + "rewards/margins_std": 0.10093915462493896, + "rewards/rejected": -0.13797961175441742, + "step": 2690 + }, + { + "epoch": 0.85, + "grad_norm": 0.44921875, + "learning_rate": 6.643834772299544e-08, + "logits/chosen": -1.4720598459243774, + "logits/rejected": -1.0714927911758423, + "logps/chosen": -207.9117889404297, + "logps/rejected": -220.92147827148438, + "loss": 0.6249, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0551542267203331, + "rewards/margins": 0.12827317416667938, + "rewards/margins_max": 0.2003893405199051, + "rewards/margins_min": 0.05615702271461487, + "rewards/margins_std": 0.10198765993118286, + "rewards/rejected": -0.07311895489692688, + "step": 2700 + }, + { + "epoch": 0.85, + "grad_norm": 0.46484375, + "learning_rate": 6.372512499750471e-08, + "logits/chosen": -1.4824740886688232, + "logits/rejected": -1.176748275756836, + "logps/chosen": -191.59780883789062, + "logps/rejected": -214.6171875, + "loss": 0.6386, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05065562576055527, + "rewards/margins": 0.12029320001602173, + "rewards/margins_max": 0.16716250777244568, + "rewards/margins_min": 0.07342389971017838, + "rewards/margins_std": 0.0662832111120224, + "rewards/rejected": -0.06963758170604706, + "step": 2710 + }, + { + "epoch": 0.86, + "grad_norm": 0.49609375, + "learning_rate": 6.106469075271714e-08, + "logits/chosen": -1.5376232862472534, + "logits/rejected": -1.1263943910598755, + "logps/chosen": -180.64654541015625, + "logps/rejected": -257.4786071777344, + "loss": 0.6108, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.055960871279239655, + "rewards/margins": 0.17603328824043274, + "rewards/margins_max": 0.26209133863449097, + "rewards/margins_min": 0.08997530490159988, + "rewards/margins_std": 0.12170439958572388, + "rewards/rejected": -0.12007243931293488, + "step": 2720 + }, + { + "epoch": 0.86, + "grad_norm": 0.55859375, + "learning_rate": 5.845736689642472e-08, + "logits/chosen": -1.3399255275726318, + "logits/rejected": -0.8938875198364258, + "logps/chosen": -219.58438110351562, + "logps/rejected": -196.69725036621094, + "loss": 0.6215, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0496777668595314, + "rewards/margins": 0.14592623710632324, + "rewards/margins_max": 0.22191736102104187, + "rewards/margins_min": 0.06993507593870163, + "rewards/margins_std": 0.10746772587299347, + "rewards/rejected": -0.09624846279621124, + "step": 2730 + }, + { + "epoch": 0.86, + "grad_norm": 0.47265625, + "learning_rate": 5.590346891015757e-08, + "logits/chosen": -1.5089246034622192, + "logits/rejected": -0.9761932492256165, + "logps/chosen": -251.63235473632812, + "logps/rejected": -270.4111328125, + "loss": 0.626, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06311796605587006, + "rewards/margins": 0.16048596799373627, + "rewards/margins_max": 0.2200118750333786, + "rewards/margins_min": 0.10096003860235214, + "rewards/margins_std": 0.08418238908052444, + "rewards/rejected": -0.09736800938844681, + "step": 2740 + }, + { + "epoch": 0.87, + "grad_norm": 0.546875, + "learning_rate": 5.340330581101088e-08, + "logits/chosen": -1.4080811738967896, + "logits/rejected": -0.965406060218811, + "logps/chosen": -207.83154296875, + "logps/rejected": -219.427978515625, + "loss": 0.6289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04717772826552391, + "rewards/margins": 0.13114681839942932, + "rewards/margins_max": 0.18514610826969147, + "rewards/margins_min": 0.07714752852916718, + "rewards/margins_std": 0.07636652886867523, + "rewards/rejected": -0.08396908640861511, + "step": 2750 + }, + { + "epoch": 0.87, + "grad_norm": 0.427734375, + "learning_rate": 5.0957180114254536e-08, + "logits/chosen": -1.4429428577423096, + "logits/rejected": -1.047139048576355, + "logps/chosen": -210.0402069091797, + "logps/rejected": -201.76473999023438, + "loss": 0.6171, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04892607405781746, + "rewards/margins": 0.1490209847688675, + "rewards/margins_max": 0.2127954214811325, + "rewards/margins_min": 0.08524654805660248, + "rewards/margins_std": 0.09019068628549576, + "rewards/rejected": -0.10009489953517914, + "step": 2760 + }, + { + "epoch": 0.87, + "grad_norm": 0.490234375, + "learning_rate": 4.8565387796728864e-08, + "logits/chosen": -1.4443720579147339, + "logits/rejected": -0.9906848073005676, + "logps/chosen": -175.06602478027344, + "logps/rejected": -188.9914093017578, + "loss": 0.6216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05647973343729973, + "rewards/margins": 0.17043130099773407, + "rewards/margins_max": 0.23241189122200012, + "rewards/margins_min": 0.10845069587230682, + "rewards/margins_std": 0.08765380084514618, + "rewards/rejected": -0.11395156383514404, + "step": 2770 + }, + { + "epoch": 0.88, + "grad_norm": 0.375, + "learning_rate": 4.622821826103285e-08, + "logits/chosen": -1.3417904376983643, + "logits/rejected": -0.9641525149345398, + "logps/chosen": -182.08653259277344, + "logps/rejected": -219.1973876953125, + "loss": 0.6257, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03520121052861214, + "rewards/margins": 0.16146528720855713, + "rewards/margins_max": 0.23596426844596863, + "rewards/margins_min": 0.08696627616882324, + "rewards/margins_std": 0.10535748302936554, + "rewards/rejected": -0.1262640655040741, + "step": 2780 + }, + { + "epoch": 0.88, + "grad_norm": 0.33984375, + "learning_rate": 4.394595430050613e-08, + "logits/chosen": -1.5283844470977783, + "logits/rejected": -0.9562716484069824, + "logps/chosen": -246.2748260498047, + "logps/rejected": -299.6006774902344, + "loss": 0.6238, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.05441068485379219, + "rewards/margins": 0.1941131055355072, + "rewards/margins_max": 0.32748061418533325, + "rewards/margins_min": 0.06074561923742294, + "rewards/margins_std": 0.18861012160778046, + "rewards/rejected": -0.1397024393081665, + "step": 2790 + }, + { + "epoch": 0.88, + "grad_norm": 0.6484375, + "learning_rate": 4.17188720650119e-08, + "logits/chosen": -1.4502097368240356, + "logits/rejected": -1.1390039920806885, + "logps/chosen": -188.01138305664062, + "logps/rejected": -214.70059204101562, + "loss": 0.6255, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04769737645983696, + "rewards/margins": 0.1742611676454544, + "rewards/margins_max": 0.24131028354167938, + "rewards/margins_min": 0.10721202194690704, + "rewards/margins_std": 0.09482181072235107, + "rewards/rejected": -0.12656378746032715, + "step": 2800 + }, + { + "epoch": 0.89, + "grad_norm": 0.337890625, + "learning_rate": 3.954724102752316e-08, + "logits/chosen": -1.4321156740188599, + "logits/rejected": -1.0594213008880615, + "logps/chosen": -205.7993927001953, + "logps/rejected": -190.0954132080078, + "loss": 0.6276, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.05445154756307602, + "rewards/margins": 0.1348884403705597, + "rewards/margins_max": 0.20686748623847961, + "rewards/margins_min": 0.06290940940380096, + "rewards/margins_std": 0.10179372131824493, + "rewards/rejected": -0.08043689280748367, + "step": 2810 + }, + { + "epoch": 0.89, + "grad_norm": 0.4375, + "learning_rate": 3.743132395151705e-08, + "logits/chosen": -1.2473394870758057, + "logits/rejected": -1.0283358097076416, + "logps/chosen": -165.55337524414062, + "logps/rejected": -248.9470977783203, + "loss": 0.6066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0528048500418663, + "rewards/margins": 0.17907902598381042, + "rewards/margins_max": 0.27151721715927124, + "rewards/margins_min": 0.08664089441299438, + "rewards/margins_std": 0.13072729110717773, + "rewards/rejected": -0.12627418339252472, + "step": 2820 + }, + { + "epoch": 0.89, + "grad_norm": 0.470703125, + "learning_rate": 3.537137685918074e-08, + "logits/chosen": -1.4500157833099365, + "logits/rejected": -1.026012659072876, + "logps/chosen": -259.57025146484375, + "logps/rejected": -245.7373046875, + "loss": 0.6139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03459121286869049, + "rewards/margins": 0.16094355285167694, + "rewards/margins_max": 0.2114548683166504, + "rewards/margins_min": 0.1104322299361229, + "rewards/margins_std": 0.07143379747867584, + "rewards/rejected": -0.12635232508182526, + "step": 2830 + }, + { + "epoch": 0.89, + "grad_norm": 0.45703125, + "learning_rate": 3.336764900043332e-08, + "logits/chosen": -1.3304523229599, + "logits/rejected": -1.061004400253296, + "logps/chosen": -204.9029083251953, + "logps/rejected": -177.60275268554688, + "loss": 0.6227, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.052144668996334076, + "rewards/margins": 0.1289752721786499, + "rewards/margins_max": 0.19827334582805634, + "rewards/margins_min": 0.05967719480395317, + "rewards/margins_std": 0.09800229221582413, + "rewards/rejected": -0.07683060318231583, + "step": 2840 + }, + { + "epoch": 0.9, + "grad_norm": 0.59375, + "learning_rate": 3.142038282276732e-08, + "logits/chosen": -1.378488540649414, + "logits/rejected": -1.1016395092010498, + "logps/chosen": -169.7418212890625, + "logps/rejected": -185.75503540039062, + "loss": 0.631, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04850053787231445, + "rewards/margins": 0.12216831743717194, + "rewards/margins_max": 0.18565914034843445, + "rewards/margins_min": 0.058677464723587036, + "rewards/margins_std": 0.08978961408138275, + "rewards/rejected": -0.07366776466369629, + "step": 2850 + }, + { + "epoch": 0.9, + "grad_norm": 0.6015625, + "learning_rate": 2.9529813941912284e-08, + "logits/chosen": -1.3763916492462158, + "logits/rejected": -1.0432021617889404, + "logps/chosen": -192.54698181152344, + "logps/rejected": -203.70191955566406, + "loss": 0.6215, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.045077886432409286, + "rewards/margins": 0.13763293623924255, + "rewards/margins_max": 0.18970827758312225, + "rewards/margins_min": 0.08555762469768524, + "rewards/margins_std": 0.07364563643932343, + "rewards/rejected": -0.09255506098270416, + "step": 2860 + }, + { + "epoch": 0.9, + "grad_norm": 0.53515625, + "learning_rate": 2.7696171113326394e-08, + "logits/chosen": -1.315382480621338, + "logits/rejected": -0.939505398273468, + "logps/chosen": -267.24749755859375, + "logps/rejected": -210.3874969482422, + "loss": 0.6245, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03983648866415024, + "rewards/margins": 0.14058911800384521, + "rewards/margins_max": 0.20600366592407227, + "rewards/margins_min": 0.07517461478710175, + "rewards/margins_std": 0.09251008927822113, + "rewards/rejected": -0.10075263679027557, + "step": 2870 + }, + { + "epoch": 0.91, + "grad_norm": 0.453125, + "learning_rate": 2.591967620451707e-08, + "logits/chosen": -1.398828148841858, + "logits/rejected": -1.0324281454086304, + "logps/chosen": -286.6908264160156, + "logps/rejected": -261.8348083496094, + "loss": 0.6276, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.040006984025239944, + "rewards/margins": 0.12854711711406708, + "rewards/margins_max": 0.18966877460479736, + "rewards/margins_min": 0.06742547452449799, + "rewards/margins_std": 0.08643907308578491, + "rewards/rejected": -0.08854014426469803, + "step": 2880 + }, + { + "epoch": 0.91, + "grad_norm": 0.361328125, + "learning_rate": 2.4200544168195557e-08, + "logits/chosen": -1.3377050161361694, + "logits/rejected": -1.0328240394592285, + "logps/chosen": -182.38858032226562, + "logps/rejected": -261.2072448730469, + "loss": 0.6172, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05574542284011841, + "rewards/margins": 0.1974661648273468, + "rewards/margins_max": 0.28793179988861084, + "rewards/margins_min": 0.10700048506259918, + "rewards/margins_std": 0.12793776392936707, + "rewards/rejected": -0.1417207270860672, + "step": 2890 + }, + { + "epoch": 0.91, + "grad_norm": 0.392578125, + "learning_rate": 2.253898301626789e-08, + "logits/chosen": -1.54763925075531, + "logits/rejected": -0.9423719644546509, + "logps/chosen": -311.39129638671875, + "logps/rejected": -230.20443725585938, + "loss": 0.6198, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05252306908369064, + "rewards/margins": 0.1566629707813263, + "rewards/margins_max": 0.22501006722450256, + "rewards/margins_min": 0.08831588923931122, + "rewards/margins_std": 0.09665738046169281, + "rewards/rejected": -0.10413990169763565, + "step": 2900 + }, + { + "epoch": 0.92, + "grad_norm": 0.470703125, + "learning_rate": 2.0935193794666016e-08, + "logits/chosen": -1.3872407674789429, + "logits/rejected": -0.9147874116897583, + "logps/chosen": -230.1566619873047, + "logps/rejected": -230.2250518798828, + "loss": 0.6168, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05687636137008667, + "rewards/margins": 0.15954247117042542, + "rewards/margins_max": 0.23747679591178894, + "rewards/margins_min": 0.0816081315279007, + "rewards/margins_std": 0.11021579802036285, + "rewards/rejected": -0.10266611725091934, + "step": 2910 + }, + { + "epoch": 0.92, + "grad_norm": 0.375, + "learning_rate": 1.9389370559021345e-08, + "logits/chosen": -1.3618733882904053, + "logits/rejected": -1.2244329452514648, + "logps/chosen": -185.58168029785156, + "logps/rejected": -240.06930541992188, + "loss": 0.6358, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.021337289363145828, + "rewards/margins": 0.12384787946939468, + "rewards/margins_max": 0.1760438084602356, + "rewards/margins_min": 0.07165192067623138, + "rewards/margins_std": 0.0738162100315094, + "rewards/rejected": -0.10251058638095856, + "step": 2920 + }, + { + "epoch": 0.92, + "grad_norm": 0.416015625, + "learning_rate": 1.7901700351184655e-08, + "logits/chosen": -1.4315520524978638, + "logits/rejected": -1.1171866655349731, + "logps/chosen": -231.65274047851562, + "logps/rejected": -276.26422119140625, + "loss": 0.634, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05234185978770256, + "rewards/margins": 0.13299915194511414, + "rewards/margins_max": 0.18938472867012024, + "rewards/margins_min": 0.07661359012126923, + "rewards/margins_std": 0.0797412246465683, + "rewards/rejected": -0.08065730333328247, + "step": 2930 + }, + { + "epoch": 0.93, + "grad_norm": 0.484375, + "learning_rate": 1.647236317659423e-08, + "logits/chosen": -1.3440803289413452, + "logits/rejected": -1.0078037977218628, + "logps/chosen": -256.86236572265625, + "logps/rejected": -324.974609375, + "loss": 0.6218, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029775535687804222, + "rewards/margins": 0.18259739875793457, + "rewards/margins_max": 0.273844838142395, + "rewards/margins_min": 0.09134997427463531, + "rewards/margins_std": 0.1290433555841446, + "rewards/rejected": -0.1528218686580658, + "step": 2940 + }, + { + "epoch": 0.93, + "grad_norm": 0.46484375, + "learning_rate": 1.5101531982495308e-08, + "logits/chosen": -1.383049726486206, + "logits/rejected": -1.0008172988891602, + "logps/chosen": -236.1011505126953, + "logps/rejected": -224.7187042236328, + "loss": 0.626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0493309423327446, + "rewards/margins": 0.13633129000663757, + "rewards/margins_max": 0.20582230389118195, + "rewards/margins_min": 0.06684030592441559, + "rewards/margins_std": 0.09827511012554169, + "rewards/rejected": -0.08700035512447357, + "step": 2950 + }, + { + "epoch": 0.93, + "grad_norm": 0.3671875, + "learning_rate": 1.3789372637014129e-08, + "logits/chosen": -1.3957499265670776, + "logits/rejected": -1.0853421688079834, + "logps/chosen": -206.4956817626953, + "logps/rejected": -263.0644836425781, + "loss": 0.6279, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.047060225158929825, + "rewards/margins": 0.1514916718006134, + "rewards/margins_max": 0.20764505863189697, + "rewards/margins_min": 0.09533828496932983, + "rewards/margins_std": 0.07941287755966187, + "rewards/rejected": -0.10443145036697388, + "step": 2960 + }, + { + "epoch": 0.94, + "grad_norm": 0.474609375, + "learning_rate": 1.253604390908819e-08, + "logits/chosen": -1.1620736122131348, + "logits/rejected": -1.0084644556045532, + "logps/chosen": -197.8514862060547, + "logps/rejected": -299.61712646484375, + "loss": 0.628, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03723246604204178, + "rewards/margins": 0.1657179892063141, + "rewards/margins_max": 0.2536987364292145, + "rewards/margins_min": 0.0777372270822525, + "rewards/margins_std": 0.12442357838153839, + "rewards/rejected": -0.1284855306148529, + "step": 2970 + }, + { + "epoch": 0.94, + "grad_norm": 0.578125, + "learning_rate": 1.1341697449255061e-08, + "logits/chosen": -1.484635829925537, + "logits/rejected": -1.0910263061523438, + "logps/chosen": -301.75091552734375, + "logps/rejected": -205.1429443359375, + "loss": 0.6268, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023064447566866875, + "rewards/margins": 0.09722733497619629, + "rewards/margins_max": 0.1512151062488556, + "rewards/margins_min": 0.04323957860469818, + "rewards/margins_std": 0.07635021954774857, + "rewards/rejected": -0.07416288554668427, + "step": 2980 + }, + { + "epoch": 0.94, + "grad_norm": 0.50390625, + "learning_rate": 1.0206477771303234e-08, + "logits/chosen": -1.451160192489624, + "logits/rejected": -1.0006252527236938, + "logps/chosen": -236.98428344726562, + "logps/rejected": -230.22091674804688, + "loss": 0.6202, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.048841338604688644, + "rewards/margins": 0.15246547758579254, + "rewards/margins_max": 0.22143657505512238, + "rewards/margins_min": 0.08349435031414032, + "rewards/margins_std": 0.09753988683223724, + "rewards/rejected": -0.103624127805233, + "step": 2990 + }, + { + "epoch": 0.95, + "grad_norm": 0.5078125, + "learning_rate": 9.130522234786497e-09, + "logits/chosen": -1.5553325414657593, + "logits/rejected": -1.1860215663909912, + "logps/chosen": -237.86328125, + "logps/rejected": -204.50979614257812, + "loss": 0.6222, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.046629082411527634, + "rewards/margins": 0.11921097338199615, + "rewards/margins_max": 0.17536310851573944, + "rewards/margins_min": 0.06305884569883347, + "rewards/margins_std": 0.0794111043214798, + "rewards/rejected": -0.07258189469575882, + "step": 3000 + }, + { + "epoch": 0.95, + "grad_norm": 0.43359375, + "learning_rate": 8.113961028402894e-09, + "logits/chosen": -1.3244291543960571, + "logits/rejected": -0.9829646944999695, + "logps/chosen": -202.76742553710938, + "logps/rejected": -225.45693969726562, + "loss": 0.6174, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04439740255475044, + "rewards/margins": 0.16769596934318542, + "rewards/margins_max": 0.2617490887641907, + "rewards/margins_min": 0.07364289462566376, + "rewards/margins_std": 0.13301116228103638, + "rewards/rejected": -0.12329860031604767, + "step": 3010 + }, + { + "epoch": 0.95, + "grad_norm": 0.41015625, + "learning_rate": 7.156917154243047e-09, + "logits/chosen": -1.3500535488128662, + "logits/rejected": -0.9996110796928406, + "logps/chosen": -176.10202026367188, + "logps/rejected": -185.0475311279297, + "loss": 0.6364, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.034986961632966995, + "rewards/margins": 0.11707176268100739, + "rewards/margins_max": 0.17164544761180878, + "rewards/margins_min": 0.0624980702996254, + "rewards/margins_std": 0.07717885076999664, + "rewards/rejected": -0.0820847898721695, + "step": 3020 + }, + { + "epoch": 0.95, + "grad_norm": 0.419921875, + "learning_rate": 6.259506412906402e-09, + "logits/chosen": -1.493024230003357, + "logits/rejected": -1.05169677734375, + "logps/chosen": -234.1255340576172, + "logps/rejected": -242.9952392578125, + "loss": 0.6167, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.047915857285261154, + "rewards/margins": 0.13081197440624237, + "rewards/margins_max": 0.20202195644378662, + "rewards/margins_min": 0.05960196256637573, + "rewards/margins_std": 0.10070616006851196, + "rewards/rejected": -0.08289609849452972, + "step": 3030 + }, + { + "epoch": 0.96, + "grad_norm": 0.359375, + "learning_rate": 5.4218373894898696e-09, + "logits/chosen": -1.4888322353363037, + "logits/rejected": -1.1974503993988037, + "logps/chosen": -184.48179626464844, + "logps/rejected": -212.9262237548828, + "loss": 0.6319, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.042879484593868256, + "rewards/margins": 0.12140518426895142, + "rewards/margins_max": 0.1660289615392685, + "rewards/margins_min": 0.07678140699863434, + "rewards/margins_std": 0.06310755014419556, + "rewards/rejected": -0.07852570712566376, + "step": 3040 + }, + { + "epoch": 0.96, + "grad_norm": 0.40625, + "learning_rate": 4.644011440449236e-09, + "logits/chosen": -1.241875410079956, + "logits/rejected": -0.8721693158149719, + "logps/chosen": -175.67909240722656, + "logps/rejected": -243.4571990966797, + "loss": 0.624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.050394732505083084, + "rewards/margins": 0.16494488716125488, + "rewards/margins_max": 0.24216532707214355, + "rewards/margins_min": 0.087724469602108, + "rewards/margins_std": 0.10920616239309311, + "rewards/rejected": -0.1145501583814621, + "step": 3050 + }, + { + "epoch": 0.96, + "grad_norm": 0.421875, + "learning_rate": 3.926122681335353e-09, + "logits/chosen": -1.3381614685058594, + "logits/rejected": -1.1396186351776123, + "logps/chosen": -203.67967224121094, + "logps/rejected": -181.2338104248047, + "loss": 0.6357, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.034172505140304565, + "rewards/margins": 0.09833844006061554, + "rewards/margins_max": 0.14537864923477173, + "rewards/margins_min": 0.05129823088645935, + "rewards/margins_std": 0.06652490049600601, + "rewards/rejected": -0.06416593492031097, + "step": 3060 + }, + { + "epoch": 0.97, + "grad_norm": 0.57421875, + "learning_rate": 3.268257975405697e-09, + "logits/chosen": -1.248270034790039, + "logits/rejected": -0.9826574325561523, + "logps/chosen": -219.90811157226562, + "logps/rejected": -227.92318725585938, + "loss": 0.6273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04911006614565849, + "rewards/margins": 0.166696235537529, + "rewards/margins_max": 0.2482796609401703, + "rewards/margins_min": 0.08511278033256531, + "rewards/margins_std": 0.11537641286849976, + "rewards/rejected": -0.1175861582159996, + "step": 3070 + }, + { + "epoch": 0.97, + "grad_norm": 0.439453125, + "learning_rate": 2.67049692311494e-09, + "logits/chosen": -1.5718332529067993, + "logits/rejected": -1.1442277431488037, + "logps/chosen": -270.47186279296875, + "logps/rejected": -239.42147827148438, + "loss": 0.6136, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05378204584121704, + "rewards/margins": 0.14788804948329926, + "rewards/margins_max": 0.21103155612945557, + "rewards/margins_min": 0.08474452048540115, + "rewards/margins_std": 0.08929841965436935, + "rewards/rejected": -0.09410599619150162, + "step": 3080 + }, + { + "epoch": 0.97, + "grad_norm": 0.482421875, + "learning_rate": 2.132911852482766e-09, + "logits/chosen": -1.3375688791275024, + "logits/rejected": -1.0424758195877075, + "logps/chosen": -217.1631317138672, + "logps/rejected": -195.4293212890625, + "loss": 0.6254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.041864458471536636, + "rewards/margins": 0.16093741357326508, + "rewards/margins_max": 0.2589831054210663, + "rewards/margins_min": 0.06289170682430267, + "rewards/margins_std": 0.1386575698852539, + "rewards/rejected": -0.11907295137643814, + "step": 3090 + }, + { + "epoch": 0.98, + "grad_norm": 0.58203125, + "learning_rate": 1.6555678103425397e-09, + "logits/chosen": -1.3077802658081055, + "logits/rejected": -0.9605744481086731, + "logps/chosen": -252.1176300048828, + "logps/rejected": -258.5810241699219, + "loss": 0.6259, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05393581837415695, + "rewards/margins": 0.12319433689117432, + "rewards/margins_max": 0.18517567217350006, + "rewards/margins_min": 0.06121302396059036, + "rewards/margins_std": 0.08765482902526855, + "rewards/rejected": -0.06925852596759796, + "step": 3100 + }, + { + "epoch": 0.98, + "grad_norm": 0.6015625, + "learning_rate": 1.2385225544709887e-09, + "logits/chosen": -1.3411720991134644, + "logits/rejected": -1.007331132888794, + "logps/chosen": -249.7186279296875, + "logps/rejected": -262.7857360839844, + "loss": 0.6249, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.051185525953769684, + "rewards/margins": 0.15800713002681732, + "rewards/margins_max": 0.23525485396385193, + "rewards/margins_min": 0.0807594358921051, + "rewards/margins_std": 0.10924477875232697, + "rewards/rejected": -0.10682161897420883, + "step": 3110 + }, + { + "epoch": 0.98, + "grad_norm": 0.33984375, + "learning_rate": 8.818265465991293e-10, + "logits/chosen": -1.4163812398910522, + "logits/rejected": -1.0825564861297607, + "logps/chosen": -185.9810028076172, + "logps/rejected": -186.9814453125, + "loss": 0.622, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06464166939258575, + "rewards/margins": 0.1453738510608673, + "rewards/margins_max": 0.21843528747558594, + "rewards/margins_min": 0.07231242954730988, + "rewards/margins_std": 0.10332445055246353, + "rewards/rejected": -0.08073217421770096, + "step": 3120 + }, + { + "epoch": 0.99, + "grad_norm": 0.5078125, + "learning_rate": 5.855229463068712e-10, + "logits/chosen": -1.3847262859344482, + "logits/rejected": -1.087875485420227, + "logps/chosen": -213.4091339111328, + "logps/rejected": -301.65533447265625, + "loss": 0.6268, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04394926875829697, + "rewards/margins": 0.15025286376476288, + "rewards/margins_max": 0.2213151901960373, + "rewards/margins_min": 0.07919053733348846, + "rewards/margins_std": 0.10049732029438019, + "rewards/rejected": -0.1063036099076271, + "step": 3130 + }, + { + "epoch": 0.99, + "grad_norm": 0.4375, + "learning_rate": 3.4964760580069585e-10, + "logits/chosen": -1.5246410369873047, + "logits/rejected": -1.0728760957717896, + "logps/chosen": -187.12368774414062, + "logps/rejected": -201.53306579589844, + "loss": 0.625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04492334648966789, + "rewards/margins": 0.1522952765226364, + "rewards/margins_max": 0.2121496945619583, + "rewards/margins_min": 0.09244086593389511, + "rewards/margins_std": 0.08464692533016205, + "rewards/rejected": -0.10737194120883942, + "step": 3140 + }, + { + "epoch": 0.99, + "grad_norm": 0.6171875, + "learning_rate": 1.742290655755707e-10, + "logits/chosen": -1.3557556867599487, + "logits/rejected": -0.9096490740776062, + "logps/chosen": -241.07785034179688, + "logps/rejected": -226.32803344726562, + "loss": 0.6191, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.045760609209537506, + "rewards/margins": 0.16645553708076477, + "rewards/margins_max": 0.2321435958147049, + "rewards/margins_min": 0.10076741874217987, + "rewards/margins_std": 0.09289699047803879, + "rewards/rejected": -0.12069491297006607, + "step": 3150 + }, + { + "epoch": 1.0, + "grad_norm": 0.5, + "learning_rate": 5.928855096154483e-11, + "logits/chosen": -1.4452476501464844, + "logits/rejected": -1.1195826530456543, + "logps/chosen": -207.44882202148438, + "logps/rejected": -200.09043884277344, + "loss": 0.6394, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04787784069776535, + "rewards/margins": 0.1137089729309082, + "rewards/margins_max": 0.15814949572086334, + "rewards/margins_min": 0.06926842778921127, + "rewards/margins_std": 0.06284840404987335, + "rewards/rejected": -0.06583113223314285, + "step": 3160 + }, + { + "epoch": 1.0, + "grad_norm": 0.58984375, + "learning_rate": 4.839969555581192e-12, + "logits/chosen": -1.35175359249115, + "logits/rejected": -0.9090153574943542, + "logps/chosen": -226.45285034179688, + "logps/rejected": -246.6789093017578, + "loss": 0.6183, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.06078491359949112, + "rewards/margins": 0.1678098440170288, + "rewards/margins_max": 0.27439039945602417, + "rewards/margins_min": 0.061229269951581955, + "rewards/margins_std": 0.15072770416736603, + "rewards/rejected": -0.10702493041753769, + "step": 3170 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.034969449043274, + "eval_logits/rejected": -0.9127383232116699, + "eval_logps/chosen": -324.2853088378906, + "eval_logps/rejected": -316.0207824707031, + "eval_loss": 0.689082682132721, + "eval_rewards/accuracies": 0.5569999814033508, + "eval_rewards/chosen": 0.006784230004996061, + "eval_rewards/margins": 0.010349688120186329, + "eval_rewards/margins_max": 0.12646563351154327, + "eval_rewards/margins_min": -0.10888691246509552, + "eval_rewards/margins_std": 0.07831301540136337, + "eval_rewards/rejected": -0.003565457882359624, + "eval_runtime": 1446.1513, + "eval_samples_per_second": 2.766, + "eval_steps_per_second": 0.173, + "step": 3174 + }, + { + "epoch": 1.0, + "step": 3174, + "total_flos": 0.0, + "train_loss": 0.6393037238208168, + "train_runtime": 24580.6168, + "train_samples_per_second": 1.033, + "train_steps_per_second": 0.129 + } + ], + "logging_steps": 10, + "max_steps": 3174, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}